1 diff -urN linux-2.6.33.orig/Documentation/Changes linux-2.6.33/Documentation/Changes
2 --- linux-2.6.33.orig/Documentation/Changes 2010-02-24 19:52:17.000000000 +0100
3 +++ linux-2.6.33/Documentation/Changes 2010-03-04 19:33:22.000000000 +0100
4 @@ -36,6 +36,7 @@
5 o e2fsprogs 1.41.4 # e2fsck -V
6 o jfsutils 1.1.3 # fsck.jfs -V
7 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
8 +o reiser4progs 1.0.0 # fsck.reiser4 -V
9 o xfsprogs 2.6.0 # xfs_db -V
10 o squashfs-tools 4.0 # mksquashfs -version
11 o btrfs-progs 0.18 # btrfsck
12 @@ -157,6 +158,13 @@
13 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
14 reiserfsck. These utils work on both i386 and alpha platforms.
15
16 +Reiser4progs
17 +------------
18 +
19 +The reiser4progs package contains utilities for the reiser4 file system.
20 +Detailed instructions are provided in the README file located at:
21 +<ftp://ftp.namesys.com/pub/reiser4progs/README>.
22 +
23 Xfsprogs
24 --------
25
26 @@ -345,6 +353,10 @@
27 -------------
28 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
29
30 +Reiser4progs
31 +------------
32 +o <ftp://ftp.namesys.com/pub/reiser4progs/>
33 +
34 Xfsprogs
35 --------
36 o <ftp://oss.sgi.com/projects/xfs/download/>
37 diff -urN linux-2.6.33.orig/Documentation/filesystems/reiser4.txt linux-2.6.33/Documentation/filesystems/reiser4.txt
38 --- linux-2.6.33.orig/Documentation/filesystems/reiser4.txt 1970-01-01 01:00:00.000000000 +0100
39 +++ linux-2.6.33/Documentation/filesystems/reiser4.txt 2010-03-04 19:33:22.000000000 +0100
40 @@ -0,0 +1,75 @@
41 +Reiser4 filesystem
42 +==================
43 +Reiser4 is a file system based on dancing tree algorithms, and is
44 +described at http://www.namesys.com
45 +
46 +
47 +References
48 +==========
49 +web page http://namesys.com/v4/v4.html
50 +source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
51 +userland tools ftp://ftp.namesys.com/pub/reiser4progs/
52 +install page http://www.namesys.com/install_v4.html
53 +
54 +Compile options
55 +===============
56 +Enable reiser4 debug mode
57 + This checks everything imaginable while reiser4
58 + runs
59 +
60 +Mount options
61 +=============
62 +tmgr.atom_max_size=N
63 + Atoms containing more than N blocks will be forced to commit.
64 + N is decimal.
65 + Default is nr_free_pagecache_pages() / 2 at mount time.
66 +
67 +tmgr.atom_max_age=N
68 + Atoms older than N seconds will be forced to commit. N is decimal.
69 + Default is 600.
70 +
71 +tmgr.atom_max_flushers=N
72 + Limit of concurrent flushers for one atom. 0 means no limit.
73 + Default is 0.
74 +
75 +tree.cbk_cache.nr_slots=N
76 + Number of slots in the cbk cache.
77 +
78 +flush.relocate_threshold=N
79 + If flush finds more than N adjacent dirty leaf-level blocks it
80 + will force them to be relocated.
81 + Default is 64.
82 +
83 +flush.relocate_distance=N
84 + If flush can find a block allocation no further than N blocks
85 + from the preceder, it will relocate to that position.
86 + Default is 64.
87 +
88 +flush.scan_maxnodes=N
89 + The maximum number of nodes to scan left on a level during
90 + flush.
91 + Default is 10000.
92 +
93 +optimal_io_size=N
94 + Preferred IO size. This value is used to set st_blksize of
95 + struct stat.
96 + Default is 65536.
97 +
98 +bsdgroups
99 + Turn on BSD-style gid assignment.
100 +
101 +32bittimes
102 + By default files in reiser4 have 64 bit timestamps. Files
103 + created while the filesystem is mounted with the 32bittimes
104 + mount option will get 32 bit timestamps.
105 +
106 +mtflush
107 + Turn off concurrent flushing.
108 +
109 +nopseudo
110 + Disable pseudo files support. See
111 + http://namesys.com/v4/pseudo.html for more about pseudo files.
112 +
113 +dont_load_bitmap
114 + Don't load all bitmap blocks at mount time; this is useful for
115 + machines with tiny RAM and large disks.
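
For illustration, the options above are passed as a comma-separated string at
mount time; a minimal userspace sketch via mount(2), with a hypothetical
device, mount point and option values (not part of the patch):

    /* mount a reiser4 volume with some of the options documented above */
    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* keys follow the tmgr.* / flush.* naming used in reiser4.txt */
            const char *opts = "tmgr.atom_max_age=300,flush.relocate_threshold=128";

            if (mount("/dev/sdb1", "/mnt/r4", "reiser4", 0, opts) != 0) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }
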
116 diff -urN linux-2.6.33.orig/fs/fs-writeback.c linux-2.6.33/fs/fs-writeback.c
117 --- linux-2.6.33.orig/fs/fs-writeback.c 2010-02-24 19:52:17.000000000 +0100
118 +++ linux-2.6.33/fs/fs-writeback.c 2010-03-04 20:21:39.000000000 +0100
119 @@ -549,108 +549,85 @@
120 return ret;
121 }
122
123 -static void unpin_sb_for_writeback(struct super_block **psb)
124 +static void unpin_sb_for_writeback(struct super_block *sb)
125 {
126 - struct super_block *sb = *psb;
127 -
128 - if (sb) {
129 - up_read(&sb->s_umount);
130 - put_super(sb);
131 - *psb = NULL;
132 - }
133 + up_read(&sb->s_umount);
134 + put_super(sb);
135 }
136
137 +enum sb_pin_state {
138 + SB_PINNED,
139 + SB_NOT_PINNED,
140 + SB_PIN_FAILED
141 +};
142 +
143 /*
144 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
145 * before calling writeback. So make sure that we do pin it, so it doesn't
146 * go away while we are writing inodes from it.
147 - *
148 - * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
149 - * 1 if we failed.
150 */
151 -static int pin_sb_for_writeback(struct writeback_control *wbc,
152 - struct inode *inode, struct super_block **psb)
153 +static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
154 + struct super_block *sb)
155 {
156 - struct super_block *sb = inode->i_sb;
157 -
158 - /*
159 - * If this sb is already pinned, nothing more to do. If not and
160 - * *psb is non-NULL, unpin the old one first
161 - */
162 - if (sb == *psb)
163 - return 0;
164 - else if (*psb)
165 - unpin_sb_for_writeback(psb);
166 -
167 /*
168 * Caller must already hold the ref for this
169 */
170 if (wbc->sync_mode == WB_SYNC_ALL) {
171 WARN_ON(!rwsem_is_locked(&sb->s_umount));
172 - return 0;
173 + return SB_NOT_PINNED;
174 }
175 -
176 spin_lock(&sb_lock);
177 sb->s_count++;
178 if (down_read_trylock(&sb->s_umount)) {
179 if (sb->s_root) {
180 spin_unlock(&sb_lock);
181 - goto pinned;
182 + return SB_PINNED;
183 }
184 /*
185 * umounted, drop rwsem again and fall through to failure
186 */
187 up_read(&sb->s_umount);
188 }
189 -
190 sb->s_count--;
191 spin_unlock(&sb_lock);
192 - return 1;
193 -pinned:
194 - *psb = sb;
195 - return 0;
196 + return SB_PIN_FAILED;
197 }
198
199 -static void writeback_inodes_wb(struct bdi_writeback *wb,
200 +/*
201 + * Write a portion of b_io inodes which belong to @sb.
202 + * If @wbc->sb != NULL, then find and write all such
203 + * inodes. Otherwise write only ones which go sequentially
204 + * in reverse order.
205 + * Return 1 if the caller's writeback routine should be
206 + * interrupted; otherwise return 0.
207 + */
208 +int generic_writeback_sb_inodes(struct super_block *sb,
209 + struct bdi_writeback *wb,
210 struct writeback_control *wbc)
211 {
212 - struct super_block *sb = wbc->sb, *pin_sb = NULL;
213 - const unsigned long start = jiffies; /* livelock avoidance */
214 -
215 - spin_lock(&inode_lock);
216 -
217 - if (!wbc->for_kupdate || list_empty(&wb->b_io))
218 - queue_io(wb, wbc->older_than_this);
219 -
220 while (!list_empty(&wb->b_io)) {
221 - struct inode *inode = list_entry(wb->b_io.prev,
222 - struct inode, i_list);
223 long pages_skipped;
224 -
225 - /*
226 - * super block given and doesn't match, skip this inode
227 - */
228 - if (sb && sb != inode->i_sb) {
229 + struct inode *inode = list_entry(wb->b_io.prev,
230 + struct inode, i_list);
231 + if (wbc->sb && sb != inode->i_sb) {
232 + /* super block given and doesn't
233 + match, skip this inode */
234 redirty_tail(inode);
235 continue;
236 }
237 -
238 + if (sb != inode->i_sb)
239 + /* finish with this superblock */
240 + return 0;
241 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
242 requeue_io(inode);
243 continue;
244 }
245 -
246 /*
247 * Was this inode dirtied after sync_sb_inodes was called?
248 * This keeps sync from extra jobs and livelock.
249 */
250 - if (inode_dirtied_after(inode, start))
251 - break;
252 -
253 - if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
254 - requeue_io(inode);
255 - continue;
256 - }
257 + if (inode_dirtied_after(inode, wbc->wb_start))
258 + return 1;
259
260 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
261 __iget(inode);
262 @@ -669,14 +646,78 @@
263 spin_lock(&inode_lock);
264 if (wbc->nr_to_write <= 0) {
265 wbc->more_io = 1;
266 - break;
267 + return 1;
268 }
269 if (!list_empty(&wb->b_more_io))
270 wbc->more_io = 1;
271 }
272 + /* b_io is empty */
273 + return 1;
274 +}
275 +EXPORT_SYMBOL(generic_writeback_sb_inodes);
276 +
277 +/*
278 + * This function is for file systems which have their
279 + * own means of periodical write-out of old data.
280 + * NOTE: inode_lock should be held.
281 + *
282 + * Skip a portion of b_io inodes which belong to @sb
283 + * and go sequentially in reverse order.
284 + */
285 +void writeback_skip_sb_inodes(struct super_block *sb,
286 + struct bdi_writeback *wb)
287 +{
288 + while (1) {
289 + struct inode *inode;
290 +
291 + if (list_empty(&wb->b_io))
292 + break;
293 + inode = list_entry(wb->b_io.prev, struct inode, i_list);
294 + if (sb != inode->i_sb)
295 + break;
296 + redirty_tail(inode);
297 + }
298 +}
299 +EXPORT_SYMBOL(writeback_skip_sb_inodes);
300
301 - unpin_sb_for_writeback(&pin_sb);
302 +static void writeback_inodes_wb(struct bdi_writeback *wb,
303 + struct writeback_control *wbc)
304 +{
305 + int ret = 0;
306
307 + wbc->wb_start = jiffies; /* livelock avoidance */
308 + spin_lock(&inode_lock);
309 + if (!wbc->for_kupdate || list_empty(&wb->b_io))
310 + queue_io(wb, wbc->older_than_this);
311 +
312 + while (!list_empty(&wb->b_io)) {
313 + struct inode *inode = list_entry(wb->b_io.prev,
314 + struct inode, i_list);
315 + struct super_block *sb = inode->i_sb;
316 + enum sb_pin_state state;
317 +
318 + if (wbc->sb && sb != wbc->sb) {
319 + /* super block given and doesn't
320 + match, skip this inode */
321 + redirty_tail(inode);
322 + continue;
323 + }
324 + state = pin_sb_for_writeback(wbc, sb);
325 +
326 + if (state == SB_PIN_FAILED) {
327 + requeue_io(inode);
328 + continue;
329 + }
330 + if (sb->s_op->writeback_inodes)
331 + ret = sb->s_op->writeback_inodes(sb, wb, wbc);
332 + else
333 + ret = generic_writeback_sb_inodes(sb, wb, wbc);
334 +
335 + if (state == SB_PINNED)
336 + unpin_sb_for_writeback(sb);
337 + if (ret)
338 + break;
339 + }
340 spin_unlock(&inode_lock);
341 /* Leave any unwritten inodes on b_io */
342 }
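
For context: the rewritten writeback_inodes_wb() above lets a filesystem take
over per-sb inode writeback through a new super_operations callback, falling
back to the exported generic_writeback_sb_inodes() otherwise. A minimal sketch
of such a hook, assuming the ->writeback_inodes field this patch adds to
super_operations; the myfs_* names are hypothetical:

    static int myfs_writeback_inodes(struct super_block *sb,
                                     struct bdi_writeback *wb,
                                     struct writeback_control *wbc)
    {
            /* filesystem-specific periodic write-out would go here ... */

            /* ... then delegate to the generic per-sb inode writeback */
            return generic_writeback_sb_inodes(sb, wb, wbc);
    }

    static const struct super_operations myfs_sops = {
            /* other operations omitted */
            .writeback_inodes = myfs_writeback_inodes,
    };
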
343 @@ -687,6 +728,7 @@
344
345 writeback_inodes_wb(&bdi->wb, wbc);
346 }
347 +EXPORT_SYMBOL(writeback_inodes_wbc);
348
349 /*
350 * The maximum number of pages to writeout in a single bdi flush/kupdate
351 @@ -1272,3 +1314,12 @@
352 return ret;
353 }
354 EXPORT_SYMBOL(sync_inode);
355 +/*
356 + * Local variables:
357 + * c-indentation-style: "K&R"
358 + * mode-name: "LC"
359 + * c-basic-offset: 8
360 + * tab-width: 8
361 + * fill-column: 79
362 + * End:
363 + */
364 diff -urN linux-2.6.33.orig/fs/inode.c linux-2.6.33/fs/inode.c
365 --- linux-2.6.33.orig/fs/inode.c 2010-02-24 19:52:17.000000000 +0100
366 +++ linux-2.6.33/fs/inode.c 2010-03-04 19:33:22.000000000 +0100
367 @@ -85,6 +85,7 @@
368 * the i_state of an inode while it is in use..
369 */
370 DEFINE_SPINLOCK(inode_lock);
371 +EXPORT_SYMBOL_GPL(inode_lock);
372
373 /*
374 * iprune_sem provides exclusion between the kswapd or try_to_free_pages
375 diff -urN linux-2.6.33.orig/fs/Kconfig linux-2.6.33/fs/Kconfig
376 --- linux-2.6.33.orig/fs/Kconfig 2010-02-24 19:52:17.000000000 +0100
377 +++ linux-2.6.33/fs/Kconfig 2010-03-04 19:33:22.000000000 +0100
378 @@ -27,6 +27,7 @@
379 default y if EXT4_FS=y && EXT4_FS_XATTR
380 default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
381
382 +source "fs/reiser4/Kconfig"
383 source "fs/reiserfs/Kconfig"
384 source "fs/jfs/Kconfig"
385
386 diff -urN linux-2.6.33.orig/fs/Makefile linux-2.6.33/fs/Makefile
387 --- linux-2.6.33.orig/fs/Makefile 2010-02-24 19:52:17.000000000 +0100
388 +++ linux-2.6.33/fs/Makefile 2010-03-04 19:33:22.000000000 +0100
389 @@ -65,6 +65,7 @@
390 # Do not add any filesystems before this line
391 obj-$(CONFIG_FSCACHE) += fscache/
392 obj-$(CONFIG_REISERFS_FS) += reiserfs/
393 +obj-$(CONFIG_REISER4_FS) += reiser4/
394 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
395 obj-$(CONFIG_EXT2_FS) += ext2/
396 # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
397 diff -urN linux-2.6.33.orig/fs/reiser4/as_ops.c linux-2.6.33/fs/reiser4/as_ops.c
398 --- linux-2.6.33.orig/fs/reiser4/as_ops.c 1970-01-01 01:00:00.000000000 +0100
399 +++ linux-2.6.33/fs/reiser4/as_ops.c 2010-03-04 19:33:22.000000000 +0100
400 @@ -0,0 +1,337 @@
401 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
402 +
403 +/* Interface to VFS. Reiser4 address_space_operations are defined here. */
404 +
405 +#include "forward.h"
406 +#include "debug.h"
407 +#include "dformat.h"
408 +#include "coord.h"
409 +#include "plugin/item/item.h"
410 +#include "plugin/file/file.h"
411 +#include "plugin/security/perm.h"
412 +#include "plugin/disk_format/disk_format.h"
413 +#include "plugin/plugin.h"
414 +#include "plugin/plugin_set.h"
415 +#include "plugin/object.h"
416 +#include "txnmgr.h"
417 +#include "jnode.h"
418 +#include "znode.h"
419 +#include "block_alloc.h"
420 +#include "tree.h"
421 +#include "vfs_ops.h"
422 +#include "inode.h"
423 +#include "page_cache.h"
424 +#include "ktxnmgrd.h"
425 +#include "super.h"
426 +#include "reiser4.h"
427 +#include "entd.h"
428 +
429 +#include <linux/profile.h>
430 +#include <linux/types.h>
431 +#include <linux/mount.h>
432 +#include <linux/vfs.h>
433 +#include <linux/mm.h>
434 +#include <linux/buffer_head.h>
435 +#include <linux/dcache.h>
436 +#include <linux/list.h>
437 +#include <linux/pagemap.h>
438 +#include <linux/slab.h>
439 +#include <linux/seq_file.h>
440 +#include <linux/init.h>
441 +#include <linux/module.h>
442 +#include <linux/writeback.h>
443 +#include <linux/backing-dev.h>
444 +#include <linux/quotaops.h>
445 +#include <linux/security.h>
446 +
447 +/* address space operations */
448 +
449 +/**
450 + * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
451 + * @page: page to be dirtied
452 + *
453 + * Operation of struct address_space_operations. This implementation is used by
454 + * unix and cryptcompress file plugins.
455 + *
456 + * This is called when a reiser4 page gets dirtied outside of reiser4, for
457 + * example, when dirty bit is moved from pte to physical page.
458 + *
459 + * Tags the page in the mapping's page tree with a special tag so that it is
460 + * possible to do all the reiser4 specific work wrt dirty pages (jnode creation,
461 + * capturing by an atom) later because it can not be done in the contexts where
462 + * set_page_dirty is called.
463 + */
464 +int reiser4_set_page_dirty(struct page *page)
465 +{
466 + /* this page can be unformatted only */
467 + assert("vs-1734", (page->mapping &&
468 + page->mapping->host &&
469 + reiser4_get_super_fake(page->mapping->host->i_sb) !=
470 + page->mapping->host &&
471 + reiser4_get_cc_fake(page->mapping->host->i_sb) !=
472 + page->mapping->host &&
473 + reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
474 + page->mapping->host));
475 + return __set_page_dirty_nobuffers(page);
476 +}
477 +
478 +/* ->invalidatepage method for reiser4 */
479 +
480 +/*
481 + * this is called for each truncated page from
482 + * truncate_inode_pages()->truncate_{complete,partial}_page().
483 + *
484 + * At the moment of call, page is under lock, and outstanding io (if any) has
485 + * completed.
486 + */
487 +
488 +/**
489 + * reiser4_invalidatepage
490 + * @page: page to invalidate
491 + * @offset: starting offset for partial invalidation
492 + *
493 + */
494 +void reiser4_invalidatepage(struct page *page, unsigned long offset)
495 +{
496 + int ret = 0;
497 + reiser4_context *ctx;
498 + struct inode *inode;
499 + jnode *node;
500 +
501 + /*
502 + * This is called to truncate file's page.
503 + *
504 + * Originally, reiser4 implemented truncate in a standard way
505 + * (vmtruncate() calls ->invalidatepage() on all truncated pages
506 + * first, then file system ->truncate() call-back is invoked).
507 + *
508 + * This lead to the problem when ->invalidatepage() was called on a
509 + * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
510 + * process. That is, truncate was bypassing transactions. To avoid
511 + * this, try_capture_page_to_invalidate() call was added here.
512 + *
513 + * After many troubles with vmtruncate() based truncate (including
514 + * races with flush, tail conversion, etc.) it was re-written in the
515 + * top-to-bottom style: items are killed in reiser4_cut_tree_object()
516 + * and pages belonging to extent are invalidated in kill_hook_extent().
517 + * So probably now additional call to capture is not needed here.
518 + */
519 +
520 + assert("nikita-3137", PageLocked(page));
521 + assert("nikita-3138", !PageWriteback(page));
522 + inode = page->mapping->host;
523 +
524 + /*
525 + * ->invalidatepage() should only be called for the unformatted
526 + * jnodes. Destruction of all other types of jnodes is performed
527 + * separately. But, during some corner cases (like handling errors
528 + * during mount) it is simpler to let ->invalidatepage to be called on
529 + * them. Check for this, and do nothing.
530 + */
531 + if (reiser4_get_super_fake(inode->i_sb) == inode)
532 + return;
533 + if (reiser4_get_cc_fake(inode->i_sb) == inode)
534 + return;
535 + if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
536 + return;
537 + assert("vs-1426", PagePrivate(page));
538 + assert("vs-1427",
539 + page->mapping == jnode_get_mapping(jnode_by_page(page)));
540 + assert("", jprivate(page) != NULL);
541 + assert("", ergo(inode_file_plugin(inode) !=
542 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
543 + offset == 0));
544 +
545 + ctx = reiser4_init_context(inode->i_sb);
546 + if (IS_ERR(ctx))
547 + return;
548 +
549 + node = jprivate(page);
550 + spin_lock_jnode(node);
551 + if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
552 + (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
553 + /* there is no need to capture */
554 + jref(node);
555 + JF_SET(node, JNODE_HEARD_BANSHEE);
556 + page_clear_jnode(page, node);
557 + reiser4_uncapture_jnode(node);
558 + unhash_unformatted_jnode(node);
559 + jput(node);
560 + reiser4_exit_context(ctx);
561 + return;
562 + }
563 + spin_unlock_jnode(node);
564 +
565 + /* capture page being truncated. */
566 + ret = try_capture_page_to_invalidate(page);
567 + if (ret != 0)
568 + warning("nikita-3141", "Cannot capture: %i", ret);
569 +
570 + if (offset == 0) {
571 + /* remove jnode from transaction and detach it from page. */
572 + jref(node);
573 + JF_SET(node, JNODE_HEARD_BANSHEE);
574 + /* page cannot be detached from jnode concurrently, because it
575 + * is locked */
576 + reiser4_uncapture_page(page);
577 +
578 + /* this detaches page from jnode, so that jdelete will not try
579 + * to lock page which is already locked */
580 + spin_lock_jnode(node);
581 + page_clear_jnode(page, node);
582 + spin_unlock_jnode(node);
583 + unhash_unformatted_jnode(node);
584 +
585 + jput(node);
586 + }
587 +
588 + reiser4_exit_context(ctx);
589 +}
590 +
591 +/* helper function called from reiser4_releasepage(). It returns true if the
592 + * jnode can be detached from its page and the page released. */
593 +int jnode_is_releasable(jnode * node/* node to check */)
594 +{
595 + assert("nikita-2781", node != NULL);
596 + assert_spin_locked(&(node->guard));
597 + assert_spin_locked(&(node->load));
598 +
599 + /* if some thread is currently using the jnode page, the latter
600 + * cannot be detached */
601 + if (atomic_read(&node->d_count) != 0)
602 + return 0;
603 +
604 + assert("vs-1214", !jnode_is_loaded(node));
605 +
606 + /*
607 + * can only release the page if a real block number is assigned to
608 + * it. A simple check for ->atom wouldn't do, because it is possible
609 + * for a node to be clean, not in an atom yet, and still have a fake
610 + * block number. For example, a node just created in jinit_new().
611 + */
612 + if (reiser4_blocknr_is_fake(jnode_get_block(node)))
613 + return 0;
614 +
615 + /*
616 + * pages prepared for write can not be released anyway, so avoid
617 + * detaching jnode from the page
618 + */
619 + if (JF_ISSET(node, JNODE_WRITE_PREPARED))
620 + return 0;
621 +
622 + /*
623 + * dirty jnode cannot be released. It can however be submitted to disk
624 + * as part of early flushing, but only after getting flush-prepped.
625 + */
626 + if (JF_ISSET(node, JNODE_DIRTY))
627 + return 0;
628 +
629 + /* overwrite set is only written by log writer. */
630 + if (JF_ISSET(node, JNODE_OVRWR))
631 + return 0;
632 +
633 + /* jnode is already under writeback */
634 + if (JF_ISSET(node, JNODE_WRITEBACK))
635 + return 0;
636 +
637 + /* don't flush bitmaps or journal records */
638 + if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
639 + return 0;
640 +
641 + return 1;
642 +}
643 +
644 +/*
645 + * ->releasepage method for reiser4
646 + *
647 + * This is called by the VM scanner when it comes across a clean page. What
648 + * we have to do here is check whether the page can really be released (freed,
649 + * that is) and, if so, detach the jnode from it and remove the page from the page cache.
650 + *
651 + * The check for releasability is done by jnode_is_releasable().
652 + */
653 +int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
654 +{
655 + jnode *node;
656 +
657 + assert("nikita-2257", PagePrivate(page));
658 + assert("nikita-2259", PageLocked(page));
659 + assert("nikita-2892", !PageWriteback(page));
660 + assert("nikita-3019", reiser4_schedulable());
661 +
662 + /* NOTE-NIKITA: this can be called in the context of a reiser4 call. It
663 + is not clear what to do in this case. A lot of deadlocks seem to be
664 + possible. */
665 +
666 + node = jnode_by_page(page);
667 + assert("nikita-2258", node != NULL);
668 + assert("reiser4-4", page->mapping != NULL);
669 + assert("reiser4-5", page->mapping->host != NULL);
670 +
671 + if (PageDirty(page))
672 + return 0;
673 +
674 + /* extra page reference is used by reiser4 to protect
675 + * jnode<->page link from this ->releasepage(). */
676 + if (page_count(page) > 3)
677 + return 0;
678 +
679 + /* jnode_is_releasable() needs the jnode lock, because it looks at the jnode
680 + * fields, and we need jload_lock here to avoid races with jload(). */
681 + spin_lock_jnode(node);
682 + spin_lock(&(node->load));
683 + if (jnode_is_releasable(node)) {
684 + struct address_space *mapping;
685 +
686 + mapping = page->mapping;
687 + jref(node);
688 + /* there is no need to synchronize against
689 + * jnode_extent_write() here, because pages seen by
690 + * jnode_extent_write() are !releasable(). */
691 + page_clear_jnode(page, node);
692 + spin_unlock(&(node->load));
693 + spin_unlock_jnode(node);
694 +
695 + /* we are under memory pressure so release jnode also. */
696 + jput(node);
697 +
698 + return 1;
699 + } else {
700 + spin_unlock(&(node->load));
701 + spin_unlock_jnode(node);
702 + assert("nikita-3020", reiser4_schedulable());
703 + return 0;
704 + }
705 +}
706 +
707 +int reiser4_readpage(struct file *file, struct page *page)
708 +{
709 + assert("edward-1533", PageLocked(page));
710 + assert("edward-1534", !PageUptodate(page));
711 + assert("edward-1535", page->mapping && page->mapping->host);
712 +
713 + return inode_file_plugin(page->mapping->host)->readpage(file, page);
714 +}
715 +
716 +int reiser4_readpages(struct file *file, struct address_space *mapping,
717 + struct list_head *pages, unsigned nr_pages)
718 +{
719 + return inode_file_plugin(mapping->host)->readpages(file, mapping,
720 + pages, nr_pages);
721 +}
722 +
723 +int reiser4_writepages(struct address_space *mapping,
724 + struct writeback_control *wbc)
725 +{
726 + return inode_file_plugin(mapping->host)->writepages(mapping, wbc);
727 +}
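
Illustrative only, not part of the patch: how the methods defined in this file
plug into the VFS. The table below is a partial, hypothetical sketch; the
actual reiser4 address_space_operations table is defined elsewhere in the
patch.

    static const struct address_space_operations example_reiser4_aops = {
            .readpage       = reiser4_readpage,
            .readpages      = reiser4_readpages,
            .writepages     = reiser4_writepages,
            .set_page_dirty = reiser4_set_page_dirty,
            .invalidatepage = reiser4_invalidatepage,
            .releasepage    = reiser4_releasepage,
    };
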
728 +
729 +/* Make Linus happy.
730 + Local variables:
731 + c-indentation-style: "K&R"
732 + mode-name: "LC"
733 + c-basic-offset: 8
734 + tab-width: 8
735 + fill-column: 120
736 + End:
737 +*/
738 diff -urN linux-2.6.33.orig/fs/reiser4/block_alloc.c linux-2.6.33/fs/reiser4/block_alloc.c
739 --- linux-2.6.33.orig/fs/reiser4/block_alloc.c 1970-01-01 01:00:00.000000000 +0100
740 +++ linux-2.6.33/fs/reiser4/block_alloc.c 2010-03-04 19:33:22.000000000 +0100
741 @@ -0,0 +1,1142 @@
742 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
743 +reiser4/README */
744 +
745 +#include "debug.h"
746 +#include "dformat.h"
747 +#include "plugin/plugin.h"
748 +#include "txnmgr.h"
749 +#include "znode.h"
750 +#include "block_alloc.h"
751 +#include "tree.h"
752 +#include "super.h"
753 +
754 +#include <linux/types.h> /* for __u?? */
755 +#include <linux/fs.h> /* for struct super_block */
756 +#include <linux/spinlock.h>
757 +
758 +/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
759 +
760 +/* We need to be able to reserve enough disk space to ensure that an atomic
761 + operation will have enough disk space to flush (see flush.c and
762 + http://namesys.com/v4/v4.html) and commit it once it is started.
763 +
764 + In our design a call to reserve disk space may fail, but an actual
765 + block allocation never does.
766 +
767 + All free blocks, already allocated blocks, and all kinds of reserved blocks
768 + are counted in different per-fs block counters.
769 +
770 + A reiser4 super block's set of block counters currently is:
771 +
772 + free -- free blocks,
773 + used -- already allocated blocks,
774 +
775 + grabbed -- initially reserved for performing an fs operation; those blocks
776 + are taken from free blocks, then grabbed disk space leaks from the grabbed
777 + blocks counter to other counters like "fake allocated", "flush
778 + reserved" and "used"; the rest of the unused grabbed space is returned
779 + to free space at the end of the fs operation;
780 +
781 + fake allocated -- counts all nodes without real disk block numbers assigned,
782 + we have separate accounting for formatted and unformatted
783 + nodes (for easier debugging);
784 +
785 + flush reserved -- disk space needed for flushing and committing an atom.
786 + Each dirty already allocated block could be written as a
787 + part of atom's overwrite set or as a part of atom's
788 + relocate set. In both cases one additional block is needed;
789 + it is used as a wandered block if we do overwrite or as a
790 + new location for a relocated block.
791 +
792 + In addition, blocks in some states are counted on per-thread and per-atom
793 + basis. A reiser4 context has a counter of blocks grabbed by this transaction
794 + and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
795 + of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
796 + blocks, which are reserved for flush processing and atom commit. */
797 +
798 +/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
799 + number of blocks to grab for most expensive case of balancing when the leaf
800 + node we insert new item to gets split and new leaf node is allocated.
801 +
802 + So, we need to grab blocks for
803 +
804 + 1) one block for possibly dirtying the node we insert an item into. That
805 + block would be used for node relocation at flush time or for allocating a
806 + wandered one, depending on the result (which set, relocate or overwrite,
807 + the node gets assigned to) of the node's processing by the flush
808 + algorithm.
809 +
810 + 2) one block for either allocating a new node, or dirtying the right or left
811 + clean neighbor; only one of these cases may happen.
812 +
813 + VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying
814 + of left neighbor, right neighbor, current node, and creation of new node.
815 + Have I forgotten something? email me.
816 +
817 + These grabbed blocks are counted in both reiser4 context "grabbed blocks"
818 + counter and in the fs-wide one (both ctx->grabbed_blocks and
819 + sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
820 + decremented by 2.
821 +
822 + Suppose both blocks were spent: one for dirtying an already allocated clean
823 + node (that block went from "grabbed" to "flush reserved") and one for a new
824 + block allocation (that block went from "grabbed" to "fake allocated formatted").
825 +
826 + If inserting a child pointer into the parent node causes the parent to be
827 + split, the balancing code takes care of this by grabbing the necessary space
828 + immediately, calling reiser4_grab with the BA_RESERVED flag set, which means
829 + "can use the 5% reserved disk space".
830 +
831 + At this moment insertion completes and grabbed blocks (if they were not used)
832 + should be returned to the free space counter.
833 +
834 + However the atom life-cycle is not completed. The atom had one "flush
835 + reserved" block added by our insertion and the new fake allocated node is
836 + counted as a "fake allocated formatted" one. The atom has to be fully
837 + processed by flush before commit. Suppose that the flush moved the first,
838 + already allocated node to the atom's overwrite list, the new fake allocated
839 + node, obviously, went into the atom relocate set. The reiser4 flush
840 + allocates the new node using one unit from the "fake allocated formatted"
841 + counter; the log writer uses one from "flush reserved" for wandered block
842 + allocation.
843 +
844 + And that is not the end. When the wandered block is deallocated after the
845 + atom gets fully played (see wander.c for a description of the term), the
846 + disk space occupied by it is returned to free blocks. */
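
Illustrative only, not part of the patch: the call pattern the scheme above
implies for a single fs operation. example_operation and the block count are
hypothetical; error handling is trimmed.

    static int example_operation(void)
    {
            int ret;

            /* reserve two blocks up front; this may fail with -ENOSPC */
            ret = reiser4_grab_space(2, BA_CAN_COMMIT);
            if (ret)
                    return ret;

            /* ... perform the tree modification; grabbed blocks leak into
             * "fake allocated" / "flush reserved" / "used" as needed ... */

            /* return whatever was not consumed to the free counter */
            all_grabbed2free();
            return 0;
    }
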
847 +
848 +/* BLOCK NUMBERS */
849 +
850 +/* Any reiser4 node has a block number assigned to it. We use these numbers for
851 + indexing in hash tables, so if a block has not yet been assigned a location
852 + on disk we need to give it a temporary fake block number.
853 +
854 + The current implementation of reiser4 uses 64-bit integers for block numbers.
855 + We use the highest bit in the 64-bit block number to distinguish fake and real
856 + block numbers. So, only 63 bits may be used for addressing real device
857 + blocks. The "fake" block number space is divided into subspaces of fake
858 + block numbers for data blocks and for shadow (working) bitmap blocks.
859 +
860 + Fake block numbers for data blocks are generated by a cyclic counter, which
861 + gets incremented after each real block allocation. We assume that it is
862 + impossible to overflow this counter during the life of one transaction. */
863 +
864 +/* Initialize a blocknr hint. */
865 +void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
866 +{
867 + memset(hint, 0, sizeof(reiser4_blocknr_hint));
868 +}
869 +
870 +/* Release any resources of a blocknr hint. */
871 +void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
872 +{
873 +/* No resources should be freed in current blocknr_hint implementation. */
874 +}
875 +
876 +/* see above for explanation of fake block number. */
877 +/* Audited by: green(2002.06.11) */
878 +int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
879 +{
880 + /* The reason for not simply returning result of '&' operation is that
881 + while return value is (possibly 32bit) int, the reiser4_block_nr is
882 + at least 64 bits long, and high bit (which is the only possible
883 + non zero bit after the masking) would be stripped off */
884 + return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
885 +}
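
In other words, the fake/real distinction lives entirely in the top bit of the
64-bit block number. A tiny illustrative check, not part of the patch (the
function and assertion labels are hypothetical):

    static void example_fake_blocknr(void)
    {
            reiser4_block_nr real = 0x1000ULL;
            reiser4_block_nr fake = real | REISER4_FAKE_BLOCKNR_BIT_MASK;

            assert("example-1", !reiser4_blocknr_is_fake(&real));
            assert("example-2", reiser4_blocknr_is_fake(&fake));
    }
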
886 +
887 +/* Static functions for <reiser4 super block>/<reiser4 context> block counters
888 + arithmetic. Mostly, they are isolated so that the same assertions are not
889 + coded in several places. */
890 +static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
891 +{
892 + BUG_ON(ctx->grabbed_blocks < count);
893 + assert("zam-527", ctx->grabbed_blocks >= count);
894 + ctx->grabbed_blocks -= count;
895 +}
896 +
897 +static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
898 +{
899 + ctx->grabbed_blocks += count;
900 +}
901 +
902 +static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
903 +{
904 + assert("zam-525", sbinfo->blocks_grabbed >= count);
905 + sbinfo->blocks_grabbed -= count;
906 +}
907 +
908 +/* Decrease the counter of block reserved for flush in super block. */
909 +static void
910 +sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
911 +{
912 + assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
913 + sbinfo->blocks_flush_reserved -= count;
914 +}
915 +
916 +static void
917 +sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
918 + reiser4_ba_flags_t flags)
919 +{
920 + if (flags & BA_FORMATTED) {
921 + assert("zam-806", sbinfo->blocks_fake_allocated >= count);
922 + sbinfo->blocks_fake_allocated -= count;
923 + } else {
924 + assert("zam-528",
925 + sbinfo->blocks_fake_allocated_unformatted >= count);
926 + sbinfo->blocks_fake_allocated_unformatted -= count;
927 + }
928 +}
929 +
930 +static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
931 +{
932 + assert("zam-530",
933 + sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
934 + sbinfo->blocks_used -= count;
935 +}
936 +
937 +static void
938 +sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
939 +{
940 + assert("edward-501", sbinfo->blocks_clustered >= count);
941 + sbinfo->blocks_clustered -= count;
942 +}
943 +
944 +/* Increase the counter of block reserved for flush in atom. */
945 +static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
946 +{
947 + assert("zam-772", atom != NULL);
948 + assert_spin_locked(&(atom->alock));
949 + atom->flush_reserved += count;
950 +}
951 +
952 +/* Decrease the counter of block reserved for flush in atom. */
953 +static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
954 +{
955 + assert("zam-774", atom != NULL);
956 + assert_spin_locked(&(atom->alock));
957 + assert("nikita-2790", atom->flush_reserved >= count);
958 + atom->flush_reserved -= count;
959 +}
960 +
961 +/* The super block has several counters: free, used, grabbed, fake allocated
962 + (formatted and unformatted), flush reserved and clustered. Their sum must
963 + equal the number of blocks on the device; this function checks that. */
964 +int reiser4_check_block_counters(const struct super_block *super)
965 +{
966 + __u64 sum;
967 +
968 + sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
969 + reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
970 + reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
971 + reiser4_clustered_blocks(super);
972 + if (reiser4_block_count(super) != sum) {
973 + printk("super block counters: "
974 + "used %llu, free %llu, "
975 + "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
976 + "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
977 + (unsigned long long)reiser4_data_blocks(super),
978 + (unsigned long long)reiser4_free_blocks(super),
979 + (unsigned long long)reiser4_grabbed_blocks(super),
980 + (unsigned long long)reiser4_fake_allocated(super),
981 + (unsigned long long)
982 + reiser4_fake_allocated_unformatted(super),
983 + (unsigned long long)reiser4_flush_reserved(super),
984 + (unsigned long long)reiser4_clustered_blocks(super),
985 + (unsigned long long)sum,
986 + (unsigned long long)reiser4_block_count(super));
987 + return 0;
988 + }
989 + return 1;
990 +}
991 +
992 +/* Adjust "working" free blocks counter for number of blocks we are going to
993 + allocate. Record number of grabbed blocks in fs-wide and per-thread
994 + counters. This function should be called before bitmap scanning or
995 + allocating fake block numbers
996 +
997 + @super -- pointer to reiser4 super block;
998 + @count -- number of blocks we reserve;
999 +
1000 + @return -- 0 on success, -ENOSPC if all
1001 + free blocks are reserved or already allocated.
1002 +*/
1003 +
1004 +static int
1005 +reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
1006 +{
1007 + __u64 free_blocks;
1008 + int ret = 0, use_reserved = flags & BA_RESERVED;
1009 + reiser4_super_info_data *sbinfo;
1010 +
1011 + assert("vs-1276", ctx == get_current_context());
1012 +
1013 + /* Do not grab anything on ro-mounted fs. */
1014 + if (rofs_super(ctx->super)) {
1015 + ctx->grab_enabled = 0;
1016 + return 0;
1017 + }
1018 +
1019 + sbinfo = get_super_private(ctx->super);
1020 +
1021 + spin_lock_reiser4_super(sbinfo);
1022 +
1023 + free_blocks = sbinfo->blocks_free;
1024 +
1025 + if ((use_reserved && free_blocks < count) ||
1026 + (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
1027 + ret = RETERR(-ENOSPC);
1028 + goto unlock_and_ret;
1029 + }
1030 +
1031 + add_to_ctx_grabbed(ctx, count);
1032 +
1033 + sbinfo->blocks_grabbed += count;
1034 + sbinfo->blocks_free -= count;
1035 +
1036 +#if REISER4_DEBUG
1037 + if (ctx->grabbed_initially == 0)
1038 + ctx->grabbed_initially = count;
1039 +#endif
1040 +
1041 + assert("nikita-2986", reiser4_check_block_counters(ctx->super));
1042 +
1043 + /* disable grab space in current context */
1044 + ctx->grab_enabled = 0;
1045 +
1046 +unlock_and_ret:
1047 + spin_unlock_reiser4_super(sbinfo);
1048 +
1049 + return ret;
1050 +}
1051 +
1052 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
1053 +{
1054 + int ret;
1055 + reiser4_context *ctx;
1056 +
1057 + assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
1058 + lock_stack_isclean(get_current_lock_stack
1059 + ())));
1060 + ctx = get_current_context();
1061 + if (!(flags & BA_FORCE) && !is_grab_enabled(ctx))
1062 + return 0;
1063 +
1064 + ret = reiser4_grab(ctx, count, flags);
1065 + if (ret == -ENOSPC) {
1066 +
1067 + /* Try to commit all transactions if the BA_CAN_COMMIT flag is
1068 + present */
1069 + if (flags & BA_CAN_COMMIT) {
1070 + txnmgr_force_commit_all(ctx->super, 0);
1071 + ctx->grab_enabled = 1;
1072 + ret = reiser4_grab(ctx, count, flags);
1073 + }
1074 + }
1075 + /*
1076 + * allocation from the reserved pool cannot fail. This is a severe error.
1077 + */
1078 + assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
1079 + return ret;
1080 +}
1081 +
1082 +/*
1083 + * SPACE RESERVED FOR UNLINK/TRUNCATE
1084 + *
1085 + * Unlink and truncate require space in a transaction (to update stat data, at
1086 + * least). But we don't want rm(1) to fail with "No space left on device".
1087 + *
1088 + * Solution is to reserve 5% of disk space for truncates and
1089 + * unlinks. Specifically, normal space grabbing requests don't grab space from
1090 + * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
1091 + * drain it. Per super block delete mutex is used to allow only one
1092 + * thread at a time to grab from reserved area.
1093 + *
1094 + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
1095 + * flag.
1096 + *
1097 + */
1098 +
1099 +int reiser4_grab_reserved(struct super_block *super,
1100 + __u64 count, reiser4_ba_flags_t flags)
1101 +{
1102 + reiser4_super_info_data *sbinfo = get_super_private(super);
1103 +
1104 + assert("nikita-3175", flags & BA_CAN_COMMIT);
1105 +
1106 + /* Check whether the delete mutex is already taken by us; we assume
1107 + * that reading a machine word is atomic. */
1108 + if (sbinfo->delete_mutex_owner == current) {
1109 + if (reiser4_grab_space
1110 + (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
1111 + warning("zam-1003",
1112 + "nested call of grab_reserved fails count=(%llu)",
1113 + (unsigned long long)count);
1114 + reiser4_release_reserved(super);
1115 + return RETERR(-ENOSPC);
1116 + }
1117 + return 0;
1118 + }
1119 +
1120 + if (reiser4_grab_space(count, flags)) {
1121 + mutex_lock(&sbinfo->delete_mutex);
1122 + assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
1123 + sbinfo->delete_mutex_owner = current;
1124 +
1125 + if (reiser4_grab_space(count, flags | BA_RESERVED)) {
1126 + warning("zam-833",
1127 + "reserved space is not enough (%llu)",
1128 + (unsigned long long)count);
1129 + reiser4_release_reserved(super);
1130 + return RETERR(-ENOSPC);
1131 + }
1132 + }
1133 + return 0;
1134 +}
1135 +
1136 +void reiser4_release_reserved(struct super_block *super)
1137 +{
1138 + reiser4_super_info_data *info;
1139 +
1140 + info = get_super_private(super);
1141 + if (info->delete_mutex_owner == current) {
1142 + info->delete_mutex_owner = NULL;
1143 + mutex_unlock(&info->delete_mutex);
1144 + }
1145 +}
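
Illustrative only, not part of the patch: how an unlink path might use the
reserved-area API above. myfs_unlink_prepare and @needed are hypothetical;
note that reiser4_grab_reserved() already calls reiser4_release_reserved()
itself on failure.

    static int myfs_unlink_prepare(struct super_block *super, __u64 needed)
    {
            int ret;

            /* BA_CAN_COMMIT is mandatory here (see the assertion above) */
            ret = reiser4_grab_reserved(super, needed, BA_CAN_COMMIT);
            if (ret)
                    return ret;     /* -ENOSPC even from the reserved pool */

            /* ... update stat data, cut tree items, etc. ... */

            /* drop the delete mutex if this thread took it */
            reiser4_release_reserved(super);
            return 0;
    }
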
1146 +
1147 +static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
1148 +{
1149 + reiser4_context *ctx;
1150 + reiser4_super_info_data *sbinfo;
1151 +
1152 + ctx = get_current_context();
1153 + sub_from_ctx_grabbed(ctx, count);
1154 +
1155 + sbinfo = get_super_private(ctx->super);
1156 + spin_lock_reiser4_super(sbinfo);
1157 +
1158 + sub_from_sb_grabbed(sbinfo, count);
1159 + /* return sbinfo locked */
1160 + return sbinfo;
1161 +}
1162 +
1163 +/* is called after @count fake block numbers are allocated and pointers to
1164 + those blocks are inserted into the tree. */
1165 +static void grabbed2fake_allocated_formatted(void)
1166 +{
1167 + reiser4_super_info_data *sbinfo;
1168 +
1169 + sbinfo = grabbed2fake_allocated_head(1);
1170 + sbinfo->blocks_fake_allocated++;
1171 +
1172 + assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
1173 +
1174 + spin_unlock_reiser4_super(sbinfo);
1175 +}
1176 +
1177 +/**
1178 + * grabbed2fake_allocated_unformatted
1179 + * @count: number of blocks moved from "grabbed" to "fake allocated unformatted"
1180 + *
1181 + */
1182 +static void grabbed2fake_allocated_unformatted(int count)
1183 +{
1184 + reiser4_super_info_data *sbinfo;
1185 +
1186 + sbinfo = grabbed2fake_allocated_head(count);
1187 + sbinfo->blocks_fake_allocated_unformatted += count;
1188 +
1189 + assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
1190 +
1191 + spin_unlock_reiser4_super(sbinfo);
1192 +}
1193 +
1194 +void grabbed2cluster_reserved(int count)
1195 +{
1196 + reiser4_context *ctx;
1197 + reiser4_super_info_data *sbinfo;
1198 +
1199 + ctx = get_current_context();
1200 + sub_from_ctx_grabbed(ctx, count);
1201 +
1202 + sbinfo = get_super_private(ctx->super);
1203 + spin_lock_reiser4_super(sbinfo);
1204 +
1205 + sub_from_sb_grabbed(sbinfo, count);
1206 + sbinfo->blocks_clustered += count;
1207 +
1208 + assert("edward-504", reiser4_check_block_counters(ctx->super));
1209 +
1210 + spin_unlock_reiser4_super(sbinfo);
1211 +}
1212 +
1213 +void cluster_reserved2grabbed(int count)
1214 +{
1215 + reiser4_context *ctx;
1216 + reiser4_super_info_data *sbinfo;
1217 +
1218 + ctx = get_current_context();
1219 +
1220 + sbinfo = get_super_private(ctx->super);
1221 + spin_lock_reiser4_super(sbinfo);
1222 +
1223 + sub_from_cluster_reserved(sbinfo, count);
1224 + sbinfo->blocks_grabbed += count;
1225 +
1226 + assert("edward-505", reiser4_check_block_counters(ctx->super));
1227 +
1228 + spin_unlock_reiser4_super(sbinfo);
1229 + add_to_ctx_grabbed(ctx, count);
1230 +}
1231 +
1232 +void cluster_reserved2free(int count)
1233 +{
1234 + reiser4_context *ctx;
1235 + reiser4_super_info_data *sbinfo;
1236 +
1237 + ctx = get_current_context();
1238 + sbinfo = get_super_private(ctx->super);
1239 +
1240 + cluster_reserved2grabbed(count);
1241 + grabbed2free(ctx, sbinfo, count);
1242 +}
1243 +
1244 +static DEFINE_SPINLOCK(fake_lock);
1245 +static reiser4_block_nr fake_gen = 0;
1246 +
1247 +/**
1248 + * assign_fake_blocknr
1249 + * @blocknr: where to store the first fake block number obtained
1250 + * @count: number of consecutive fake block numbers to reserve
1251 + *
1252 + * Obtain a fake block number for new node which will be used to refer to
1253 + * this newly allocated node until real allocation is done.
1254 + */
1255 +static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1256 +{
1257 + spin_lock(&fake_lock);
1258 + *blocknr = fake_gen;
1259 + fake_gen += count;
1260 + spin_unlock(&fake_lock);
1261 +
1262 + BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1263 + /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1264 + *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1265 + assert("zam-394", zlook(current_tree, blocknr) == NULL);
1266 +}
1267 +
1268 +int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1269 +{
1270 + assign_fake_blocknr(blocknr, 1);
1271 + grabbed2fake_allocated_formatted();
1272 + return 0;
1273 +}
1274 +
1275 +/**
1276 + * fake_blocknr_unformatted
1277 + * @count: number of fake numbers to get
1278 + *
1279 + * Allocates @count fake block numbers which will be assigned to jnodes
1280 + */
1281 +reiser4_block_nr fake_blocknr_unformatted(int count)
1282 +{
1283 + reiser4_block_nr blocknr;
1284 +
1285 + assign_fake_blocknr(&blocknr, count);
1286 + grabbed2fake_allocated_unformatted(count);
1287 +
1288 + return blocknr;
1289 +}
1290 +
1291 +/* adjust sb block counters, if real (on-disk) block allocation immediately
1292 + follows grabbing of free disk space. */
1293 +static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1294 + __u64 count)
1295 +{
1296 + sub_from_ctx_grabbed(ctx, count);
1297 +
1298 + spin_lock_reiser4_super(sbinfo);
1299 +
1300 + sub_from_sb_grabbed(sbinfo, count);
1301 + sbinfo->blocks_used += count;
1302 +
1303 + assert("nikita-2679", reiser4_check_block_counters(ctx->super));
1304 +
1305 + spin_unlock_reiser4_super(sbinfo);
1306 +}
1307 +
1308 +/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1309 +static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1310 + reiser4_ba_flags_t flags)
1311 +{
1312 + spin_lock_reiser4_super(sbinfo);
1313 +
1314 + sub_from_sb_fake_allocated(sbinfo, count, flags);
1315 + sbinfo->blocks_used += count;
1316 +
1317 + assert("nikita-2680",
1318 + reiser4_check_block_counters(reiser4_get_current_sb()));
1319 +
1320 + spin_unlock_reiser4_super(sbinfo);
1321 +}
1322 +
1323 +static void flush_reserved2used(txn_atom * atom, __u64 count)
1324 +{
1325 + reiser4_super_info_data *sbinfo;
1326 +
1327 + assert("zam-787", atom != NULL);
1328 + assert_spin_locked(&(atom->alock));
1329 +
1330 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1331 +
1332 + sbinfo = get_current_super_private();
1333 + spin_lock_reiser4_super(sbinfo);
1334 +
1335 + sub_from_sb_flush_reserved(sbinfo, count);
1336 + sbinfo->blocks_used += count;
1337 +
1338 + assert("zam-789",
1339 + reiser4_check_block_counters(reiser4_get_current_sb()));
1340 +
1341 + spin_unlock_reiser4_super(sbinfo);
1342 +}
1343 +
1344 +/* update the per fs blocknr hint default value. */
1345 +void
1346 +update_blocknr_hint_default(const struct super_block *s,
1347 + const reiser4_block_nr * block)
1348 +{
1349 + reiser4_super_info_data *sbinfo = get_super_private(s);
1350 +
1351 + assert("nikita-3342", !reiser4_blocknr_is_fake(block));
1352 +
1353 + spin_lock_reiser4_super(sbinfo);
1354 + if (*block < sbinfo->block_count) {
1355 + sbinfo->blocknr_hint_default = *block;
1356 + } else {
1357 + warning("zam-676",
1358 + "block number %llu is too large to be used in a blocknr hint\n",
1359 + (unsigned long long)*block);
1360 + dump_stack();
1361 + DEBUGON(1);
1362 + }
1363 + spin_unlock_reiser4_super(sbinfo);
1364 +}
1365 +
1366 +/* get current value of the default blocknr hint. */
1367 +void get_blocknr_hint_default(reiser4_block_nr * result)
1368 +{
1369 + reiser4_super_info_data *sbinfo = get_current_super_private();
1370 +
1371 + spin_lock_reiser4_super(sbinfo);
1372 + *result = sbinfo->blocknr_hint_default;
1373 + assert("zam-677", *result < sbinfo->block_count);
1374 + spin_unlock_reiser4_super(sbinfo);
1375 +}
1376 +
1377 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
1378 + * method. Blocks are allocated in one contiguous disk region. The plugin
1379 + * independent part accounts blocks by subtracting allocated amount from grabbed
1380 + * or fake block counter and add the same amount to the counter of allocated
1381 + * blocks.
1382 + *
1383 + * @hint -- a reiser4 blocknr hint object which contains further block
1384 + * allocation hints and parameters (search start, a stage of block
1385 + * which will be mapped to disk, etc.),
1386 + * @blk -- an out parameter for the beginning of the allocated region,
1387 + * @len -- in/out parameter, it should contain the maximum number of allocated
1388 + * blocks, after block allocation completes, it contains the length of
1389 + * allocated disk region.
1390 + * @flags -- see reiser4_ba_flags_t description.
1391 + *
1392 + * @return -- 0 if success, error code otherwise.
1393 + */
1394 +int
1395 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1396 + reiser4_block_nr * len, reiser4_ba_flags_t flags)
1397 +{
1398 + __u64 needed = *len;
1399 + reiser4_context *ctx;
1400 + reiser4_super_info_data *sbinfo;
1401 + int ret;
1402 +
1403 + assert("zam-986", hint != NULL);
1404 +
1405 + ctx = get_current_context();
1406 + sbinfo = get_super_private(ctx->super);
1407 +
1408 + /* For write-optimized data we use default search start value, which is
1409 + * close to last write location. */
1410 + if (flags & BA_USE_DEFAULT_SEARCH_START)
1411 + get_blocknr_hint_default(&hint->blk);
1412 +
1413 + /* VITALY: allocator should grab this for internal/tx-lists/similar
1414 + only. */
1415 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)?*/
1416 + if (hint->block_stage == BLOCK_NOT_COUNTED) {
1417 + ret = reiser4_grab_space_force(*len, flags);
1418 + if (ret != 0)
1419 + return ret;
1420 + }
1421 +
1422 + ret =
1423 + sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
1424 + hint, (int)needed, blk, len);
1425 +
1426 + if (!ret) {
1427 + assert("zam-680", *blk < reiser4_block_count(ctx->super));
1428 + assert("zam-681",
1429 + *blk + *len <= reiser4_block_count(ctx->super));
1430 +
1431 + if (flags & BA_PERMANENT) {
1432 + /* we assume that current atom exists at this moment */
1433 + txn_atom *atom = get_current_atom_locked();
1434 + atom->nr_blocks_allocated += *len;
1435 + spin_unlock_atom(atom);
1436 + }
1437 +
1438 + switch (hint->block_stage) {
1439 + case BLOCK_NOT_COUNTED:
1440 + case BLOCK_GRABBED:
1441 + grabbed2used(ctx, sbinfo, *len);
1442 + break;
1443 + case BLOCK_UNALLOCATED:
1444 + fake_allocated2used(sbinfo, *len, flags);
1445 + break;
1446 + case BLOCK_FLUSH_RESERVED:
1447 + {
1448 + txn_atom *atom = get_current_atom_locked();
1449 + flush_reserved2used(atom, *len);
1450 + spin_unlock_atom(atom);
1451 + }
1452 + break;
1453 + default:
1454 + impossible("zam-531", "wrong block stage");
1455 + }
1456 + } else {
1457 + assert("zam-821",
1458 + ergo(hint->max_dist == 0
1459 + && !hint->backward, ret != -ENOSPC));
1460 + if (hint->block_stage == BLOCK_NOT_COUNTED)
1461 + grabbed2free(ctx, sbinfo, needed);
1462 + }
1463 +
1464 + return ret;
1465 +}
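
Illustrative only, not part of the patch: a caller asking for up to 16
contiguous blocks of previously grabbed space. example_alloc, the block count
and the flag choice are hypothetical.

    static int example_alloc(void)
    {
            reiser4_blocknr_hint hint;
            reiser4_block_nr blk;
            reiser4_block_nr len = 16;      /* in: maximum; out: actual */
            int ret;

            reiser4_blocknr_hint_init(&hint);
            hint.block_stage = BLOCK_GRABBED;       /* space grabbed earlier */

            ret = reiser4_alloc_blocks(&hint, &blk, &len,
                                       BA_USE_DEFAULT_SEARCH_START);
            reiser4_blocknr_hint_done(&hint);
            if (ret)
                    return ret;
            /* [blk, blk + len) is now allocated on disk */
            return 0;
    }
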
1466 +
1467 +/* used -> fake_allocated -> grabbed -> free */
1468 +
1469 +/* adjust sb block counters when @count unallocated blocks get unmapped from
1470 + disk */
1471 +static void
1472 +used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1473 + int formatted)
1474 +{
1475 + spin_lock_reiser4_super(sbinfo);
1476 +
1477 + if (formatted)
1478 + sbinfo->blocks_fake_allocated += count;
1479 + else
1480 + sbinfo->blocks_fake_allocated_unformatted += count;
1481 +
1482 + sub_from_sb_used(sbinfo, count);
1483 +
1484 + assert("nikita-2681",
1485 + reiser4_check_block_counters(reiser4_get_current_sb()));
1486 +
1487 + spin_unlock_reiser4_super(sbinfo);
1488 +}
1489 +
1490 +static void
1491 +used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1492 + __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1493 +{
1494 + assert("nikita-2791", atom != NULL);
1495 + assert_spin_locked(&(atom->alock));
1496 +
1497 + add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1498 +
1499 + spin_lock_reiser4_super(sbinfo);
1500 +
1501 + sbinfo->blocks_flush_reserved += count;
1502 + /*add_to_sb_flush_reserved(sbinfo, count); */
1503 + sub_from_sb_used(sbinfo, count);
1504 +
1505 + assert("nikita-2681",
1506 + reiser4_check_block_counters(reiser4_get_current_sb()));
1507 +
1508 + spin_unlock_reiser4_super(sbinfo);
1509 +}
1510 +
1511 +/* disk space virtually used by fake block numbers is counted as "grabbed"
1512 + again. */
1513 +static void
1514 +fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1515 + __u64 count, reiser4_ba_flags_t flags)
1516 +{
1517 + add_to_ctx_grabbed(ctx, count);
1518 +
1519 + spin_lock_reiser4_super(sbinfo);
1520 +
1521 + assert("nikita-2682", reiser4_check_block_counters(ctx->super));
1522 +
1523 + sbinfo->blocks_grabbed += count;
1524 + sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1525 +
1526 + assert("nikita-2683", reiser4_check_block_counters(ctx->super));
1527 +
1528 + spin_unlock_reiser4_super(sbinfo);
1529 +}
1530 +
1531 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1532 +{
1533 + reiser4_context *ctx;
1534 + reiser4_super_info_data *sbinfo;
1535 +
1536 + ctx = get_current_context();
1537 + sbinfo = get_super_private(ctx->super);
1538 +
1539 + fake_allocated2grabbed(ctx, sbinfo, count, flags);
1540 + grabbed2free(ctx, sbinfo, count);
1541 +}
1542 +
1543 +void grabbed2free_mark(__u64 mark)
1544 +{
1545 + reiser4_context *ctx;
1546 + reiser4_super_info_data *sbinfo;
1547 +
1548 + ctx = get_current_context();
1549 + sbinfo = get_super_private(ctx->super);
1550 +
1551 + assert("nikita-3007", (__s64) mark >= 0);
1552 + assert("nikita-3006", ctx->grabbed_blocks >= mark);
1553 + grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1554 +}
1555 +
1556 +/**
1557 + * grabbed2free - adjust grabbed and free block counters
1558 + * @ctx: context to update grabbed block counter of
1559 + * @sbinfo: super block to update grabbed and free block counters of
1560 + * @count: number of blocks to adjust counters by
1561 + *
1562 + * Decreases context's and per filesystem's counters of grabbed
1563 + * blocks. Increases per filesystem's counter of free blocks.
1564 + */
1565 +void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1566 + __u64 count)
1567 +{
1568 + sub_from_ctx_grabbed(ctx, count);
1569 +
1570 + spin_lock_reiser4_super(sbinfo);
1571 +
1572 + sub_from_sb_grabbed(sbinfo, count);
1573 + sbinfo->blocks_free += count;
1574 + assert("nikita-2684", reiser4_check_block_counters(ctx->super));
1575 +
1576 + spin_unlock_reiser4_super(sbinfo);
1577 +}
1578 +
1579 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1580 +{
1581 + reiser4_context *ctx;
1582 + reiser4_super_info_data *sbinfo;
1583 +
1584 + assert("vs-1095", atom);
1585 +
1586 + ctx = get_current_context();
1587 + sbinfo = get_super_private(ctx->super);
1588 +
1589 + sub_from_ctx_grabbed(ctx, count);
1590 +
1591 + add_to_atom_flush_reserved_nolock(atom, count);
1592 +
1593 + spin_lock_reiser4_super(sbinfo);
1594 +
1595 + sbinfo->blocks_flush_reserved += count;
1596 + sub_from_sb_grabbed(sbinfo, count);
1597 +
1598 + assert("vpf-292", reiser4_check_block_counters(ctx->super));
1599 +
1600 + spin_unlock_reiser4_super(sbinfo);
1601 +}
1602 +
1603 +void grabbed2flush_reserved(__u64 count)
1604 +{
1605 + txn_atom *atom = get_current_atom_locked();
1606 +
1607 + grabbed2flush_reserved_nolock(atom, count);
1608 +
1609 + spin_unlock_atom(atom);
1610 +}
1611 +
1612 +void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1613 +{
1614 + reiser4_context *ctx;
1615 + reiser4_super_info_data *sbinfo;
1616 +
1617 + assert("nikita-2788", atom != NULL);
1618 + assert_spin_locked(&(atom->alock));
1619 +
1620 + ctx = get_current_context();
1621 + sbinfo = get_super_private(ctx->super);
1622 +
1623 + add_to_ctx_grabbed(ctx, count);
1624 +
1625 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1626 +
1627 + spin_lock_reiser4_super(sbinfo);
1628 +
1629 + sbinfo->blocks_grabbed += count;
1630 + sub_from_sb_flush_reserved(sbinfo, count);
1631 +
1632 + assert("vpf-292", reiser4_check_block_counters(ctx->super));
1633 +
1634 + spin_unlock_reiser4_super(sbinfo);
1635 +}
1636 +
1637 +/**
1638 + * all_grabbed2free - releases all blocks grabbed in context
1639 + *
1640 + * Decreases context's and super block's grabbed block counters by number of
1641 + * blocks grabbed by current context and increases super block's free block
1642 + * counter correspondingly.
1643 + */
1644 +void all_grabbed2free(void)
1645 +{
1646 + reiser4_context *ctx = get_current_context();
1647 +
1648 + grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1649 +}
1650 +
1651 +/* adjust sb block counters if real (on-disk) blocks do not become unallocated
1652 + after freeing; @count blocks become "grabbed" instead. */
1653 +static void
1654 +used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1655 + __u64 count)
1656 +{
1657 + add_to_ctx_grabbed(ctx, count);
1658 +
1659 + spin_lock_reiser4_super(sbinfo);
1660 +
1661 + sbinfo->blocks_grabbed += count;
1662 + sub_from_sb_used(sbinfo, count);
1663 +
1664 + assert("nikita-2685", reiser4_check_block_counters(ctx->super));
1665 +
1666 + spin_unlock_reiser4_super(sbinfo);
1667 +}
1668 +
1669 +/* this used to be done through used2grabbed and grabbed2free*/
1670 +static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1671 +{
1672 + spin_lock_reiser4_super(sbinfo);
1673 +
1674 + sbinfo->blocks_free += count;
1675 + sub_from_sb_used(sbinfo, count);
1676 +
1677 + assert("nikita-2685",
1678 + reiser4_check_block_counters(reiser4_get_current_sb()));
1679 +
1680 + spin_unlock_reiser4_super(sbinfo);
1681 +}
1682 +
1683 +#if REISER4_DEBUG
1684 +
1685 +/* check "allocated" state of given block range */
1686 +static void
1687 +reiser4_check_blocks(const reiser4_block_nr * start,
1688 + const reiser4_block_nr * len, int desired)
1689 +{
1690 + sa_check_blocks(start, len, desired);
1691 +}
1692 +
1693 +/* check "allocated" state of given block */
1694 +void reiser4_check_block(const reiser4_block_nr * block, int desired)
1695 +{
1696 + const reiser4_block_nr one = 1;
1697 +
1698 + reiser4_check_blocks(block, &one, desired);
1699 +}
1700 +
1701 +#endif
1702 +
1703 +/* The block deallocation function may either do an actual deallocation
1704 +   through the space plugin or store deleted block numbers in the atom's
1705 +   delete_set data structure, depending on the BA_DEFER flag. */
1706 +
1707 +/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks
1708 + which will be deleted from WORKING bitmap. They might be just unmapped from
1709 + disk, or freed but disk space is still grabbed by current thread, or these
1710 + blocks must not be counted in any reiser4 sb block counters,
1711 + see block_stage_t comment */
1712 +
1713 +/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
1714 +   distinguish blocks allocated for unformatted and formatted nodes */
1715 +
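+/* Illustrative sketch, not part of the original patch: a deferred
+   deallocation of a single formatted block might look like
+
+	const reiser4_block_nr blk = some_block;	/* hypothetical */
+	const reiser4_block_nr one = 1;
+
+	ret = reiser4_dealloc_blocks(&blk, &one, BLOCK_NOT_COUNTED,
+				     BA_DEFER | BA_FORMATTED);
+
+   With BA_DEFER set, the numbers only go into the atom's delete_set; the
+   working bitmap is updated later, in reiser4_post_commit_hook(), and
+   @target_stage is not consulted on this path. */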
1716 +int
1717 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
1718 + const reiser4_block_nr * len,
1719 + block_stage_t target_stage, reiser4_ba_flags_t flags)
1720 +{
1721 + txn_atom *atom = NULL;
1722 + int ret;
1723 + reiser4_context *ctx;
1724 + reiser4_super_info_data *sbinfo;
1725 +
1726 + ctx = get_current_context();
1727 + sbinfo = get_super_private(ctx->super);
1728 +
1729 + if (REISER4_DEBUG) {
1730 + assert("zam-431", *len != 0);
1731 + assert("zam-432", *start != 0);
1732 + assert("zam-558", !reiser4_blocknr_is_fake(start));
1733 +
1734 + spin_lock_reiser4_super(sbinfo);
1735 + assert("zam-562", *start < sbinfo->block_count);
1736 + spin_unlock_reiser4_super(sbinfo);
1737 + }
1738 +
1739 + if (flags & BA_DEFER) {
1740 + blocknr_set_entry *bsep = NULL;
1741 +
1742 + /* storing deleted block numbers in a blocknr set
1743 +			   data structure for further actual deletion */
1744 + do {
1745 + atom = get_current_atom_locked();
1746 + assert("zam-430", atom != NULL);
1747 +
1748 + ret =
1749 + blocknr_set_add_extent(atom, &atom->delete_set,
1750 + &bsep, start, len);
1751 +
1752 + if (ret == -ENOMEM)
1753 + return ret;
1754 +
1755 + /* This loop might spin at most two times */
1756 + } while (ret == -E_REPEAT);
1757 +
1758 + assert("zam-477", ret == 0);
1759 + assert("zam-433", atom != NULL);
1760 +
1761 + spin_unlock_atom(atom);
1762 +
1763 + } else {
1764 + assert("zam-425", get_current_super_private() != NULL);
1765 + sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
1766 + *start, *len);
1767 +
1768 + if (flags & BA_PERMANENT) {
1769 +			/* These blocks were counted as allocated; we have to
1770 +			 * revert that if the allocation is discarded. */
1771 + txn_atom *atom = get_current_atom_locked();
1772 + atom->nr_blocks_allocated -= *len;
1773 + spin_unlock_atom(atom);
1774 + }
1775 +
1776 + switch (target_stage) {
1777 + case BLOCK_NOT_COUNTED:
1778 + assert("vs-960", flags & BA_FORMATTED);
1779 + /* VITALY: This is what was grabbed for
1780 + internal/tx-lists/similar only */
1781 + used2free(sbinfo, *len);
1782 + break;
1783 +
1784 + case BLOCK_GRABBED:
1785 + used2grabbed(ctx, sbinfo, *len);
1786 + break;
1787 +
1788 + case BLOCK_UNALLOCATED:
1789 + used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1790 + break;
1791 +
1792 + case BLOCK_FLUSH_RESERVED:{
1793 + txn_atom *atom;
1794 +
1795 + atom = get_current_atom_locked();
1796 + used2flush_reserved(sbinfo, atom, *len,
1797 + flags & BA_FORMATTED);
1798 + spin_unlock_atom(atom);
1799 + break;
1800 + }
1801 + default:
1802 + impossible("zam-532", "wrong block stage");
1803 + }
1804 + }
1805 +
1806 + return 0;
1807 +}
1808 +
1809 +/* wrappers for block allocator plugin methods */
1810 +int reiser4_pre_commit_hook(void)
1811 +{
1812 + assert("zam-502", get_current_super_private() != NULL);
1813 + sa_pre_commit_hook();
1814 + return 0;
1815 +}
1816 +
1817 +/* an actor which applies delete set to block allocator data */
1818 +static int
1819 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1820 + const reiser4_block_nr * b, void *data UNUSED_ARG)
1821 +{
1822 + reiser4_context *ctx;
1823 + reiser4_super_info_data *sbinfo;
1824 +
1825 + __u64 len = 1;
1826 +
1827 + ctx = get_current_context();
1828 + sbinfo = get_super_private(ctx->super);
1829 +
1830 + assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1831 + assert("zam-552", sbinfo != NULL);
1832 +
1833 + if (b != NULL)
1834 + len = *b;
1835 +
1836 + if (REISER4_DEBUG) {
1837 + spin_lock_reiser4_super(sbinfo);
1838 +
1839 + assert("zam-554", *a < reiser4_block_count(ctx->super));
1840 + assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1841 +
1842 + spin_unlock_reiser4_super(sbinfo);
1843 + }
1844 +
1845 + sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1846 + /* adjust sb block counters */
1847 + used2free(sbinfo, len);
1848 + return 0;
1849 +}
1850 +
1851 +void reiser4_post_commit_hook(void)
1852 +{
1853 + txn_atom *atom;
1854 +
1855 + atom = get_current_atom_locked();
1856 + assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
1857 + spin_unlock_atom(atom);
1858 +
1859 + /* do the block deallocation which was deferred
1860 + until commit is done */
1861 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
1862 +
1863 + assert("zam-504", get_current_super_private() != NULL);
1864 + sa_post_commit_hook();
1865 +}
1866 +
1867 +void reiser4_post_write_back_hook(void)
1868 +{
1869 + assert("zam-504", get_current_super_private() != NULL);
1870 +
1871 + sa_post_commit_hook();
1872 +}
1873 +
1874 +/*
1875 + Local variables:
1876 + c-indentation-style: "K&R"
1877 + mode-name: "LC"
1878 + c-basic-offset: 8
1879 + tab-width: 8
1880 + fill-column: 120
1881 + scroll-step: 1
1882 + End:
1883 +*/
1884 diff -urN linux-2.6.33.orig/fs/reiser4/block_alloc.h linux-2.6.33/fs/reiser4/block_alloc.h
1885 --- linux-2.6.33.orig/fs/reiser4/block_alloc.h 1970-01-01 01:00:00.000000000 +0100
1886 +++ linux-2.6.33/fs/reiser4/block_alloc.h 2010-03-04 19:33:22.000000000 +0100
1887 @@ -0,0 +1,177 @@
1888 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1889 +
1890 +#if !defined(__FS_REISER4_BLOCK_ALLOC_H__)
1891 +#define __FS_REISER4_BLOCK_ALLOC_H__
1892 +
1893 +#include "dformat.h"
1894 +#include "forward.h"
1895 +
1896 +#include <linux/types.h> /* for __u?? */
1897 +#include <linux/fs.h>
1898 +
1899 +/* Mask which, when applied to a given block number, shows whether that
1900 +   block number is a fake one */
1901 +#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
1902 +/* Mask which isolates the type of object this fake block number was
1903 +   assigned to */
1904 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1905 +
1906 +/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1907 +   against these two values to determine whether the object is unallocated or
1908 +   a bitmap shadow object (WORKING BITMAP block, see plugin/space/bitmap.c) */
1909 +#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
1910 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
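+/* Sketch of how the masks above combine (an illustration, assuming
+   reiser4_block_nr is a 64-bit integer; the real test is
+   reiser4_blocknr_is_fake() in block_alloc.c):
+
+	fake	= (blk & REISER4_FAKE_BLOCKNR_BIT_MASK) != 0;
+	status	= blk & REISER4_BLOCKNR_STATUS_BIT_MASK;
+	unalloc	= (status == REISER4_UNALLOCATED_STATUS_VALUE);
+	bitmap	= (status == REISER4_BITMAP_BLOCKS_STATUS_VALUE);
+*/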
1911 +
1912 +/* specification of how block allocation was counted in sb block counters */
1913 +typedef enum {
1914 + BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
1915 + BLOCK_GRABBED = 1, /* free space grabbed for further allocation
1916 + of this block */
1917 + BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
1918 +	BLOCK_UNALLOCATED = 3,	/* block is used for existing in-memory object
1919 +				   (unallocated formatted or unformatted
1920 +				   node) */
1921 + BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
1922 + number assigned */
1923 +} block_stage_t;
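+/* The counter-adjusting helpers in block_alloc.c implement transitions
+   between these stages; see, for example, grabbed2free(), used2grabbed(),
+   grabbed2flush_reserved(), flush_reserved2grabbed() and used2free(). */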
1924 +
1925 +/* a hint for block allocator */
1926 +struct reiser4_blocknr_hint {
1927 + /* FIXME: I think we want to add a longterm lock on the bitmap block
1928 + here. This is to prevent jnode_flush() calls from interleaving
1929 + allocations on the same bitmap, once a hint is established. */
1930 +
1931 + /* search start hint */
1932 + reiser4_block_nr blk;
1933 + /* if not zero, it is a region size we search for free blocks in */
1934 + reiser4_block_nr max_dist;
1935 +	/* level for allocation; it may be useful to have the branch level and
1936 +	   higher write-optimized. */
1937 + tree_level level;
1938 + /* block allocator assumes that blocks, which will be mapped to disk,
1939 + are in this specified block_stage */
1940 + block_stage_t block_stage;
1941 +	/* If backward = 1, allocate blocks in the backward direction, from the
1942 +	 * end of the disk toward its beginning. */
1943 + unsigned int backward:1;
1944 +
1945 +};
1946 +
1947 +/* These flags control block allocation/deallocation behavior */
1948 +enum reiser4_ba_flags {
1949 +	/* do allocations from the reserved (5%) area */
1950 +	BA_RESERVED = (1 << 0),
1951 +
1952 +	/* block allocator can do a commit, trying to recover free space */
1953 +	BA_CAN_COMMIT = (1 << 1),
1954 +
1955 +	/* operation will be applied to a formatted block */
1956 +	BA_FORMATTED = (1 << 2),
1957 +
1958 +	/* defer actual block freeing until transaction commit */
1959 +	BA_DEFER = (1 << 3),
1960 +
1961 +	/* allocate blocks for permanent fs objects (formatted or unformatted),
1962 +	   not wandered or log blocks */
1963 +	BA_PERMANENT = (1 << 4),
1964 +
1965 +	/* grab space even if it was disabled */
1966 +	BA_FORCE = (1 << 5),
1967 +
1968 + /* use default start value for free blocks search. */
1969 + BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1970 +};
1971 +
1972 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1973 +
1974 +extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1975 +extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1976 +extern void update_blocknr_hint_default(const struct super_block *,
1977 + const reiser4_block_nr *);
1978 +extern void get_blocknr_hint_default(reiser4_block_nr *);
1979 +
1980 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1981 +
1982 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
1983 +reiser4_block_nr fake_blocknr_unformatted(int);
1984 +
1985 +/* free -> grabbed -> fake_allocated -> used */
1986 +
1987 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1988 +void all_grabbed2free(void);
1989 +void grabbed2free(reiser4_context * , reiser4_super_info_data * , __u64 count);
1990 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1991 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1992 +void grabbed2flush_reserved(__u64 count);
1993 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1994 + reiser4_block_nr * start,
1995 + reiser4_block_nr * len, reiser4_ba_flags_t flags);
1996 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
1997 + const reiser4_block_nr *,
1998 + block_stage_t, reiser4_ba_flags_t flags);
1999 +
2000 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
2001 + reiser4_block_nr * start,
2002 + reiser4_ba_flags_t flags)
2003 +{
2004 + reiser4_block_nr one = 1;
2005 + return reiser4_alloc_blocks(hint, start, &one, flags);
2006 +}
2007 +
2008 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
2009 + block_stage_t stage,
2010 + reiser4_ba_flags_t flags)
2011 +{
2012 + const reiser4_block_nr one = 1;
2013 + return reiser4_dealloc_blocks(block, &one, stage, flags);
2014 +}
2015 +
2016 +#define reiser4_grab_space_force(count, flags) \
2017 + reiser4_grab_space(count, flags | BA_FORCE)
2018 +
2019 +extern void grabbed2free_mark(__u64 mark);
2020 +extern int reiser4_grab_reserved(struct super_block *,
2021 + __u64, reiser4_ba_flags_t);
2022 +extern void reiser4_release_reserved(struct super_block *super);
2023 +
2024 +/* grabbed -> fake_allocated */
2025 +
2026 +/* fake_allocated -> used */
2027 +
2028 +/* used -> fake_allocated -> grabbed -> free */
2029 +
2030 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
2031 +
2032 +extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
2033 +
2034 +extern void grabbed2cluster_reserved(int count);
2035 +extern void cluster_reserved2grabbed(int count);
2036 +extern void cluster_reserved2free(int count);
2037 +
2038 +extern int reiser4_check_block_counters(const struct super_block *);
2039 +
2040 +#if REISER4_DEBUG
2041 +
2042 +extern void reiser4_check_block(const reiser4_block_nr *, int);
2043 +
2044 +#else
2045 +
2046 +# define reiser4_check_block(beg, val) noop
2047 +
2048 +#endif
2049 +
2050 +extern int reiser4_pre_commit_hook(void);
2051 +extern void reiser4_post_commit_hook(void);
2052 +extern void reiser4_post_write_back_hook(void);
2053 +
2054 +#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
2055 +
2056 +/* Make Linus happy.
2057 + Local variables:
2058 + c-indentation-style: "K&R"
2059 + mode-name: "LC"
2060 + c-basic-offset: 8
2061 + tab-width: 8
2062 + fill-column: 120
2063 + End:
2064 +*/
2065 diff -urN linux-2.6.33.orig/fs/reiser4/blocknrset.c linux-2.6.33/fs/reiser4/blocknrset.c
2066 --- linux-2.6.33.orig/fs/reiser4/blocknrset.c 1970-01-01 01:00:00.000000000 +0100
2067 +++ linux-2.6.33/fs/reiser4/blocknrset.c 2010-03-04 19:33:22.000000000 +0100
2068 @@ -0,0 +1,371 @@
2069 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
2070 +reiser4/README */
2071 +
2072 +/* This file contains code for various block number sets used by the atom to
2073 + track the deleted set and wandered block mappings. */
2074 +
2075 +#include "debug.h"
2076 +#include "dformat.h"
2077 +#include "txnmgr.h"
2078 +#include "context.h"
2079 +
2080 +#include <linux/slab.h>
2081 +
2082 +/* The proposed data structure for storing unordered block number sets is a
2083 +   list of elements, each of which contains an array of block numbers and/or
2084 +   an array of block number pairs. Such an element, called blocknr_set_entry,
2085 +   stores single block numbers from the beginning and extents from the end of
2086 +   its data field (char data[...]). The ->nr_singles and ->nr_pairs fields
2087 +   count the numbers of blocks and extents.
2088 +
2089 +   +-------------- blocknr_set_entry->data -------------+
2090 +   |block1|block2| ... <free space> ... |pair3|pair2|pair1|
2091 +   +-----------------------------------------------------+
2092 +
2093 +   When the current blocknr_set_entry is full, a new one is allocated. */
2094 +
2095 +/* Usage examples: blocknr sets are used in reiser4 for storing the atom's
2096 + * delete set (single blocks and block extents); in that case a blocknr pair
2097 + * represents an extent. The atom's wandered map is also stored as a blocknr
2098 + * set; pairs there represent a (real block) -> (wandered block) mapping. */
2099 +
2100 +/* Protection: blocknr sets belong to reiser4 atom, and
2101 + * their modifications are performed with the atom lock held */
2102 +
2103 +/* The total size of a blocknr_set_entry. */
2104 +#define BLOCKNR_SET_ENTRY_SIZE 128
2105 +
2106 +/* The number of block number slots that fit in the blocknr data area. */
2107 +#define BLOCKNR_SET_ENTRIES_NUMBER \
2108 + ((BLOCKNR_SET_ENTRY_SIZE - \
2109 + 2 * sizeof(unsigned) - \
2110 + sizeof(struct list_head)) / \
2111 + sizeof(reiser4_block_nr))
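+/* For example, with the common 64-bit layout (sizeof(unsigned) == 4,
+   sizeof(struct list_head) == 16, sizeof(reiser4_block_nr) == 8) this
+   evaluates to (128 - 8 - 16) / 8 = 13 slots per entry. */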
2112 +
2113 +/* An entry of the blocknr_set */
2114 +struct blocknr_set_entry {
2115 + unsigned nr_singles;
2116 + unsigned nr_pairs;
2117 + struct list_head link;
2118 + reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
2119 +};
2120 +
2121 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
2122 +struct blocknr_pair {
2123 + reiser4_block_nr a;
2124 + reiser4_block_nr b;
2125 +};
2126 +
2127 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
2128 +/* Audited by: green(2002.06.11) */
2129 +static unsigned bse_avail(blocknr_set_entry * bse)
2130 +{
2131 + unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
2132 +
2133 + assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
2134 + cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
2135 +
2136 + return BLOCKNR_SET_ENTRIES_NUMBER - used;
2137 +}
2138 +
2139 +/* Initialize a blocknr_set_entry. */
2140 +static void bse_init(blocknr_set_entry *bse)
2141 +{
2142 + bse->nr_singles = 0;
2143 + bse->nr_pairs = 0;
2144 + INIT_LIST_HEAD(&bse->link);
2145 +}
2146 +
2147 +/* Allocate and initialize a blocknr_set_entry. */
2148 +/* Audited by: green(2002.06.11) */
2149 +static blocknr_set_entry *bse_alloc(void)
2150 +{
2151 + blocknr_set_entry *e;
2152 +
2153 +	e = kmalloc(sizeof(blocknr_set_entry), reiser4_ctx_gfp_mask_get());
2154 +	if (e == NULL)
2155 +		return NULL;
2156 +
2157 + bse_init(e);
2158 +
2159 + return e;
2160 +}
2161 +
2162 +/* Free a blocknr_set_entry. */
2163 +/* Audited by: green(2002.06.11) */
2164 +static void bse_free(blocknr_set_entry * bse)
2165 +{
2166 + kfree(bse);
2167 +}
2168 +
2169 +/* Add a block number to a blocknr_set_entry */
2170 +/* Audited by: green(2002.06.11) */
2171 +static void
2172 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2173 +{
2174 + assert("jmacd-5099", bse_avail(bse) >= 1);
2175 +
2176 + bse->entries[bse->nr_singles++] = *block;
2177 +}
2178 +
2179 +/* Get a pair of block numbers */
2180 +/* Audited by: green(2002.06.11) */
2181 +static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
2182 + unsigned pno)
2183 +{
2184 + assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2185 +
2186 + return (struct blocknr_pair *) (bse->entries +
2187 + BLOCKNR_SET_ENTRIES_NUMBER -
2188 + 2 * (pno + 1));
2189 +}
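+/* Example (assuming the 13-slot layout computed above): pair 0 occupies
+   entries[11..12], pair 1 occupies entries[9..10], and so on, growing
+   downward toward the singles stored from entries[0] upward. */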
2190 +
2191 +/* Add a pair of block numbers to a blocknr_set_entry */
2192 +/* Audited by: green(2002.06.11) */
2193 +static void
2194 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2195 + const reiser4_block_nr * b)
2196 +{
2197 + struct blocknr_pair *pair;
2198 +
2199 + assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2200 +
2201 + pair = bse_get_pair(bse, bse->nr_pairs++);
2202 +
2203 + pair->a = *a;
2204 + pair->b = *b;
2205 +}
2206 +
2207 +/* Add either a block or pair of blocks to the block number set. The first
2208 + blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2209 + @b is non-NULL a pair is added. The block number set belongs to atom, and
2210 + the call is made with the atom lock held. There may not be enough space in
2211 + the current blocknr_set_entry. If new_bsep points to a non-NULL
2212 + blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2213 + will be set to NULL. If new_bsep contains NULL then the atom lock will be
2214 + released and a new bse will be allocated in new_bsep. E_REPEAT will be
2215 + returned with the atom unlocked for the operation to be tried again. If
2216 + the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2217 + used during the call, it will be freed automatically. */
2218 +static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2219 + blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2220 + const reiser4_block_nr *b)
2221 +{
2222 + blocknr_set_entry *bse;
2223 + unsigned entries_needed;
2224 +
2225 + assert("jmacd-5101", a != NULL);
2226 +
2227 + entries_needed = (b == NULL) ? 1 : 2;
2228 + if (list_empty(bset) ||
2229 + bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2230 + /* See if a bse was previously allocated. */
2231 + if (*new_bsep == NULL) {
2232 + spin_unlock_atom(atom);
2233 + *new_bsep = bse_alloc();
2234 + return (*new_bsep != NULL) ? -E_REPEAT :
2235 + RETERR(-ENOMEM);
2236 + }
2237 +
2238 + /* Put it on the head of the list. */
2239 + list_add(&((*new_bsep)->link), bset);
2240 +
2241 + *new_bsep = NULL;
2242 + }
2243 +
2244 + /* Add the single or pair. */
2245 + bse = list_entry(bset->next, blocknr_set_entry, link);
2246 + if (b == NULL) {
2247 + bse_put_single(bse, a);
2248 + } else {
2249 + bse_put_pair(bse, a, b);
2250 + }
2251 +
2252 + /* If new_bsep is non-NULL then there was an allocation race, free this
2253 + copy. */
2254 + if (*new_bsep != NULL) {
2255 + bse_free(*new_bsep);
2256 + *new_bsep = NULL;
2257 + }
2258 +
2259 + return 0;
2260 +}
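+/* Caller-side sketch of the retry protocol described above; compare the
+   deferred-deallocation loop in reiser4_dealloc_blocks():
+
+	blocknr_set_entry *bsep = NULL;
+
+	do {
+		atom = get_current_atom_locked();
+		ret = blocknr_set_add_extent(atom, &atom->delete_set,
+					     &bsep, start, len);
+	} while (ret == -E_REPEAT);
+
+   On -E_REPEAT the atom was unlocked and a fresh entry was allocated into
+   bsep, so the loop retakes the atom lock and tries again. */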
2261 +
2262 +/* Add an extent to the block set. If the length is 1, it is treated as a
2263 + single block (e.g., reiser4_set_add_block). */
2264 +/* Audited by: green(2002.06.11) */
2265 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2266 + kmalloc might schedule. The only exception is atom spinlock, which is
2267 +   properly released. */
2268 +int
2269 +blocknr_set_add_extent(txn_atom * atom,
2270 + struct list_head *bset,
2271 + blocknr_set_entry ** new_bsep,
2272 + const reiser4_block_nr * start,
2273 + const reiser4_block_nr * len)
2274 +{
2275 + assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2276 + return blocknr_set_add(atom, bset, new_bsep, start,
2277 + *len == 1 ? NULL : len);
2278 +}
2279 +
2280 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
2281 + * by an assertion that both arguments are not null.*/
2282 +/* Audited by: green(2002.06.11) */
2283 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2284 + kmalloc might schedule. The only exception is atom spinlock, which is
2285 +   properly released. */
2286 +int
2287 +blocknr_set_add_pair(txn_atom * atom,
2288 + struct list_head *bset,
2289 + blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2290 + const reiser4_block_nr * b)
2291 +{
2292 + assert("jmacd-5103", a != NULL && b != NULL);
2293 + return blocknr_set_add(atom, bset, new_bsep, a, b);
2294 +}
2295 +
2296 +/* Initialize a blocknr_set. */
2297 +void blocknr_set_init(struct list_head *bset)
2298 +{
2299 + INIT_LIST_HEAD(bset);
2300 +}
2301 +
2302 +/* Release the entries of a blocknr_set. */
2303 +void blocknr_set_destroy(struct list_head *bset)
2304 +{
2305 + blocknr_set_entry *bse;
2306 +
2307 + while (!list_empty(bset)) {
2308 + bse = list_entry(bset->next, blocknr_set_entry, link);
2309 + list_del_init(&bse->link);
2310 + bse_free(bse);
2311 + }
2312 +}
2313 +
2314 +/* Merge blocknr_set entries out of @from into @into. */
2315 +/* Audited by: green(2002.06.11) */
2316 +/* Auditor comments: This merge does not know whether the merged sets contain
2317 +   block pairs (as in wandered sets) or extents, so it cannot really merge
2318 +   overlapping ranges if there are any. I believe this may lead to some
2319 +   blocks appearing several times in one blocknr_set. To help debug such
2320 +   problems it might help to check for duplicate entries during the actual
2321 +   processing of the set. Testing this right here is complicated by the fact
2322 +   that these sets are not sorted, and going through the whole set on each
2323 +   element addition would be a CPU-heavy task. */
2324 +void blocknr_set_merge(struct list_head *from, struct list_head *into)
2325 +{
2326 + blocknr_set_entry *bse_into = NULL;
2327 +
2328 + /* If @from is empty, no work to perform. */
2329 + if (list_empty(from))
2330 + return;
2331 + /* If @into is not empty, try merging partial-entries. */
2332 + if (!list_empty(into)) {
2333 +
2334 +		/* Neither set is empty, pop the front two members and try to
2335 +		   combine them. */
2336 + blocknr_set_entry *bse_from;
2337 + unsigned into_avail;
2338 +
2339 + bse_into = list_entry(into->next, blocknr_set_entry, link);
2340 + list_del_init(&bse_into->link);
2341 + bse_from = list_entry(from->next, blocknr_set_entry, link);
2342 + list_del_init(&bse_from->link);
2343 +
2344 + /* Combine singles. */
2345 + for (into_avail = bse_avail(bse_into);
2346 + into_avail != 0 && bse_from->nr_singles != 0;
2347 + into_avail -= 1) {
2348 + bse_put_single(bse_into,
2349 + &bse_from->entries[--bse_from->
2350 + nr_singles]);
2351 + }
2352 +
2353 + /* Combine pairs. */
2354 + for (; into_avail > 1 && bse_from->nr_pairs != 0;
2355 + into_avail -= 2) {
2356 + struct blocknr_pair *pair =
2357 + bse_get_pair(bse_from, --bse_from->nr_pairs);
2358 + bse_put_pair(bse_into, &pair->a, &pair->b);
2359 + }
2360 +
2361 + /* If bse_from is empty, delete it now. */
2362 + if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2363 + bse_free(bse_from);
2364 + } else {
2365 + /* Otherwise, bse_into is full or nearly full (e.g.,
2366 + it could have one slot avail and bse_from has one
2367 + pair left). Push it back onto the list. bse_from
2368 + becomes bse_into, which will be the new partial. */
2369 + list_add(&bse_into->link, into);
2370 + bse_into = bse_from;
2371 + }
2372 + }
2373 +
2374 + /* Splice lists together. */
2375 + list_splice_init(from, into->prev);
2376 +
2377 + /* Add the partial entry back to the head of the list. */
2378 + if (bse_into != NULL)
2379 + list_add(&bse_into->link, into);
2380 +}
2381 +
2382 +/* Iterate over all blocknr set elements. */
2383 +int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2384 + blocknr_set_actor_f actor, void *data, int delete)
2385 +{
2386 +
2387 + blocknr_set_entry *entry;
2388 +
2389 + assert("zam-429", atom != NULL);
2390 + assert("zam-430", atom_is_protected(atom));
2391 +	assert("zam-431", bset != NULL);
2392 + assert("zam-432", actor != NULL);
2393 +
2394 + entry = list_entry(bset->next, blocknr_set_entry, link);
2395 + while (bset != &entry->link) {
2396 + blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2397 + unsigned int i;
2398 + int ret;
2399 +
2400 + for (i = 0; i < entry->nr_singles; i++) {
2401 + ret = actor(atom, &entry->entries[i], NULL, data);
2402 +
2403 +			/* We can't break out of the loop if the delete flag is set. */
2404 + if (ret != 0 && !delete)
2405 + return ret;
2406 + }
2407 +
2408 + for (i = 0; i < entry->nr_pairs; i++) {
2409 + struct blocknr_pair *ab;
2410 +
2411 + ab = bse_get_pair(entry, i);
2412 +
2413 + ret = actor(atom, &ab->a, &ab->b, data);
2414 +
2415 + if (ret != 0 && !delete)
2416 + return ret;
2417 + }
2418 +
2419 + if (delete) {
2420 + list_del(&entry->link);
2421 + bse_free(entry);
2422 + }
2423 +
2424 + entry = tmp;
2425 + }
2426 +
2427 + return 0;
2428 +}
2429 +
2430 +/*
2431 + * Local variables:
2432 + * c-indentation-style: "K&R"
2433 + * mode-name: "LC"
2434 + * c-basic-offset: 8
2435 + * tab-width: 8
2436 + * fill-column: 79
2437 + * scroll-step: 1
2438 + * End:
2439 + */
2440 diff -urN linux-2.6.33.orig/fs/reiser4/carry.c linux-2.6.33/fs/reiser4/carry.c
2441 --- linux-2.6.33.orig/fs/reiser4/carry.c 1970-01-01 01:00:00.000000000 +0100
2442 +++ linux-2.6.33/fs/reiser4/carry.c 2010-03-04 19:33:22.000000000 +0100
2443 @@ -0,0 +1,1398 @@
2444 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
2445 + reiser4/README */
2446 +/* Functions to "carry" tree modification(s) upward. */
2447 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2448 + set of changes that need to be propagated to the next level. We manage
2449 + node locking such that any searches that collide with carrying are
2450 + restarted, from the root if necessary.
2451 +
2452 + Insertion of a new item may result in items being moved among nodes and
2453 + this requires the delimiting key to be updated at the least common parent
2454 + of the nodes modified to preserve search tree invariants. Also, insertion
2455 + may require allocation of a new node. A pointer to the new node has to be
2456 + inserted into some node on the parent level, etc.
2457 +
2458 + Tree carrying is meant to be analogous to arithmetic carrying.
2459 +
2460 + A carry operation is always associated with some node (&carry_node).
2461 +
2462 + Carry process starts with some initial set of operations to be performed
2463 + and an initial set of already locked nodes. Operations are performed one
2464 + by one. Performing each single operation has following possible effects:
2465 +
2466 + - content of carry node associated with operation is modified
2467 + - new carry nodes are locked and involved into carry process on this level
2468 + - new carry operations are posted to the next level
2469 +
2470 + After all carry operations on this level are done, process is repeated for
2471 + the accumulated sequence on carry operations for the next level. This
2472 + starts by trying to lock (in left to right order) all carry nodes
2473 + associated with carry operations on the parent level. After this, we decide
2474 + whether more nodes are required on the left of already locked set. If so,
2475 + all locks taken on the parent level are released, new carry nodes are
2476 + added, and locking process repeats.
2477 +
2478 +   It may happen that the balancing process fails owing to an unrecoverable
2479 +   error on some upper level of the tree (possible causes are an io error or
2480 +   a failure to allocate a new node). In this case we should unmount the
2481 +   filesystem, rebooting if it is the root, and possibly advise running fsck.
2482 +
2483 + USAGE:
2484 +
2485 + int some_tree_operation( znode *node, ... )
2486 + {
2487 + // Allocate on a stack pool of carry objects: operations and nodes.
2488 + // Most carry processes will only take objects from here, without
2489 + // dynamic allocation.
2490 +
2491 +I feel uneasy about this pool. It adds to code complexity, I understand why it
2492 +exists, but.... -Hans
2493 +
2494 + carry_pool pool;
2495 + carry_level lowest_level;
2496 + carry_op *op;
2497 +
2498 + init_carry_pool( &pool );
2499 + init_carry_level( &lowest_level, &pool );
2500 +
2501 + // operation may be one of:
2502 + // COP_INSERT --- insert new item into node
2503 + // COP_CUT --- remove part of or whole node
2504 + // COP_PASTE --- increase size of item
2505 + // COP_DELETE --- delete pointer from parent node
2506 + // COP_UPDATE --- update delimiting key in least
2507 + // common ancestor of two
2508 +
2509 + op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2510 + if( IS_ERR( op ) || ( op == NULL ) ) {
2511 + handle error
2512 + } else {
2513 + // fill in remaining fields in @op, according to carry.h:carry_op
2514 + result = carry(&lowest_level, NULL);
2515 + }
2516 + done_carry_pool(&pool);
2517 + }
2518 +
2519 + When you are implementing node plugin method that participates in carry
2520 + (shifting, insertion, deletion, etc.), do the following:
2521 +
2522 + int foo_node_method(znode * node, ..., carry_level * todo)
2523 + {
2524 + carry_op *op;
2525 +
2526 + ....
2527 +
2528 + // note, that last argument to reiser4_post_carry() is non-null
2529 + // here, because @op is to be applied to the parent of @node, rather
2530 + // than to the @node itself as in the previous case.
2531 +
2532 + op = node_post_carry(todo, operation, node, 1);
2533 + // fill in remaining fields in @op, according to carry.h:carry_op
2534 +
2535 + ....
2536 +
2537 + }
2538 +
2539 + BATCHING:
2540 +
2541 +   One of the main advantages of the level-by-level balancing implemented here
2542 +   is the ability to batch updates on a parent level and to perform them more
2543 +   efficiently as a result.
2544 +
2545 + Description To Be Done (TBD).
2546 +
2547 + DIFFICULTIES AND SUBTLE POINTS:
2548 +
2549 + 1. complex plumbing is required, because:
2550 +
2551 + a. effective allocation through pools is needed
2552 +
2553 + b. target of operation is not exactly known when operation is
2554 + posted. This is worked around through bitfields in &carry_node and
2555 + logic in lock_carry_node()
2556 +
2557 + c. of interaction with locking code: node should be added into sibling
2558 + list when pointer to it is inserted into its parent, which is some time
2559 + after node was created. Between these moments, node is somewhat in
2560 + suspended state and is only registered in the carry lists
2561 +
2562 + 2. whole balancing logic is implemented here, in particular, insertion
2563 + logic is coded in make_space().
2564 +
2565 + 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2566 + (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2567 + (insert_paste()) have to be handled.
2568 +
2569 + 4. there is non-trivial interdependency between allocation of new nodes
2570 + and almost everything else. This is mainly due to the (1.c) above. I shall
2571 + write about this later.
2572 +
2573 +*/
2574 +
2575 +#include "forward.h"
2576 +#include "debug.h"
2577 +#include "key.h"
2578 +#include "coord.h"
2579 +#include "plugin/item/item.h"
2580 +#include "plugin/item/extent.h"
2581 +#include "plugin/node/node.h"
2582 +#include "jnode.h"
2583 +#include "znode.h"
2584 +#include "tree_mod.h"
2585 +#include "tree_walk.h"
2586 +#include "block_alloc.h"
2587 +#include "pool.h"
2588 +#include "tree.h"
2589 +#include "carry.h"
2590 +#include "carry_ops.h"
2591 +#include "super.h"
2592 +#include "reiser4.h"
2593 +
2594 +#include <linux/types.h>
2595 +
2596 +/* level locking/unlocking */
2597 +static int lock_carry_level(carry_level * level);
2598 +static void unlock_carry_level(carry_level * level, int failure);
2599 +static void done_carry_level(carry_level * level);
2600 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2601 +
2602 +int lock_carry_node(carry_level * level, carry_node * node);
2603 +int lock_carry_node_tail(carry_node * node);
2604 +
2605 +/* carry processing proper */
2606 +static int carry_on_level(carry_level * doing, carry_level * todo);
2607 +
2608 +static carry_op *add_op(carry_level * level, pool_ordering order,
2609 + carry_op * reference);
2610 +
2611 +/* handlers for carry operations. */
2612 +
2613 +static void fatal_carry_error(carry_level * doing, int ecode);
2614 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2615 +
2616 +static void print_level(const char *prefix, carry_level * level);
2617 +
2618 +#if REISER4_DEBUG
2619 +typedef enum {
2620 + CARRY_TODO,
2621 + CARRY_DOING
2622 +} carry_queue_state;
2623 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
2624 +#endif
2625 +
2626 +/* main entry point for tree balancing.
2627 +
2628 + Tree carry performs operations from @doing and while doing so accumulates
2629 + information about operations to be performed on the next level ("carried"
2630 + to the parent level). Carried operations are performed, causing possibly
2631 +   more operations to be carried upward, etc. carry() takes care of
2632 +   locking and pinning znodes while operating on them.
2633 +
2634 + For usage, see comment at the top of fs/reiser4/carry.c
2635 +
2636 +*/
2637 +int reiser4_carry(carry_level * doing /* set of carry operations to be
2638 + * performed */ ,
2639 + carry_level * done /* set of nodes, already performed
2640 + * at the previous level.
2641 + * NULL in most cases */)
2642 +{
2643 + int result = 0;
2644 + /* queue of new requests */
2645 + carry_level *todo;
2646 + ON_DEBUG(STORE_COUNTERS);
2647 +
2648 + assert("nikita-888", doing != NULL);
2649 + BUG_ON(done != NULL);
2650 +
2651 + todo = doing + 1;
2652 + init_carry_level(todo, doing->pool);
2653 +
2654 +	/* queue of requests performed on the previous level */
2655 + done = todo + 1;
2656 + init_carry_level(done, doing->pool);
2657 +
2658 + /* iterate until there is nothing more to do */
2659 + while (result == 0 && doing->ops_num > 0) {
2660 + carry_level *tmp;
2661 +
2662 + /* at this point @done is locked. */
2663 + /* repeat lock/do/unlock while
2664 +
2665 + (1) lock_carry_level() fails due to deadlock avoidance, or
2666 +
2667 + (2) carry_on_level() decides that more nodes have to
2668 + be involved.
2669 +
2670 + (3) some unexpected error occurred while balancing on the
2671 + upper levels. In this case all changes are rolled back.
2672 +
2673 + */
2674 + while (1) {
2675 + result = lock_carry_level(doing);
2676 + if (result == 0) {
2677 + /* perform operations from @doing and
2678 + accumulate new requests in @todo */
2679 + result = carry_on_level(doing, todo);
2680 + if (result == 0)
2681 + break;
2682 + else if (result != -E_REPEAT ||
2683 + !doing->restartable) {
2684 + warning("nikita-1043",
2685 + "Fatal error during carry: %i",
2686 + result);
2687 + print_level("done", done);
2688 + print_level("doing", doing);
2689 + print_level("todo", todo);
2690 + /* do some rough stuff like aborting
2691 + all pending transcrashes and thus
2692 + pushing tree back to the consistent
2693 +				   state. Alternatively, just panic.
2694 + */
2695 + fatal_carry_error(doing, result);
2696 + return result;
2697 + }
2698 + } else if (result != -E_REPEAT) {
2699 + fatal_carry_error(doing, result);
2700 + return result;
2701 + }
2702 + unlock_carry_level(doing, 1);
2703 + }
2704 + /* at this point @done can be safely unlocked */
2705 + done_carry_level(done);
2706 +
2707 + /* cyclically shift queues */
2708 + tmp = done;
2709 + done = doing;
2710 + doing = todo;
2711 + todo = tmp;
2712 + init_carry_level(todo, doing->pool);
2713 +
2714 + /* give other threads chance to run */
2715 + reiser4_preempt_point();
2716 + }
2717 + done_carry_level(done);
2718 +
2719 +	/* all counters but x_refs should remain the same; x_refs can change
2720 +	   owing to the transaction manager */
2721 + ON_DEBUG(CHECK_COUNTERS);
2722 + return result;
2723 +}
2724 +
2725 +/* perform carry operations on given level.
2726 +
2727 + Optimizations proposed by pooh:
2728 +
2729 + (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2730 + required;
2731 +
2732 + (2) unlock node if there are no more operations to be performed upon it and
2733 + node didn't add any operation to @todo. This can be implemented by
2734 +   attaching to each node two counters: a counter of operations working on
2735 +   this node and a counter of operations carried upward from this node.
2736 +
2737 +*/
2738 +static int carry_on_level(carry_level * doing /* queue of carry operations to
2739 + * do on this level */ ,
2740 + carry_level * todo /* queue where new carry
2741 + * operations to be performed on
2742 +						* the parent level are
2743 + * accumulated during @doing
2744 + * processing. */ )
2745 +{
2746 + int result;
2747 + int (*f) (carry_op *, carry_level *, carry_level *);
2748 + carry_op *op;
2749 + carry_op *tmp_op;
2750 +
2751 + assert("nikita-1034", doing != NULL);
2752 + assert("nikita-1035", todo != NULL);
2753 +
2754 + /* @doing->nodes are locked. */
2755 +
2756 + /* This function can be split into two phases: analysis and modification
2757 +
2758 + Analysis calculates precisely what items should be moved between
2759 + nodes. This information is gathered in some structures attached to
2760 + each carry_node in a @doing queue. Analysis also determines whether
2761 + new nodes are to be allocated etc.
2762 +
2763 + After analysis is completed, actual modification is performed. Here
2764 + we can take advantage of "batch modification": if there are several
2765 + operations acting on the same node, modifications can be performed
2766 + more efficiently when batched together.
2767 +
2768 + Above is an optimization left for the future.
2769 + */
2770 + /* Important, but delayed optimization: it's possible to batch
2771 + operations together and perform them more efficiently as a
2772 + result. For example, deletion of several neighboring items from a
2773 + node can be converted to a single ->cut() operation.
2774 +
2775 + Before processing queue, it should be scanned and "mergeable"
2776 + operations merged.
2777 + */
2778 + result = 0;
2779 + for_all_ops(doing, op, tmp_op) {
2780 + carry_opcode opcode;
2781 +
2782 + assert("nikita-1041", op != NULL);
2783 + opcode = op->op;
2784 + assert("nikita-1042", op->op < COP_LAST_OP);
2785 + f = op_dispatch_table[op->op].handler;
2786 + result = f(op, doing, todo);
2787 + /* locking can fail with -E_REPEAT. Any different error is fatal
2788 + and will be handled by fatal_carry_error() sledgehammer.
2789 + */
2790 + if (result != 0)
2791 + break;
2792 + }
2793 + if (result == 0) {
2794 + carry_plugin_info info;
2795 + carry_node *scan;
2796 + carry_node *tmp_scan;
2797 +
2798 + info.doing = doing;
2799 + info.todo = todo;
2800 +
2801 + assert("nikita-3002",
2802 + carry_level_invariant(doing, CARRY_DOING));
2803 + for_all_nodes(doing, scan, tmp_scan) {
2804 + znode *node;
2805 +
2806 + node = reiser4_carry_real(scan);
2807 + assert("nikita-2547", node != NULL);
2808 + if (node_is_empty(node)) {
2809 + result =
2810 + node_plugin_by_node(node)->
2811 + prepare_removal(node, &info);
2812 + if (result != 0)
2813 + break;
2814 + }
2815 + }
2816 + }
2817 + return result;
2818 +}
2819 +
2820 +/* post carry operation
2821 +
2822 + This is main function used by external carry clients: node layout plugins
2823 + and tree operations to create new carry operation to be performed on some
2824 + level.
2825 +
2826 + New operation will be included in the @level queue. To actually perform it,
2827 +   call carry( level, ... ). This function expects @node to be write-locked;
2828 +   carry manages all its other locks by itself, don't worry about this.
2829 +
2830 + This function adds operation and node at the end of the queue. It is up to
2831 + caller to guarantee proper ordering of node queue.
2832 +
2833 +*/
2834 +carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2835 + * is to be posted at */ ,
2836 + carry_opcode op /* opcode of operation */ ,
2837 + znode * node /* node on which this operation
2838 + * will operate */ ,
2839 + int apply_to_parent_p /* whether operation will
2840 + * operate directly on @node
2841 +					      * or on its parent. */)
2842 +{
2843 + carry_op *result;
2844 + carry_node *child;
2845 +
2846 + assert("nikita-1046", level != NULL);
2847 + assert("nikita-1788", znode_is_write_locked(node));
2848 +
2849 + result = add_op(level, POOLO_LAST, NULL);
2850 + if (IS_ERR(result))
2851 + return result;
2852 + child = reiser4_add_carry(level, POOLO_LAST, NULL);
2853 + if (IS_ERR(child)) {
2854 + reiser4_pool_free(&level->pool->op_pool, &result->header);
2855 + return (carry_op *) child;
2856 + }
2857 + result->node = child;
2858 + result->op = op;
2859 + child->parent = apply_to_parent_p;
2860 + if (ZF_ISSET(node, JNODE_ORPHAN))
2861 + child->left_before = 1;
2862 + child->node = node;
2863 + return result;
2864 +}
2865 +
2866 +/* initialize carry queue */
2867 +void init_carry_level(carry_level * level /* level to initialize */ ,
2868 + carry_pool * pool /* pool @level will allocate objects
2869 + * from */ )
2870 +{
2871 + assert("nikita-1045", level != NULL);
2872 + assert("nikita-967", pool != NULL);
2873 +
2874 + memset(level, 0, sizeof *level);
2875 + level->pool = pool;
2876 +
2877 + INIT_LIST_HEAD(&level->nodes);
2878 + INIT_LIST_HEAD(&level->ops);
2879 +}
2880 +
2881 +/* allocate carry pool and initialize pools within queue */
2882 +carry_pool *init_carry_pool(int size)
2883 +{
2884 + carry_pool *pool;
2885 +
2886 + assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2887 + pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2888 + if (pool == NULL)
2889 + return ERR_PTR(RETERR(-ENOMEM));
2890 +
2891 + reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2892 + (char *)pool->op);
2893 + reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2894 + NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2895 + return pool;
2896 +}
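+/* Usage sketch, inferred from the size assertion above and from the
+   "todo = doing + 1; done = todo + 1" layout in reiser4_carry(): callers
+   allocate room for three carry_levels right behind the pool itself:
+
+	carry_pool *pool;
+	carry_level *lowest_level;
+
+	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+	lowest_level = (carry_level *) (pool + 1);
+	init_carry_level(lowest_level, pool);
+	... post carry operations, call reiser4_carry(lowest_level, NULL) ...
+	done_carry_pool(pool);
+*/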
2897 +
2898 +/* finish with queue pools */
2899 +void done_carry_pool(carry_pool * pool/* pool to destroy */)
2900 +{
2901 + reiser4_done_pool(&pool->op_pool);
2902 + reiser4_done_pool(&pool->node_pool);
2903 + kfree(pool);
2904 +}
2905 +
2906 +/* add new carry node to the @level.
2907 +
2908 + Returns pointer to the new carry node allocated from pool. It's up to
2909 + callers to maintain proper order in the @level. Assumption is that if carry
2910 +   nodes on one level are already sorted and modifications are performed from
2911 + left to right, carry nodes added on the parent level will be ordered
2912 + automatically. To control ordering use @order and @reference parameters.
2913 +
2914 +*/
2915 +carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2916 + * node to */ ,
2917 + pool_ordering order /* where to insert:
2918 + * at the beginning of
2919 + * @level,
2920 + * before @reference,
2921 + * after @reference,
2922 + * at the end of @level
2923 + */ ,
2924 + carry_node * reference/* reference node for
2925 + * insertion */)
2926 +{
2927 + ON_DEBUG(carry_node * orig_ref = reference);
2928 +
2929 + if (order == POOLO_BEFORE) {
2930 + reference = find_left_carry(reference, level);
2931 + if (reference == NULL)
2932 + reference = list_entry(level->nodes.next, carry_node,
2933 + header.level_linkage);
2934 + else
2935 + reference = list_entry(reference->header.level_linkage.next,
2936 + carry_node, header.level_linkage);
2937 + } else if (order == POOLO_AFTER) {
2938 + reference = find_right_carry(reference, level);
2939 + if (reference == NULL)
2940 + reference = list_entry(level->nodes.prev, carry_node,
2941 + header.level_linkage);
2942 + else
2943 + reference = list_entry(reference->header.level_linkage.prev,
2944 + carry_node, header.level_linkage);
2945 + }
2946 + assert("nikita-2209",
2947 + ergo(orig_ref != NULL,
2948 + reiser4_carry_real(reference) ==
2949 + reiser4_carry_real(orig_ref)));
2950 + return reiser4_add_carry(level, order, reference);
2951 +}
2952 +
2953 +carry_node *reiser4_add_carry(carry_level * level, /* carry_level to add
2954 + node to */
2955 + pool_ordering order, /* where to insert:
2956 + * at the beginning of
2957 + * @level;
2958 + * before @reference;
2959 + * after @reference;
2960 + * at the end of @level
2961 + */
2962 + carry_node * reference /* reference node for
2963 + * insertion */)
2964 +{
2965 + carry_node *result;
2966 +
2967 + result =
2968 + (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2969 + &level->nodes,
2970 + order, &reference->header);
2971 + if (!IS_ERR(result) && (result != NULL))
2972 + ++level->nodes_num;
2973 + return result;
2974 +}
2975 +
2976 +/**
2977 + * add new carry operation to the @level.
2978 + *
2979 + * Returns pointer to the new carry operations allocated from pool. It's up to
2980 + * callers to maintain proper order in the @level. To control ordering use
2981 + * @order and @reference parameters.
2982 + */
2983 +static carry_op *add_op(carry_level * level, /* &carry_level to add node to */
2984 + pool_ordering order, /* where to insert:
2985 + * at the beginning of @level;
2986 + * before @reference;
2987 + * after @reference;
2988 + * at the end of @level */
2989 + carry_op * reference /* reference node for insertion */)
2990 +{
2991 + carry_op *result;
2992 +
2993 + result =
2994 + (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2995 + order, &reference->header);
2996 + if (!IS_ERR(result) && (result != NULL))
2997 + ++level->ops_num;
2998 + return result;
2999 +}
3000 +
3001 +/**
3002 + * Return node on the right of which @node was created.
3003 + *
3004 + * Each node is created on the right of some existing node (or it is a new
3005 + * root, which is a special case not handled here).
3006 + *
3007 + * @node is new node created on some level, but not yet inserted into its
3008 + * parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
3009 + */
3010 +static carry_node *find_begetting_brother(carry_node * node,/* node to start
3011 + search from */
3012 + carry_level * kin UNUSED_ARG
3013 + /* level to scan */)
3014 +{
3015 + carry_node *scan;
3016 +
3017 + assert("nikita-1614", node != NULL);
3018 + assert("nikita-1615", kin != NULL);
3019 + assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
3020 + assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
3021 + ZF_ISSET(reiser4_carry_real(node),
3022 + JNODE_ORPHAN)));
3023 + for (scan = node;;
3024 + scan = list_entry(scan->header.level_linkage.prev, carry_node,
3025 + header.level_linkage)) {
3026 + assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
3027 + if ((scan->node != node->node) &&
3028 + !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
3029 + assert("nikita-1618", reiser4_carry_real(scan) != NULL);
3030 + break;
3031 + }
3032 + }
3033 + return scan;
3034 +}
3035 +
3036 +static cmp_t
3037 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
3038 +{
3039 + assert("nikita-2199", n1 != NULL);
3040 + assert("nikita-2200", n2 != NULL);
3041 +
3042 + if (n1 == n2)
3043 + return EQUAL_TO;
3044 + while (1) {
3045 + n1 = carry_node_next(n1);
3046 + if (carry_node_end(level, n1))
3047 + return GREATER_THAN;
3048 + if (n1 == n2)
3049 + return LESS_THAN;
3050 + }
3051 + impossible("nikita-2201", "End of level reached");
3052 +}
3053 +
3054 +carry_node *find_carry_node(carry_level * level, const znode * node)
3055 +{
3056 + carry_node *scan;
3057 + carry_node *tmp_scan;
3058 +
3059 + assert("nikita-2202", level != NULL);
3060 + assert("nikita-2203", node != NULL);
3061 +
3062 + for_all_nodes(level, scan, tmp_scan) {
3063 + if (reiser4_carry_real(scan) == node)
3064 + return scan;
3065 + }
3066 + return NULL;
3067 +}
3068 +
3069 +znode *reiser4_carry_real(const carry_node * node)
3070 +{
3071 + assert("nikita-3061", node != NULL);
3072 +
3073 + return node->lock_handle.node;
3074 +}
3075 +
3076 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
3077 + const znode * node)
3078 +{
3079 + carry_node *base;
3080 + carry_node *scan;
3081 + carry_node *tmp_scan;
3082 + carry_node *proj;
3083 +
3084 + base = find_carry_node(doing, node);
3085 + assert("nikita-2204", base != NULL);
3086 +
3087 + for_all_nodes(todo, scan, tmp_scan) {
3088 + proj = find_carry_node(doing, scan->node);
3089 + assert("nikita-2205", proj != NULL);
3090 + if (carry_node_cmp(doing, proj, base) != LESS_THAN)
3091 + break;
3092 + }
3093 + return scan;
3094 +}
3095 +
3096 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
3097 + znode * node)
3098 +{
3099 + carry_node *reference;
3100 +
3101 + assert("nikita-2994", doing != NULL);
3102 + assert("nikita-2995", todo != NULL);
3103 + assert("nikita-2996", node != NULL);
3104 +
3105 + reference = insert_carry_node(doing, todo, node);
3106 + assert("nikita-2997", reference != NULL);
3107 +
3108 + return reiser4_add_carry(todo, POOLO_BEFORE, reference);
3109 +}
3110 +
3111 +/* like reiser4_post_carry(), but designed to be called from node plugin
3112 + methods. This function is different from reiser4_post_carry() in that it
3113 + finds proper place to insert node in the queue. */
3114 +carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
3115 + * passed down to node
3116 + * plugin */ ,
3117 + carry_opcode op /* opcode of operation */ ,
3118 + znode * node /* node on which this
3119 + * operation will operate */ ,
3120 + int apply_to_parent_p /* whether operation will
3121 + * operate directly on @node
3122 +				  * or on its parent. */ )
3123 +{
3124 + carry_op *result;
3125 + carry_node *child;
3126 +
3127 + assert("nikita-2207", info != NULL);
3128 + assert("nikita-2208", info->todo != NULL);
3129 +
3130 + if (info->doing == NULL)
3131 + return reiser4_post_carry(info->todo, op, node,
3132 + apply_to_parent_p);
3133 +
3134 + result = add_op(info->todo, POOLO_LAST, NULL);
3135 + if (IS_ERR(result))
3136 + return result;
3137 + child = add_carry_atplace(info->doing, info->todo, node);
3138 + if (IS_ERR(child)) {
3139 + reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
3140 + return (carry_op *) child;
3141 + }
3142 + result->node = child;
3143 + result->op = op;
3144 + child->parent = apply_to_parent_p;
3145 + if (ZF_ISSET(node, JNODE_ORPHAN))
3146 + child->left_before = 1;
3147 + child->node = node;
3148 + return result;
3149 +}
3150 +
3151 +/* lock all carry nodes in @level */
3152 +static int lock_carry_level(carry_level * level/* level to lock */)
3153 +{
3154 + int result;
3155 + carry_node *node;
3156 + carry_node *tmp_node;
3157 +
3158 + assert("nikita-881", level != NULL);
3159 + assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3160 +
3161 + /* lock nodes from left to right */
3162 + result = 0;
3163 + for_all_nodes(level, node, tmp_node) {
3164 + result = lock_carry_node(level, node);
3165 + if (result != 0)
3166 + break;
3167 + }
3168 + return result;
3169 +}
3170 +
3171 +/* Synchronize delimiting keys between @node and its left neighbor.
3172 +
3173 + To reduce contention on dk key and simplify carry code, we synchronize
3174 + delimiting keys only when carry ultimately leaves tree level (carrying
3175 + changes upward) and unlocks nodes at this level.
3176 +
3177 +   This function first finds the left neighbor of @node and then updates the
3178 +   left neighbor's right delimiting key to coincide with the least key in @node.
3179 +
3180 +*/
3181 +
3182 +ON_DEBUG(extern atomic_t delim_key_version;
3183 + )
3184 +
3185 +static void sync_dkeys(znode * spot/* node to update */)
3186 +{
3187 + reiser4_key pivot;
3188 + reiser4_tree *tree;
3189 +
3190 + assert("nikita-1610", spot != NULL);
3191 + assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3192 +
3193 + tree = znode_get_tree(spot);
3194 + read_lock_tree(tree);
3195 + write_lock_dk(tree);
3196 +
3197 + assert("nikita-2192", znode_is_loaded(spot));
3198 +
3199 + /* sync left delimiting key of @spot with key in its leftmost item */
3200 + if (node_is_empty(spot))
3201 + pivot = *znode_get_rd_key(spot);
3202 + else
3203 + leftmost_key_in_node(spot, &pivot);
3204 +
3205 + znode_set_ld_key(spot, &pivot);
3206 +
3207 + /* there can be sequence of empty nodes pending removal on the left of
3208 + @spot. Scan them and update their left and right delimiting keys to
3209 + match left delimiting key of @spot. Also, update right delimiting
3210 + key of first non-empty left neighbor.
3211 + */
3212 + while (1) {
3213 + if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3214 + break;
3215 +
3216 + spot = spot->left;
3217 + if (spot == NULL)
3218 + break;
3219 +
3220 + znode_set_rd_key(spot, &pivot);
3221 + /* don't sink into the domain of another balancing */
3222 + if (!znode_is_write_locked(spot))
3223 + break;
3224 + if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3225 + znode_set_ld_key(spot, &pivot);
3226 + else
3227 + break;
3228 + }
3229 +
3230 + write_unlock_dk(tree);
3231 + read_unlock_tree(tree);
3232 +}
3233 +
3234 +/* unlock all carry nodes in @level */
3235 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3236 + int failure /* true if unlocking owing to
3237 + * failure */ )
3238 +{
3239 + carry_node *node;
3240 + carry_node *tmp_node;
3241 +
3242 + assert("nikita-889", level != NULL);
3243 +
3244 + if (!failure) {
3245 + znode *spot;
3246 +
3247 + spot = NULL;
3248 + /* update delimiting keys */
3249 + for_all_nodes(level, node, tmp_node) {
3250 + if (reiser4_carry_real(node) != spot) {
3251 + spot = reiser4_carry_real(node);
3252 + sync_dkeys(spot);
3253 + }
3254 + }
3255 + }
3256 +
3257 + /* nodes can be unlocked in arbitrary order. In preemptible
3258 + environment it's better to unlock in reverse order of locking,
3259 + though.
3260 + */
3261 + for_all_nodes_back(level, node, tmp_node) {
3262 + /* all allocated nodes should be already linked to their
3263 + parents at this moment. */
3264 + assert("nikita-1631",
3265 + ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3266 + JNODE_ORPHAN)));
3267 + ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3268 + unlock_carry_node(level, node, failure);
3269 + }
3270 + level->new_root = NULL;
3271 +}
3272 +
3273 +/* finish with @level
3274 +
3275 + Unlock nodes and release all allocated resources */
3276 +static void done_carry_level(carry_level * level/* level to finish */)
3277 +{
3278 + carry_node *node;
3279 + carry_node *tmp_node;
3280 + carry_op *op;
3281 + carry_op *tmp_op;
3282 +
3283 + assert("nikita-1076", level != NULL);
3284 +
3285 + unlock_carry_level(level, 0);
3286 + for_all_nodes(level, node, tmp_node) {
3287 + assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3288 + assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3289 + reiser4_pool_free(&level->pool->node_pool, &node->header);
3290 + }
3291 + for_all_ops(level, op, tmp_op)
3292 + reiser4_pool_free(&level->pool->op_pool, &op->header);
3293 +}
3294 +
3295 +/* helper function to complete locking of carry node
3296 +
3297 + Finish locking of carry node. There are several ways in which new carry
3298 + node can be added into a carry level and locked. The normal way is through
3299 + lock_carry_node(), but also from find_{left|right}_neighbor(). This
3300 + function factors out common final part of all locking scenarios. It
3301 + assumes that @node->lock_handle is the lock handle for the lock just taken
3302 + and fills ->real_node from this lock handle.
3303 +
3304 +*/
3305 +int lock_carry_node_tail(carry_node * node/* node to complete locking of */)
3306 +{
3307 + assert("nikita-1052", node != NULL);
3308 + assert("nikita-1187", reiser4_carry_real(node) != NULL);
3309 + assert("nikita-1188", !node->unlock);
3310 +
3311 + node->unlock = 1;
3312 + /* Load node content into memory and install node plugin by
3313 + looking at the node header.
3314 +
3315 + Most of the time this call is cheap because the node is
3316 + already in memory.
3317 +
3318 + Corresponding zrelse() is in unlock_carry_node()
3319 + */
3320 + return zload(reiser4_carry_real(node));
3321 +}
3322 +
3323 +/* lock carry node
3324 +
3325 + "Resolve" node to real znode, lock it and mark as locked.
3326 + This requires recursive locking of znodes.
3327 +
3328 + When an operation is posted to the parent level, the node it will be applied
3329 + to is not yet known. For example, when shifting data between two nodes, the
3330 + delimiting key has to be updated in the parent or parents of the nodes
3331 + involved. But their parents are not yet locked and, moreover, said nodes can
3332 + be reparented by concurrent balancing.
3333 +
3334 + To work around this, carry operation is applied to special "carry node"
3335 + rather than to the znode itself. Carry node consists of some "base" or
3336 + "reference" znode and flags indicating how to get to the target of carry
3337 + operation (->real_node field of carry_node) from base.
3338 +
3339 +*/
3340 +int lock_carry_node(carry_level * level /* level @node is in */ ,
3341 + carry_node * node/* node to lock */)
3342 +{
3343 + int result;
3344 + znode *reference_point;
3345 + lock_handle lh;
3346 + lock_handle tmp_lh;
3347 + reiser4_tree *tree;
3348 +
3349 + assert("nikita-887", level != NULL);
3350 + assert("nikita-882", node != NULL);
3351 +
3352 + result = 0;
3353 + reference_point = node->node;
3354 + init_lh(&lh);
3355 + init_lh(&tmp_lh);
3356 + if (node->left_before) {
3357 + /* handling of new nodes, allocated on the previous level:
3358 +
3359 + some carry ops were probably posted from the new node, but
3360 + this node neither has parent pointer set, nor is
3361 + connected. This will be done in ->create_hook() for
3362 + internal item.
3363 +
3364 + Nonetheless, the parent of the new node has to be locked. To do
3365 + this, first go to the "left" in the carry order. This
3366 + depends on the decision to always allocate new node on the
3367 + right of existing one.
3368 +
3369 + Loop handles case when multiple nodes, all orphans, were
3370 + inserted.
3371 +
3372 + Strictly speaking, taking tree lock is not necessary here,
3373 + because all nodes scanned by loop in
3374 + find_begetting_brother() are write-locked by this thread,
3375 + and thus, their sibling linkage cannot change.
3376 +
3377 + */
3378 + tree = znode_get_tree(reference_point);
3379 + read_lock_tree(tree);
3380 + reference_point = find_begetting_brother(node, level)->node;
3381 + read_unlock_tree(tree);
3382 + assert("nikita-1186", reference_point != NULL);
3383 + }
3384 + if (node->parent && (result == 0)) {
3385 + result =
3386 + reiser4_get_parent(&tmp_lh, reference_point,
3387 + ZNODE_WRITE_LOCK);
3388 + if (result != 0) {
3389 + ; /* nothing */
3390 + } else if (znode_get_level(tmp_lh.node) == 0) {
3391 + assert("nikita-1347", znode_above_root(tmp_lh.node));
3392 + result = add_new_root(level, node, tmp_lh.node);
3393 + if (result == 0) {
3394 + reference_point = level->new_root;
3395 + move_lh(&lh, &node->lock_handle);
3396 + }
3397 + } else if ((level->new_root != NULL)
3398 + && (level->new_root !=
3399 + znode_parent_nolock(reference_point))) {
3400 + /* parent of node exists, but this level already
3401 + created a different new root, so */
3402 + warning("nikita-1109",
3403 + /* it should be "radicis", but tradition is
3404 + tradition. do banshees read latin? */
3405 + "hodie natus est radici frater");
3406 + result = -EIO;
3407 + } else {
3408 + move_lh(&lh, &tmp_lh);
3409 + reference_point = lh.node;
3410 + }
3411 + }
3412 + if (node->left && (result == 0)) {
3413 + assert("nikita-1183", node->parent);
3414 + assert("nikita-883", reference_point != NULL);
3415 + result =
3416 + reiser4_get_left_neighbor(&tmp_lh, reference_point,
3417 + ZNODE_WRITE_LOCK,
3418 + GN_CAN_USE_UPPER_LEVELS);
3419 + if (result == 0) {
3420 + done_lh(&lh);
3421 + move_lh(&lh, &tmp_lh);
3422 + reference_point = lh.node;
3423 + }
3424 + }
3425 + if (!node->parent && !node->left && !node->left_before) {
3426 + result =
3427 + longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3428 + ZNODE_LOCK_HIPRI);
3429 + }
3430 + if (result == 0) {
3431 + move_lh(&node->lock_handle, &lh);
3432 + result = lock_carry_node_tail(node);
3433 + }
3434 + done_lh(&tmp_lh);
3435 + done_lh(&lh);
3436 + return result;
3437 +}
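+
+/* For orientation, a minimal sketch of the other side of this machinery:
+   how a caller queues an operation against the *parent* of @node, which
+   is what sets the carry_node's ->parent bit resolved above. Illustrative
+   only, hence under #if 0; reiser4_post_carry() is declared in carry.h
+   below: */
+#if 0
+static int post_insert_to_parent_sketch(carry_level *todo, znode *node)
+{
+	carry_op *op;
+
+	/* last argument of 1 requests that the op be applied to the
+	   parent of @node rather than to @node itself */
+	op = reiser4_post_carry(todo, COP_INSERT, node, 1);
+	if (IS_ERR(op))
+		return PTR_ERR(op);
+	/* ... caller fills op->u.insert ... */
+	return 0;
+}
+#endif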
3438 +
3439 +/* release a lock on &carry_node.
3440 +
3441 + Release, if necessary, the lock on @node. This operation is the pair of
3442 + lock_carry_node() and is idempotent: you can call it more than once on the
3443 + same node.
3444 +
3445 +*/
3446 +static void
3447 +unlock_carry_node(carry_level * level,
3448 + carry_node * node /* node to be released */ ,
3449 + int failure /* true if node is unlocked
3450 + * owing to some error */ )
3451 +{
3452 + znode *real_node;
3453 +
3454 + assert("nikita-884", node != NULL);
3455 +
3456 + real_node = reiser4_carry_real(node);
3457 + /* pair to zload() in lock_carry_node_tail() */
3458 + zrelse(real_node);
3459 + if (node->unlock && (real_node != NULL)) {
3460 + assert("nikita-899", real_node == node->lock_handle.node);
3461 + longterm_unlock_znode(&node->lock_handle);
3462 + }
3463 + if (failure) {
3464 + if (node->deallocate && (real_node != NULL)) {
3465 + /* free node in bitmap
3466 +
3467 + Prepare node for removal. Last zput() will finish
3468 + with it.
3469 + */
3470 + ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3471 + }
3472 + if (node->free) {
3473 + assert("nikita-2177",
3474 + list_empty_careful(&node->lock_handle.locks_link));
3475 + assert("nikita-2112",
3476 + list_empty_careful(&node->lock_handle.owners_link));
3477 + reiser4_pool_free(&level->pool->node_pool,
3478 + &node->header);
3479 + }
3480 + }
3481 +}
3482 +
3483 +/* fatal_carry_error() - all-catching error handling function
3484 +
3485 + It is possible that carry faces an unrecoverable error, like inability to
3486 + insert a pointer at the internal level. Our simple solution is just to panic
3487 + in this situation. More sophisticated things like an attempt to remount the
3488 + file-system as read-only can be implemented without much difficulty.
3489 +
3490 + It is believed that:
3491 +
3492 + 1. instead of panicking, all current transactions can be aborted, rolling
3493 + the system back to a consistent state.
3494 +
3495 +Umm, if you simply panic without doing anything more at all, then all current
3496 +transactions are aborted and the system is rolled back to a consistent state,
3497 +by virtue of the design of the transactional mechanism. Well, wait, let's be
3498 +precise. If an internal node is corrupted on disk due to hardware failure,
3499 +then there may be no consistent state that can be rolled back to, so instead
3500 +we should say that it will roll back the transactions, which, barring other
3501 +factors, means rolling back to a consistent state.
3502 +
3503 +# Nikita: there is a subtle difference between panic and aborting
3504 +# transactions: the machine doesn't reboot. Processes aren't killed. Processes
3505 +# not using reiser4 (not that we care about such processes), or using other
3506 +# reiser4 mounts (about them we do care), will simply continue to run. With
3507 +# some luck, even an application using the aborted file system can survive: it
3508 +# will get some error, like EBADF, from each file descriptor on the failed
3509 +# file system, but applications that do care about tolerance will cope with
3510 +# this (squid will).
3511 +
3512 +It would be a nice feature though to support rollback without rebooting
3513 +followed by remount, but this can wait for later versions.
3514 +
3515 + 2. once isolated transactions are implemented, it will be possible to
3516 + roll back the offending transaction.
3517 +
3518 +2. adds code complexity of uncertain value (it implies that a
3519 +broken tree should be kept in operation), so we must think about it more
3520 +before deciding if it should be done. -Hans
3521 +
3522 +*/
3523 +static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3524 + * where
3525 + * unrecoverable
3526 + * error
3527 + * occurred */ ,
3528 + int ecode/* error code */)
3529 +{
3530 + assert("nikita-1230", doing != NULL);
3531 + assert("nikita-1231", ecode < 0);
3532 +
3533 + reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3534 +}
3535 +
3536 +/**
3537 + * Add new root to the tree
3538 + *
3539 + * This function itself only manages changes in carry structures and delegates
3540 + * all hard work (allocation of znode for new root, changes of parent and
3541 + * sibling pointers) to the reiser4_add_tree_root().
3542 + *
3543 + * Locking: old tree root is locked by carry at this point. Fake znode is also
3544 + * locked.
3545 + */
3546 +static int add_new_root(carry_level * level,/* carry level in context of which
3547 + * operation is performed */
3548 + carry_node * node, /* carry node for existing root */
3549 + znode * fake /* "fake" znode already locked by
3550 + * us */)
3551 +{
3552 + int result;
3553 +
3554 + assert("nikita-1104", level != NULL);
3555 + assert("nikita-1105", node != NULL);
3556 +
3557 + assert("nikita-1403", znode_is_write_locked(node->node));
3558 + assert("nikita-1404", znode_is_write_locked(fake));
3559 +
3560 + /* trying to create new root. */
3561 + /* @node is root and it's already locked by us. This
3562 + means that nobody else can be trying to add/remove
3563 + tree root right now.
3564 + */
3565 + if (level->new_root == NULL)
3566 + level->new_root = reiser4_add_tree_root(node->node, fake);
3567 + if (!IS_ERR(level->new_root)) {
3568 + assert("nikita-1210", znode_is_root(level->new_root));
3569 + node->deallocate = 1;
3570 + result =
3571 + longterm_lock_znode(&node->lock_handle, level->new_root,
3572 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3573 + if (result == 0)
3574 + zput(level->new_root);
3575 + } else {
3576 + result = PTR_ERR(level->new_root);
3577 + level->new_root = NULL;
3578 + }
3579 + return result;
3580 +}
3581 +
3582 +/* allocate a new znode and add to the todo level the operation
3583 + that inserts a pointer to it into the parent node
3584 +
3585 + Allocate new znode, add it into carry queue and post into @todo queue
3586 + request to add pointer to new node into its parent.
3587 +
3588 + This is a carry-related routine that calls reiser4_new_node() to allocate a new
3589 + node.
3590 +*/
3591 +carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3592 + * node */ ,
3593 + carry_node * ref /* carry node after which new
3594 + * carry node is to be inserted
3595 + * into queue. This affects
3596 + * locking. */ ,
3597 + carry_level * doing /* carry queue where new node is
3598 + * to be added */ ,
3599 + carry_level * todo /* carry queue where COP_INSERT
3600 + * operation to add pointer to
3601 + * new node will be added */ )
3602 +{
3603 + carry_node *fresh;
3604 + znode *new_znode;
3605 + carry_op *add_pointer;
3606 + carry_plugin_info info;
3607 +
3608 + assert("nikita-1048", brother != NULL);
3609 + assert("nikita-1049", todo != NULL);
3610 +
3611 + /* There are a lot of possible variations here: to what parent
3612 + new node will be attached and where. For simplicity, always
3613 + do the following:
3614 +
3615 + (1) new node and @brother will have the same parent.
3616 +
3617 + (2) new node is added on the right of @brother
3618 +
3619 + */
3620 +
3621 + fresh = reiser4_add_carry_skip(doing,
3622 + ref ? POOLO_AFTER : POOLO_LAST, ref);
3623 + if (IS_ERR(fresh))
3624 + return fresh;
3625 +
3626 + fresh->deallocate = 1;
3627 + fresh->free = 1;
3628 +
3629 + new_znode = reiser4_new_node(brother, znode_get_level(brother));
3630 + if (IS_ERR(new_znode))
3631 + /* @fresh will be deallocated automatically by error
3632 + handling code in the caller. */
3633 + return (carry_node *) new_znode;
3634 +
3635 + /* reiser4_new_node() returned a znode with x_count of 1. The caller
3636 + has to decrease it; make_space() does. */
3637 +
3638 + ZF_SET(new_znode, JNODE_ORPHAN);
3639 + fresh->node = new_znode;
3640 +
3641 + while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3642 + ref = carry_node_prev(ref);
3643 + assert("nikita-1606", !carry_node_end(doing, ref));
3644 + }
3645 +
3646 + info.todo = todo;
3647 + info.doing = doing;
3648 + add_pointer = node_post_carry(&info, COP_INSERT,
3649 + reiser4_carry_real(ref), 1);
3650 + if (IS_ERR(add_pointer)) {
3651 + /* no need to deallocate @new_znode here: it will be
3652 + deallocated during carry error handling. */
3653 + return (carry_node *) add_pointer;
3654 + }
3655 +
3656 + add_pointer->u.insert.type = COPT_CHILD;
3657 + add_pointer->u.insert.child = fresh;
3658 + add_pointer->u.insert.brother = brother;
3659 + /* initially new node spans an empty key range */
3660 + write_lock_dk(znode_get_tree(brother));
3661 + znode_set_ld_key(new_znode,
3662 + znode_set_rd_key(new_znode,
3663 + znode_get_rd_key(brother)));
3664 + write_unlock_dk(znode_get_tree(brother));
3665 + return fresh;
3666 +}
3667 +
3668 +/* DEBUGGING FUNCTIONS.
3669 +
3670 + Probably we should also leave them enabled even when
3671 + debugging is turned off, to print dumps on errors.
3672 +*/
3673 +#if REISER4_DEBUG
3674 +static int carry_level_invariant(carry_level * level, carry_queue_state state)
3675 +{
3676 + carry_node *node;
3677 + carry_node *tmp_node;
3678 +
3679 + if (level == NULL)
3680 + return 0;
3681 +
3682 + if (level->track_type != 0 &&
3683 + level->track_type != CARRY_TRACK_NODE &&
3684 + level->track_type != CARRY_TRACK_CHANGE)
3685 + return 0;
3686 +
3687 + /* check that nodes are in ascending order */
3688 + for_all_nodes(level, node, tmp_node) {
3689 + znode *left;
3690 + znode *right;
3691 +
3692 + reiser4_key lkey;
3693 + reiser4_key rkey;
3694 +
3695 + if (node != carry_node_front(level)) {
3696 + if (state == CARRY_TODO) {
3697 + right = node->node;
3698 + left = carry_node_prev(node)->node;
3699 + } else {
3700 + right = reiser4_carry_real(node);
3701 + left = reiser4_carry_real(carry_node_prev(node));
3702 + }
3703 + if (right == NULL || left == NULL)
3704 + continue;
3705 + if (node_is_empty(right) || node_is_empty(left))
3706 + continue;
3707 + if (!keyle(leftmost_key_in_node(left, &lkey),
3708 + leftmost_key_in_node(right, &rkey))) {
3709 + warning("", "wrong key order");
3710 + return 0;
3711 + }
3712 + }
3713 + }
3714 + return 1;
3715 +}
3716 +#endif
3717 +
3718 +/* get symbolic name for boolean */
3719 +static const char *tf(int boolean/* truth value */)
3720 +{
3721 + return boolean ? "t" : "f";
3722 +}
3723 +
3724 +/* symbolic name for carry operation */
3725 +static const char *carry_op_name(carry_opcode op/* carry opcode */)
3726 +{
3727 + switch (op) {
3728 + case COP_INSERT:
3729 + return "COP_INSERT";
3730 + case COP_DELETE:
3731 + return "COP_DELETE";
3732 + case COP_CUT:
3733 + return "COP_CUT";
3734 + case COP_PASTE:
3735 + return "COP_PASTE";
3736 + case COP_UPDATE:
3737 + return "COP_UPDATE";
3738 + case COP_EXTENT:
3739 + return "COP_EXTENT";
3740 + case COP_INSERT_FLOW:
3741 + return "COP_INSERT_FLOW";
3742 + default:{
3743 + /* not mt safe, but who cares? */
3744 + static char buf[24];
3745 +
3746 + snprintf(buf, sizeof(buf), "unknown op: %x", op);
3747 + return buf;
3748 + }
3749 + }
3750 +}
3751 +
3752 +/* dump information about carry node */
3753 +static void print_carry(const char *prefix /* prefix to print */ ,
3754 + carry_node * node/* node to print */)
3755 +{
3756 + if (node == NULL) {
3757 + printk("%s: null\n", prefix);
3758 + return;
3759 + }
3760 + printk
3761 + ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3762 + prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3763 + tf(node->free), tf(node->deallocate));
3764 +}
3765 +
3766 +/* dump information about carry operation */
3767 +static void print_op(const char *prefix /* prefix to print */ ,
3768 + carry_op * op/* operation to print */)
3769 +{
3770 + if (op == NULL) {
3771 + printk("%s: null\n", prefix);
3772 + return;
3773 + }
3774 + printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3775 + print_carry("\tnode", op->node);
3776 + switch (op->op) {
3777 + case COP_INSERT:
3778 + case COP_PASTE:
3779 + print_coord("\tcoord",
3780 + op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3781 + reiser4_print_key("\tkey",
3782 + op->u.insert.d ? op->u.insert.d->key : NULL);
3783 + print_carry("\tchild", op->u.insert.child);
3784 + break;
3785 + case COP_DELETE:
3786 + print_carry("\tchild", op->u.delete.child);
3787 + break;
3788 + case COP_CUT:
3789 + if (op->u.cut_or_kill.is_cut) {
3790 + print_coord("\tfrom",
3791 + op->u.cut_or_kill.u.kill->params.from, 0);
3792 + print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3793 + 0);
3794 + } else {
3795 + print_coord("\tfrom",
3796 + op->u.cut_or_kill.u.cut->params.from, 0);
3797 + print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3798 + 0);
3799 + }
3800 + break;
3801 + case COP_UPDATE:
3802 + print_carry("\tleft", op->u.update.left);
3803 + break;
3804 + default:
3805 + /* do nothing */
3806 + break;
3807 + }
3808 +}
3809 +
3810 +/* dump information about all nodes and operations in a @level */
3811 +static void print_level(const char *prefix /* prefix to print */ ,
3812 + carry_level * level/* level to print */)
3813 +{
3814 + carry_node *node;
3815 + carry_node *tmp_node;
3816 + carry_op *op;
3817 + carry_op *tmp_op;
3818 +
3819 + if (level == NULL) {
3820 + printk("%s: null\n", prefix);
3821 + return;
3822 + }
3823 + printk("%s: %p, restartable: %s\n",
3824 + prefix, level, tf(level->restartable));
3825 +
3826 + for_all_nodes(level, node, tmp_node)
3827 + print_carry("\tcarry node", node);
3828 + for_all_ops(level, op, tmp_op)
3829 + print_op("\tcarry op", op);
3830 +}
3831 +
3832 +/* Make Linus happy.
3833 + Local variables:
3834 + c-indentation-style: "K&R"
3835 + mode-name: "LC"
3836 + c-basic-offset: 8
3837 + tab-width: 8
3838 + fill-column: 120
3839 + scroll-step: 1
3840 + End:
3841 +*/
3842 diff -urN linux-2.6.33.orig/fs/reiser4/carry.h linux-2.6.33/fs/reiser4/carry.h
3843 --- linux-2.6.33.orig/fs/reiser4/carry.h 1970-01-01 01:00:00.000000000 +0100
3844 +++ linux-2.6.33/fs/reiser4/carry.h 2010-03-04 19:33:22.000000000 +0100
3845 @@ -0,0 +1,445 @@
3846 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
3847 + reiser4/README */
3848 +
3849 +/* Functions and data types to "carry" tree modification(s) upward.
3850 + See fs/reiser4/carry.c for details. */
3851 +
3852 +#if !defined(__FS_REISER4_CARRY_H__)
3853 +#define __FS_REISER4_CARRY_H__
3854 +
3855 +#include "forward.h"
3856 +#include "debug.h"
3857 +#include "pool.h"
3858 +#include "znode.h"
3859 +
3860 +#include <linux/types.h>
3861 +
3862 +/* &carry_node - "location" of carry node.
3863 +
3864 + "location" of node that is involved or going to be involved into
3865 + carry process. Node where operation will be carried to on the
3866 + parent level cannot be recorded explicitly. Operation will be carried
3867 + usually to the parent of some node (where changes are performed at
3868 + the current level) or, to the left neighbor of its parent. But while
3869 + modifications are performed at the current level, parent may
3870 + change. So, we have to allow some indirection (or, positevly,
3871 + flexibility) in locating carry nodes.
3872 +
3873 +*/
3874 +typedef struct carry_node {
3875 + /* pool linkage */
3876 + struct reiser4_pool_header header;
3877 +
3878 + /* base node from which real_node is calculated. See
3879 + fs/reiser4/carry.c:lock_carry_node(). */
3880 + znode *node;
3881 +
3882 + /* how to get ->real_node */
3883 + /* to get ->real_node obtain parent of ->node */
3884 + __u32 parent:1;
3885 + /* to get ->real_node obtain left neighbor of parent of
3886 + ->node */
3887 + __u32 left:1;
3888 + __u32 left_before:1;
3889 +
3890 + /* locking */
3891 +
3892 + /* this node was locked by carry process and should be
3893 + unlocked when carry leaves a level */
3894 + __u32 unlock:1;
3895 +
3896 + /* disk block for this node was allocated by carry process and
3897 + should be deallocated when carry leaves a level */
3898 + __u32 deallocate:1;
3899 + /* this carry node was allocated by carry process and should be
3900 + freed when carry leaves a level */
3901 + __u32 free:1;
3902 +
3903 + /* lock handle for the long-term lock taken on this node */
3904 + lock_handle lock_handle;
3905 +} carry_node;
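+
+/* A condensed, illustrative restatement of how lock_carry_node() (see
+   carry.c) resolves the three location bits above; not compiled: */
+#if 0
+	if (node->left_before)
+		/* start from the "begetting brother" of ->node, found by
+		   scanning the carry order to the left */;
+	if (node->parent)
+		/* lock the parent of the reference node (possibly creating
+		   a new root if that parent is the fake znode) */;
+	if (node->left)
+		/* then move to the left neighbor of that parent */;
+	if (!node->parent && !node->left && !node->left_before)
+		/* lock ->node itself */;
+#endif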
3906 +
3907 +/* &carry_opcode - elementary operations that can be carried upward
3908 +
3909 + Operations that carry() can handle. This list is supposed to be
3910 + expanded.
3911 +
3912 + Each carry operation (cop) is handled by appropriate function defined
3913 + in fs/reiser4/carry.c. For example COP_INSERT is handled by
3914 + fs/reiser4/carry.c:carry_insert() etc. These functions in turn
3915 + call plugins of nodes affected by operation to modify nodes' content
3916 + and to gather operations to be performed on the next level.
3917 +
3918 +*/
3919 +typedef enum {
3920 + /* insert new item into node. */
3921 + COP_INSERT,
3922 + /* delete pointer from parent node */
3923 + COP_DELETE,
3924 + /* remove part of or whole node. */
3925 + COP_CUT,
3926 + /* increase size of item. */
3927 + COP_PASTE,
3928 + /* insert extent (that is sequence of unformatted nodes). */
3929 + COP_EXTENT,
3930 + /* update delimiting key in least common ancestor of two
3931 + nodes. This is performed when items are moved between two
3932 + nodes.
3933 + */
3934 + COP_UPDATE,
3935 + /* insert flow */
3936 + COP_INSERT_FLOW,
3937 + COP_LAST_OP,
3938 +} carry_opcode;
3939 +
3940 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
3941 +
3942 +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3943 + item is determined. */
3944 +typedef enum {
3945 + /* target item is one containing pointer to the ->child node */
3946 + COPT_CHILD,
3947 + /* target item is given explicitly by @coord */
3948 + COPT_ITEM_DATA,
3949 + /* target item is given by key */
3950 + COPT_KEY,
3951 + /* see insert_paste_common() for more comments on this. */
3952 + COPT_PASTE_RESTARTED,
3953 +} cop_insert_pos_type;
3954 +
3955 +/* flags to cut and delete */
3956 +typedef enum {
3957 + /* don't kill node even if it became completely empty as results of
3958 + * cut. This is needed for eottl handling. See carry_extent() for
3959 + * details. */
3960 + DELETE_RETAIN_EMPTY = (1 << 0)
3961 +} cop_delete_flag;
3962 +
3963 +/*
3964 + * carry() implements "lock handle tracking" feature.
3965 + *
3966 + * Callers supply carry with the node where the initial operation is to
3967 + * be performed and a lock handle on this node. Trying to optimize node
3968 + * utilization, carry may actually move the insertion point to a different
3969 + * node. Callers expect that the lock handle will be transferred to the new node as well.
3970 + *
3971 + */
3972 +typedef enum {
3973 + /* transfer lock handle along with insertion point */
3974 + CARRY_TRACK_CHANGE = 1,
3975 + /* acquire new lock handle to the node where insertion point is. This
3976 + * is used when carry() client doesn't initially possess lock handle
3977 + * on the insertion point node, for example, by extent insertion
3978 + * code. See carry_extent(). */
3979 + CARRY_TRACK_NODE = 2
3980 +} carry_track_type;
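+
+/* Sketch of how a caller opts into tracking, assuming it already holds
+   lock handle @lh on the insertion-point node; ->track_type and
+   ->tracked are fields of carry_level below: */
+#if 0
+	doing->track_type = CARRY_TRACK_CHANGE;
+	doing->tracked = &lh;
+	/* after carry completes, @lh refers to the node where the
+	   insertion point ended up, relocked there if carry moved it */
+#endif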
3981 +
3982 +/* data supplied to COP_{INSERT|PASTE} by callers */
3983 +typedef struct carry_insert_data {
3984 + /* position where new item is to be inserted */
3985 + coord_t *coord;
3986 + /* new item description */
3987 + reiser4_item_data * data;
3988 + /* key of new item */
3989 + const reiser4_key * key;
3990 +} carry_insert_data;
3991 +
3992 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the
3993 + below structure of parameters */
3994 +struct cut_kill_params {
3995 + /* coord where cut starts (inclusive) */
3996 + coord_t *from;
3997 + /* coord where cut stops (inclusive, this item/unit will also be
3998 + * cut) */
3999 + coord_t *to;
4000 + /* starting key. This is necessary when item and unit pos don't
4001 + * uniquely identify what portion of the tree to remove. For example, this
4002 + * indicates what portion of extent unit will be affected. */
4003 + const reiser4_key * from_key;
4004 + /* exclusive stop key */
4005 + const reiser4_key * to_key;
4006 + /* if this is not NULL, smallest actually removed key is stored
4007 + * here. */
4008 + reiser4_key *smallest_removed;
4009 + /* kill_node_content() is called for file truncate */
4010 + int truncate;
4011 +};
4012 +
4013 +struct carry_cut_data {
4014 + struct cut_kill_params params;
4015 +};
4016 +
4017 +struct carry_kill_data {
4018 + struct cut_kill_params params;
4019 + /* parameter to be passed to the ->kill_hook() method of item
4020 + * plugin */
4021 + /*void *iplug_params; *//* FIXME: unused currently */
4022 + /* if not NULL---inode whose items are being removed. This is needed
4023 + * for ->kill_hook() of extent item to update VM structures when
4024 + * removing pages. */
4025 + struct inode *inode;
4026 + /* sibling list maintenance is complicated by existence of eottl. When
4027 + * eottl whose left and right neighbors are formatted leaves is
4028 + * removed, one has to connect said leaves in the sibling list. This
4029 + * cannot be done when extent removal is just started as locking rules
4030 + * require sibling list update to happen atomically with removal of
4031 + * extent item. Therefore: 1. pointers to left and right neighbors
4032 + * have to be passed down to the ->kill_hook() of extent item, and
4033 + * 2. said neighbors have to be locked. */
4034 + lock_handle *left;
4035 + lock_handle *right;
4036 + /* flags modifying behavior of kill. Currently, it may have
4037 + DELETE_RETAIN_EMPTY set. */
4038 + unsigned flags;
4039 + char *buf;
4040 +};
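+
+/* Sketch: how cut parameters are typically packaged and queued as a
+   COP_CUT (cf. the cut_or_kill union in carry_op below). The coords,
+   keys, @lowest_level and @node are placeholders a real caller must
+   supply: */
+#if 0
+	struct carry_cut_data cut_data;
+	carry_op *op;
+
+	cut_data.params.from = from_coord;	/* inclusive */
+	cut_data.params.to = to_coord;		/* inclusive */
+	cut_data.params.from_key = from_key;
+	cut_data.params.to_key = to_key;	/* exclusive */
+	cut_data.params.smallest_removed = NULL;
+	cut_data.params.truncate = 0;
+
+	op = reiser4_post_carry(lowest_level, COP_CUT, node, 0);
+	if (!IS_ERR(op)) {
+		op->u.cut_or_kill.is_cut = 1;
+		op->u.cut_or_kill.u.cut = &cut_data;
+	}
+#endif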
4041 +
4042 +/* &carry_tree_op - operation to "carry" upward.
4043 +
4044 + Description of an operation we want to "carry" to the upper level of
4045 + a tree: e.g, when we insert something and there is not enough space
4046 + we allocate a new node and "carry" the operation of inserting a
4047 + pointer to the new node to the upper level, on removal of empty node,
4048 + we carry up operation of removing appropriate entry from parent.
4049 +
4050 + There are two types of carry ops: when adding or deleting a node, the
4051 + node at the parent level where the appropriate modification has to be
4052 + performed is known in advance. When shifting items between nodes
4053 + (split, merge), the delimiting key should be changed in the least common
4054 + parent of the nodes involved, which is not known in advance.
4055 +
4056 + For operations of the first type we store in &carry_op a pointer to
4057 + the &carry_node at the parent level. For operations of the second
4058 + type we store &carry_node for the parents of the left and right nodes
4059 + modified and keep track of them upward until they coincide.
4060 +
4061 +*/
4062 +typedef struct carry_op {
4063 + /* pool linkage */
4064 + struct reiser4_pool_header header;
4065 + carry_opcode op;
4066 + /* node on which operation is to be performed:
4067 +
4068 + for insert, paste: node where new item is to be inserted
4069 +
4070 + for delete: node where pointer is to be deleted
4071 +
4072 + for cut: node to cut from
4073 +
4074 + for update: node where delimiting key is to be modified
4075 +
4076 + for modify: parent of modified node
4077 +
4078 + */
4079 + carry_node *node;
4080 + union {
4081 + struct {
4082 + /* (sub-)type of insertion/paste. Taken from
4083 + cop_insert_pos_type. */
4084 + __u8 type;
4085 + /* various operation flags. Taken from
4086 + cop_insert_flag. */
4087 + __u8 flags;
4088 + carry_insert_data *d;
4089 + carry_node *child;
4090 + znode *brother;
4091 + } insert, paste, extent;
4092 +
4093 + struct {
4094 + int is_cut;
4095 + union {
4096 + carry_kill_data *kill;
4097 + carry_cut_data *cut;
4098 + } u;
4099 + } cut_or_kill;
4100 +
4101 + struct {
4102 + carry_node *left;
4103 + } update;
4104 + struct {
4105 + /* changed child */
4106 + carry_node *child;
4107 + /* bitmask of changes. See &cop_modify_flag */
4108 + __u32 flag;
4109 + } modify;
4110 + struct {
4111 + /* flags to deletion operation. Are taken from
4112 + cop_delete_flag */
4113 + __u32 flags;
4114 + /* child to delete from parent. If this is
4115 + NULL, delete op->node. */
4116 + carry_node *child;
4117 + } delete;
4118 + struct {
4119 + /* various operation flags. Taken from
4120 + cop_insert_flag. */
4121 + __u32 flags;
4122 + flow_t *flow;
4123 + coord_t *insert_point;
4124 + reiser4_item_data *data;
4125 + /* flow insertion is limited by the number of new blocks
4126 + added in that operation which get no data other than
4127 + a part of the flow. This limit is set by the macro
4128 + CARRY_FLOW_NEW_NODES_LIMIT. This field stores the number
4129 + of nodes already added during one carry_flow */
4130 + int new_nodes;
4131 + } insert_flow;
4132 + } u;
4133 +} carry_op;
4134 +
4135 +/* &carry_op_pool - preallocated pool of carry operations, and nodes */
4136 +typedef struct carry_pool {
4137 + carry_op op[CARRIES_POOL_SIZE];
4138 + struct reiser4_pool op_pool;
4139 + carry_node node[NODES_LOCKED_POOL_SIZE];
4140 + struct reiser4_pool node_pool;
4141 +} carry_pool;
4142 +
4143 +/* &carry_tree_level - carry process on given level
4144 +
4145 + Description of balancing process on the given level.
4146 +
4147 + No need for locking here, as carry_tree_level is essentially a
4148 + per-thread thing (for now).
4149 +
4150 +*/
4151 +struct carry_level {
4152 + /* this level may be restarted */
4153 + __u32 restartable:1;
4154 + /* list of carry nodes on this level, ordered by key order */
4155 + struct list_head nodes;
4156 + struct list_head ops;
4157 + /* pool where new objects are allocated from */
4158 + carry_pool *pool;
4159 + int ops_num;
4160 + int nodes_num;
4161 + /* new root created on this level, if any */
4162 + znode *new_root;
4163 + /* This is set by the caller (insert_by_key(), reiser4_resize_item(), etc.)
4164 + when they want ->tracked to automagically wander to the node where the
4165 + insertion point moved after insert or paste.
4166 + */
4167 + carry_track_type track_type;
4168 + /* lock handle supplied by user that we are tracking. See
4169 + above. */
4170 + lock_handle *tracked;
4171 +};
4172 +
4173 +/* information carry passes to plugin methods that may add new operations to
4174 + the @todo queue */
4175 +struct carry_plugin_info {
4176 + carry_level *doing;
4177 + carry_level *todo;
4178 +};
4179 +
4180 +int reiser4_carry(carry_level * doing, carry_level * done);
4181 +
4182 +carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
4183 + carry_node * reference);
4184 +carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
4185 + carry_node * reference);
4186 +
4187 +extern carry_node *insert_carry_node(carry_level * doing,
4188 + carry_level * todo, const znode * node);
4189 +
4190 +extern carry_pool *init_carry_pool(int);
4191 +extern void done_carry_pool(carry_pool * pool);
4192 +
4193 +extern void init_carry_level(carry_level * level, carry_pool * pool);
4194 +
4195 +extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4196 + znode * node, int apply_to_parent);
4197 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4198 + znode * node, int apply_to_parent_p);
4199 +
4200 +carry_node *add_new_znode(znode * brother, carry_node * reference,
4201 + carry_level * doing, carry_level * todo);
4202 +
4203 +carry_node *find_carry_node(carry_level * level, const znode * node);
4204 +
4205 +extern znode *reiser4_carry_real(const carry_node * node);
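+
+/* Putting the declarations above together: a minimal sketch of a typical
+   carry invocation, modelled on callers of this API elsewhere in the
+   patch (the 3-level sizing is the conventional choice, assumed here;
+   @node is the znode the caller wants to modify): */
+#if 0
+	carry_pool *pool;
+	carry_level *lowest_level;
+	carry_op *op;
+	int result;
+
+	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+	lowest_level = (carry_level *) (pool + 1);
+	init_carry_level(lowest_level, pool);
+
+	op = reiser4_post_carry(lowest_level, COP_INSERT, node, 0);
+	if (IS_ERR(op))
+		result = PTR_ERR(op);
+	else {
+		/* ... fill op->u.insert ... */
+		result = reiser4_carry(lowest_level, NULL);
+	}
+	done_carry_pool(pool);
+#endif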
4206 +
4207 +/* helper macros to iterate over carry queues */
4208 +
4209 +#define carry_node_next(node) \
4210 + list_entry((node)->header.level_linkage.next, carry_node, \
4211 + header.level_linkage)
4212 +
4213 +#define carry_node_prev(node) \
4214 + list_entry((node)->header.level_linkage.prev, carry_node, \
4215 + header.level_linkage)
4216 +
4217 +#define carry_node_front(level) \
4218 + list_entry((level)->nodes.next, carry_node, header.level_linkage)
4219 +
4220 +#define carry_node_back(level) \
4221 + list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4222 +
4223 +#define carry_node_end(level, node) \
4224 + (&(level)->nodes == &(node)->header.level_linkage)
4225 +
4226 +/* macro to iterate over all operations in a @level */
4227 +#define for_all_ops(level /* carry level (of type carry_level *) */, \
4228 + op /* pointer to carry operation, modified by loop (of \
4229 + * type carry_op *) */, \
4230 + tmp /* pointer to carry operation (of type carry_op *), \
4231 + * used to make iterator stable in the face of \
4232 + * deletions from the level */ ) \
4233 +for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4234 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4235 + &op->header.level_linkage != &level->ops; \
4236 + op = tmp, \
4237 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4238 +
4239 +#if 0
4240 +for (op = (carry_op *) pool_level_list_front(&level->ops), \
4241 + tmp = (carry_op *) pool_level_list_next(&op->header) ; \
4242 + !pool_level_list_end(&level->ops, &op->header) ; \
4243 + op = tmp, tmp = (carry_op *) pool_level_list_next(&op->header))
4244 +#endif
4245 +
4246 +/* macro to iterate over all nodes in a @level */
4247 +#define for_all_nodes(level /* carry level (of type carry_level *) */, \
4248 + node /* pointer to carry node, modified by loop (of \
4249 + * type carry_node *) */, \
4250 + tmp /* pointer to carry node (of type carry_node *), \
4251 + * used to make iterator stable in the face of \
4252 + * deletions from the level */ ) \
4253 +for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4254 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4255 + &node->header.level_linkage != &level->nodes; \
4256 + node = tmp, \
4257 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4258 +
4259 +#if 0
4260 +for (node = carry_node_front(level), \
4261 + tmp = carry_node_next(node) ; !carry_node_end(level, node) ; \
4262 + node = tmp, tmp = carry_node_next(node))
4263 +#endif
4264 +
4265 +/* macro to iterate over all nodes in a @level in reverse order
4266 +
4267 + This is used, because nodes are unlocked in reversed order of locking */
4268 +#define for_all_nodes_back(level /* carry level (of type carry_level *) */, \
4269 + node /* pointer to carry node, modified by loop \
4270 + * (of type carry_node *) */, \
4271 + tmp /* pointer to carry node (of type carry_node \
4272 + * *), used to make iterator stable in the \
4273 + * face of deletions from the level */ ) \
4274 +for (node = carry_node_back(level), \
4275 + tmp = carry_node_prev(node) ; !carry_node_end(level, node) ; \
4276 + node = tmp, tmp = carry_node_prev(node))
4277 +
4278 +/* __FS_REISER4_CARRY_H__ */
4279 +#endif
4280 +
4281 +/* Make Linus happy.
4282 + Local variables:
4283 + c-indentation-style: "K&R"
4284 + mode-name: "LC"
4285 + c-basic-offset: 8
4286 + tab-width: 8
4287 + fill-column: 120
4288 + scroll-step: 1
4289 + End:
4290 +*/
4291 diff -urN linux-2.6.33.orig/fs/reiser4/carry_ops.c linux-2.6.33/fs/reiser4/carry_ops.c
4292 --- linux-2.6.33.orig/fs/reiser4/carry_ops.c 1970-01-01 01:00:00.000000000 +0100
4293 +++ linux-2.6.33/fs/reiser4/carry_ops.c 2010-03-04 19:33:22.000000000 +0100
4294 @@ -0,0 +1,2132 @@
4295 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
4296 + reiser4/README */
4297 +
4298 +/* implementation of carry operations */
4299 +
4300 +#include "forward.h"
4301 +#include "debug.h"
4302 +#include "key.h"
4303 +#include "coord.h"
4304 +#include "plugin/item/item.h"
4305 +#include "plugin/node/node.h"
4306 +#include "jnode.h"
4307 +#include "znode.h"
4308 +#include "block_alloc.h"
4309 +#include "tree_walk.h"
4310 +#include "pool.h"
4311 +#include "tree_mod.h"
4312 +#include "carry.h"
4313 +#include "carry_ops.h"
4314 +#include "tree.h"
4315 +#include "super.h"
4316 +#include "reiser4.h"
4317 +
4318 +#include <linux/types.h>
4319 +#include <linux/err.h>
4320 +
4321 +static int carry_shift_data(sideof side, coord_t *insert_coord, znode * node,
4322 + carry_level * doing, carry_level * todo,
4323 + unsigned int including_insert_coord_p);
4324 +
4325 +extern int lock_carry_node(carry_level * level, carry_node * node);
4326 +extern int lock_carry_node_tail(carry_node * node);
4327 +
4328 +/* find left neighbor of a carry node
4329 +
4330 + Look for left neighbor of @node and add it to the @doing queue. See
4331 + comments in the body.
4332 +
4333 +*/
4334 +static carry_node *find_left_neighbor(carry_op * op /* node to find left
4335 + * neighbor of */ ,
4336 + carry_level * doing/* level to scan */)
4337 +{
4338 + int result;
4339 + carry_node *node;
4340 + carry_node *left;
4341 + int flags;
4342 + reiser4_tree *tree;
4343 +
4344 + node = op->node;
4345 +
4346 + tree = current_tree;
4347 + read_lock_tree(tree);
4348 + /* first, check whether left neighbor is already in a @doing queue */
4349 + if (reiser4_carry_real(node)->left != NULL) {
4350 + /* NOTE: there is locking subtlety here. Look into
4351 + * find_right_neighbor() for more info */
4352 + if (find_carry_node(doing,
4353 + reiser4_carry_real(node)->left) != NULL) {
4354 + read_unlock_tree(tree);
4355 + left = node;
4356 + do {
4357 + left = list_entry(left->header.level_linkage.prev,
4358 + carry_node, header.level_linkage);
4359 + assert("nikita-3408", !carry_node_end(doing,
4360 + left));
4361 + } while (reiser4_carry_real(left) ==
4362 + reiser4_carry_real(node));
4363 + return left;
4364 + }
4365 + }
4366 + read_unlock_tree(tree);
4367 +
4368 + left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4369 + if (IS_ERR(left))
4370 + return left;
4371 +
4372 + left->node = node->node;
4373 + left->free = 1;
4374 +
4375 + flags = GN_TRY_LOCK;
4376 + if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4377 + flags |= GN_NO_ALLOC;
4378 +
4379 + /* then, feeling lucky, peek left neighbor in the cache. */
4380 + result = reiser4_get_left_neighbor(&left->lock_handle,
4381 + reiser4_carry_real(node),
4382 + ZNODE_WRITE_LOCK, flags);
4383 + if (result == 0) {
4384 + /* ok, node found and locked. */
4385 + result = lock_carry_node_tail(left);
4386 + if (result != 0)
4387 + left = ERR_PTR(result);
4388 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4389 + /* node is leftmost node in a tree, or neighbor wasn't in
4390 + cache, or there is an extent on the left. */
4391 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4392 + left = NULL;
4393 + } else if (doing->restartable) {
4394 + /* if left neighbor is locked, and level is restartable, add
4395 + new node to @doing and restart. */
4396 + assert("nikita-913", node->parent != 0);
4397 + assert("nikita-914", node->node != NULL);
4398 + left->left = 1;
4399 + left->free = 0;
4400 + left = ERR_PTR(-E_REPEAT);
4401 + } else {
4402 + /* left neighbor is locked, level cannot be restarted. Just
4403 + ignore left neighbor. */
4404 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4405 + left = NULL;
4406 + }
4407 + return left;
4408 +}
4409 +
4410 +/* find right neighbor of a carry node
4411 +
4412 + Look for right neighbor of @node and add it to the @doing queue. See
4413 + comments in the body.
4414 +
4415 +*/
4416 +static carry_node *find_right_neighbor(carry_op * op /* node to find right
4417 + * neighbor of */ ,
4418 + carry_level * doing/* level to scan */)
4419 +{
4420 + int result;
4421 + carry_node *node;
4422 + carry_node *right;
4423 + lock_handle lh;
4424 + int flags;
4425 + reiser4_tree *tree;
4426 +
4427 + init_lh(&lh);
4428 +
4429 + node = op->node;
4430 +
4431 + tree = current_tree;
4432 + read_lock_tree(tree);
4433 + /* first, check whether right neighbor is already in a @doing queue */
4434 + if (reiser4_carry_real(node)->right != NULL) {
4435 + /*
4436 + * Tree lock is taken here anyway, because, even if _outcome_
4437 + * of (find_carry_node() != NULL) doesn't depend on
4438 + * concurrent updates to ->right, find_carry_node() cannot
4439 + * work with second argument NULL. Hence, following comment is
4440 + * of historic importance only.
4441 + *
4442 + * Subtle:
4443 + *
4444 + * Q: why don't we need tree lock here, looking for the right
4445 + * neighbor?
4446 + *
4447 + * A: even if value of node->real_node->right were changed
4448 + * during find_carry_node() execution, outcome of execution
4449 + * wouldn't change, because (in short) other thread cannot add
4450 + * elements to the @doing, and if node->real_node->right
4451 + * already was in @doing, value of node->real_node->right
4452 + * couldn't change, because node cannot be inserted between
4453 + * locked neighbors.
4454 + */
4455 + if (find_carry_node(doing,
4456 + reiser4_carry_real(node)->right) != NULL) {
4457 + read_unlock_tree(tree);
4458 + /*
4459 + * What we are doing here (this is also applicable to
4460 + * the find_left_neighbor()).
4461 + *
4462 + * tree_walk.c code requires that insertion of a
4463 + * pointer to a child, modification of parent pointer
4464 + * in the child, and insertion of the child into
4465 + * sibling list are atomic (see
4466 + * plugin/item/internal.c:create_hook_internal()).
4467 + *
4468 + * carry allocates new node long before pointer to it
4469 + * is inserted into parent and, actually, long before
4470 + * parent is even known. Such allocated-but-orphaned
4471 + * nodes are only trackable through carry level lists.
4472 + *
4473 + * Situation that is handled here is following: @node
4474 + * has valid ->right pointer, but there is
4475 + * allocated-but-orphaned node in the carry queue that
4476 + * is logically between @node and @node->right. Here
4477 + * we are searching for it. Critical point is that
4478 + * this is only possible if @node->right is also in
4479 + * the carry queue (this is checked above), because
4480 + * this is the only way new orphaned node could be
4481 + * inserted between them (before inserting new node,
4482 + * make_space() first tries to shift to the right, so,
4483 + * right neighbor will be locked and queued).
4484 + *
4485 + */
4486 + right = node;
4487 + do {
4488 + right = list_entry(right->header.level_linkage.next,
4489 + carry_node, header.level_linkage);
4490 + assert("nikita-3408", !carry_node_end(doing,
4491 + right));
4492 + } while (reiser4_carry_real(right) ==
4493 + reiser4_carry_real(node));
4494 + return right;
4495 + }
4496 + }
4497 + read_unlock_tree(tree);
4498 +
4499 + flags = GN_CAN_USE_UPPER_LEVELS;
4500 + if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4501 + flags |= GN_NO_ALLOC;
4502 +
4503 + /* then, try to lock right neighbor */
4504 + init_lh(&lh);
4505 + result = reiser4_get_right_neighbor(&lh,
4506 + reiser4_carry_real(node),
4507 + ZNODE_WRITE_LOCK, flags);
4508 + if (result == 0) {
4509 + /* ok, node found and locked. */
4510 + right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4511 + if (!IS_ERR(right)) {
4512 + right->node = lh.node;
4513 + move_lh(&right->lock_handle, &lh);
4514 + right->free = 1;
4515 + result = lock_carry_node_tail(right);
4516 + if (result != 0)
4517 + right = ERR_PTR(result);
4518 + }
4519 + } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4520 + /* node is rightmost node in a tree, or neighbor wasn't in
4521 + cache, or there is an extent on the right. */
4522 + right = NULL;
4523 + } else
4524 + right = ERR_PTR(result);
4525 + done_lh(&lh);
4526 + return right;
4527 +}
4528 +
4529 +/* how much free space in a @node is needed for @op
4530 +
4531 + How much space in @node is required for completion of @op, where @op is
4532 + insert or paste operation.
4533 +*/
4534 +static unsigned int space_needed_for_op(znode * node /* znode data are
4535 + * inserted or
4536 + * pasted in */ ,
4537 + carry_op * op /* carry
4538 + operation */ )
4539 +{
4540 + assert("nikita-919", op != NULL);
4541 +
4542 + switch (op->op) {
4543 + default:
4544 + impossible("nikita-1701", "Wrong opcode");
4545 + case COP_INSERT:
4546 + return space_needed(node, NULL, op->u.insert.d->data, 1);
4547 + case COP_PASTE:
4548 + return space_needed(node, op->u.insert.d->coord,
4549 + op->u.insert.d->data, 0);
4550 + }
4551 +}
4552 +
4553 +/* how much space in @node is required to insert or paste @data at
4554 + @coord. */
4555 +unsigned int space_needed(const znode * node /* node data are inserted or
4556 + * pasted in */ ,
4557 + const coord_t *coord /* coord where data are
4558 + * inserted or pasted
4559 + * at */ ,
4560 + const reiser4_item_data * data /* data to insert or
4561 + * paste */ ,
4562 + int insertion/* non-0 is inserting, 0---paste */)
4563 +{
4564 + int result;
4565 + item_plugin *iplug;
4566 +
4567 + assert("nikita-917", node != NULL);
4568 + assert("nikita-918", node_plugin_by_node(node) != NULL);
4569 + assert("vs-230", !insertion || (coord == NULL));
4570 +
4571 + result = 0;
4572 + iplug = data->iplug;
4573 + if (iplug->b.estimate != NULL) {
4574 + /* ask item plugin how much space is needed to insert this
4575 + item */
4576 + result += iplug->b.estimate(insertion ? NULL : coord, data);
4577 + } else {
4578 + /* reasonable default */
4579 + result += data->length;
4580 + }
4581 + if (insertion) {
4582 + node_plugin *nplug;
4583 +
4584 + nplug = node->nplug;
4585 + /* and add node overhead */
4586 + if (nplug->item_overhead != NULL)
4587 + result += nplug->item_overhead(node, NULL);
4588 + }
4589 + return result;
4590 +}
4591 +
4592 +/* find &coord in parent where pointer to new child is to be stored. */
4593 +static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4594 + * insert pointer to new
4595 + * child */ )
4596 +{
4597 + int result;
4598 + znode *node;
4599 + znode *child;
4600 +
4601 + assert("nikita-941", op != NULL);
4602 + assert("nikita-942", op->op == COP_INSERT);
4603 +
4604 + node = reiser4_carry_real(op->node);
4605 + assert("nikita-943", node != NULL);
4606 + assert("nikita-944", node_plugin_by_node(node) != NULL);
4607 +
4608 + child = reiser4_carry_real(op->u.insert.child);
4609 + result =
4610 + find_new_child_ptr(node, child, op->u.insert.brother,
4611 + op->u.insert.d->coord);
4612 +
4613 + build_child_ptr_data(child, op->u.insert.d->data);
4614 + return result;
4615 +}
4616 +
4617 +/* additional amount of free space in @node required to complete @op */
4618 +static int free_space_shortage(znode * node /* node to check */ ,
4619 + carry_op * op/* operation being performed */)
4620 +{
4621 + assert("nikita-1061", node != NULL);
4622 + assert("nikita-1062", op != NULL);
4623 +
4624 + switch (op->op) {
4625 + default:
4626 + impossible("nikita-1702", "Wrong opcode");
4627 + case COP_INSERT:
4628 + case COP_PASTE:
4629 + return space_needed_for_op(node, op) - znode_free_space(node);
4630 + case COP_EXTENT:
4631 + /* when inserting extent shift data around until insertion
4632 + point is utmost in the node. */
4633 + if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4634 + return +1;
4635 + else
4636 + return -1;
4637 + }
4638 +}
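+
+/* Worked example with illustrative numbers: if an insert is estimated at
+   40 bytes, the node format adds 4 bytes of item overhead, and
+   znode_free_space() reports 20 bytes free, the shortage is
+   40 + 4 - 20 = 24 > 0, so make_space() below must shift items to the
+   neighbors or allocate a new node before the insert can proceed. */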
4639 +
4640 +/* helper function: update node pointer in operation after insertion
4641 + point was probably shifted into @target. */
4642 +static znode *sync_op(carry_op * op, carry_node * target)
4643 +{
4644 + znode *insertion_node;
4645 +
4646 + /* reget node from coord: shift might move insertion coord to
4647 + the neighbor */
4648 + insertion_node = op->u.insert.d->coord->node;
4649 + /* if insertion point was actually moved into new node,
4650 + update carry node pointer in operation. */
4651 + if (insertion_node != reiser4_carry_real(op->node)) {
4652 + op->node = target;
4653 + assert("nikita-2540",
4654 + reiser4_carry_real(target) == insertion_node);
4655 + }
4656 + assert("nikita-2541",
4657 + reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4658 + return insertion_node;
4659 +}
4660 +
4661 +/*
4662 + * complete make_space() call: update tracked lock handle if necessary. See
4663 + * comments for fs/reiser4/carry.h:carry_track_type
4664 + */
4665 +static int
4666 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4667 +{
4668 + int result;
4669 + carry_track_type tracking;
4670 + znode *node;
4671 +
4672 + tracking = doing->track_type;
4673 + node = op->u.insert.d->coord->node;
4674 +
4675 + if (tracking == CARRY_TRACK_NODE ||
4676 + (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4677 + /* inserting or pasting into node different from
4678 + original. Update lock handle supplied by caller. */
4679 + assert("nikita-1417", doing->tracked != NULL);
4680 + done_lh(doing->tracked);
4681 + init_lh(doing->tracked);
4682 + result = longterm_lock_znode(doing->tracked, node,
4683 + ZNODE_WRITE_LOCK,
4684 + ZNODE_LOCK_HIPRI);
4685 + } else
4686 + result = 0;
4687 + return result;
4688 +}
4689 +
4690 +/* This is insertion policy function. It shifts data to the left and right
4691 + neighbors of insertion coord and allocates new nodes until there is enough
4692 + free space to complete @op.
4693 +
4694 + See comments in the body.
4695 +
4696 + Assumes that the node format favors insertions at the right end of the node
4697 + as node40 does.
4698 +
4699 + See carry_flow() on detail about flow insertion
4700 +*/
4701 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4702 + carry_level * doing /* current carry queue */ ,
4703 + carry_level * todo/* carry queue on the parent level */)
4704 +{
4705 + znode *node;
4706 + int result;
4707 + int not_enough_space;
4708 + int blk_alloc;
4709 + znode *orig_node;
4710 + __u32 flags;
4711 +
4712 + coord_t *coord;
4713 +
4714 + assert("nikita-890", op != NULL);
4715 + assert("nikita-891", todo != NULL);
4716 + assert("nikita-892",
4717 + op->op == COP_INSERT ||
4718 + op->op == COP_PASTE || op->op == COP_EXTENT);
4719 + assert("nikita-1607",
4720 + reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4721 +
4722 + flags = op->u.insert.flags;
4723 +
4724 + /* NOTE check that new node can only be allocated after checking left
4725 + * and right neighbors. This is necessary for proper work of
4726 + * find_{left,right}_neighbor(). */
4727 + assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4728 + flags & COPI_DONT_SHIFT_LEFT));
4729 + assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4730 + flags & COPI_DONT_SHIFT_RIGHT));
4731 +
4732 + coord = op->u.insert.d->coord;
4733 + orig_node = node = coord->node;
4734 +
4735 + assert("nikita-908", node != NULL);
4736 + assert("nikita-909", node_plugin_by_node(node) != NULL);
4737 +
4738 + result = 0;
4739 + /* If there is not enough space in a node, try to shift something to
4740 + the left neighbor. This is a bit tricky, as locking to the left is
4741 + low priority. This is handled by restart logic in carry().
4742 + */
4743 + not_enough_space = free_space_shortage(node, op);
4744 + if (not_enough_space <= 0)
4745 + /* it is possible that carry was called when there actually
4746 + was enough space in the node. For example, when inserting
4747 + leftmost item so that delimiting keys have to be updated.
4748 + */
4749 + return make_space_tail(op, doing, orig_node);
4750 + if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4751 + carry_node *left;
4752 + /* make note in statistics of an attempt to move
4753 + something into the left neighbor */
4754 + left = find_left_neighbor(op, doing);
4755 + if (unlikely(IS_ERR(left))) {
4756 + if (PTR_ERR(left) == -E_REPEAT)
4757 + return -E_REPEAT;
4758 + else {
4759 + /* some error other than restart request
4760 + occurred. This shouldn't happen. Issue a
4761 + warning and continue as if left neighbor
4762 + weren't existing.
4763 + */
4764 + warning("nikita-924",
4765 + "Error accessing left neighbor: %li",
4766 + PTR_ERR(left));
4767 + }
4768 + } else if (left != NULL) {
4769 +
4770 + /* shift everything possible on the left of and
4771 + including insertion coord into the left neighbor */
4772 + result = carry_shift_data(LEFT_SIDE, coord,
4773 + reiser4_carry_real(left),
4774 + doing, todo,
4775 + flags & COPI_GO_LEFT);
4776 +
4777 + /* reget node from coord: shift_left() might move
4778 + insertion coord to the left neighbor */
4779 + node = sync_op(op, left);
4780 +
4781 + not_enough_space = free_space_shortage(node, op);
4782 + /* There is not enough free space in @node, but
4783 + may be, there is enough free space in
4784 + @left. Various balancing decisions are valid here.
4785 + The same holds for shifting to the right.
4786 + */
4787 + }
4788 + }
4789 + /* If there still is not enough space, shift to the right */
4790 + if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4791 + carry_node *right;
4792 +
4793 + right = find_right_neighbor(op, doing);
4794 + if (IS_ERR(right)) {
4795 + warning("nikita-1065",
4796 + "Error accessing right neighbor: %li",
4797 + PTR_ERR(right));
4798 + } else if (right != NULL) {
4799 + /* node containing insertion point, and its right
4800 + neighbor node are write locked by now.
4801 +
4802 + shift everything possible on the right of but
4803 + excluding insertion coord into the right neighbor
4804 + */
4805 + result = carry_shift_data(RIGHT_SIDE, coord,
4806 + reiser4_carry_real(right),
4807 + doing, todo,
4808 + flags & COPI_GO_RIGHT);
4809 + /* reget node from coord: shift_right() might move
4810 + insertion coord to the right neighbor */
4811 + node = sync_op(op, right);
4812 + not_enough_space = free_space_shortage(node, op);
4813 + }
4814 + }
4815 + /* If there is still not enough space, allocate new node(s).
4816 +
4817 + We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4818 + the carry operation flags (currently this is needed during flush
4819 + only).
4820 + */
4821 + for (blk_alloc = 0;
4822 + not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4823 + !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4824 + carry_node *fresh; /* new node we are allocating */
4825 + coord_t coord_shadow; /* remembered insertion point before
4826 + * shifting data into new node */
4827 + carry_node *node_shadow; /* remembered insertion node
4828 + * before shifting */
4829 + unsigned int gointo; /* whether insertion point should move
4830 + * into newly allocated node */
4831 +
4832 + /* allocate new node on the right of @node. Znode and disk
4833 + fake block number for new node are allocated.
4834 +
4835 + add_new_znode() posts carry operation COP_INSERT with
4836 + COPT_CHILD option to the parent level to add
4837 + pointer to newly created node to its parent.
4838 +
4839 + Subtle point: if several new nodes are required to complete
4840 + insertion operation at this level, they will be inserted
4841 + into their parents in the order of creation, which means
4842 + that @node will be valid "cookie" at the time of insertion.
4843 +
4844 + */
4845 + fresh = add_new_znode(node, op->node, doing, todo);
4846 + if (IS_ERR(fresh))
4847 + return PTR_ERR(fresh);
4848 +
4849 + /* Try to shift into new node. */
4850 + result = lock_carry_node(doing, fresh);
4851 + zput(reiser4_carry_real(fresh));
4852 + if (result != 0) {
4853 + warning("nikita-947",
4854 + "Cannot lock new node: %i", result);
4855 + return result;
4856 + }
4857 +
4858 + /* both nodes are write locked by now.
4859 +
4860 + shift everything possible on the right of and
4861 + including insertion coord into the right neighbor.
4862 + */
4863 + coord_dup(&coord_shadow, op->u.insert.d->coord);
4864 + node_shadow = op->node;
4865 + /* move insertion point into newly created node if:
4866 +
4867 + . insertion point is rightmost in the source node, or
4868 + . this is not the first node we are allocating in a row.
4869 + */
4870 + gointo =
4871 + (blk_alloc > 0) ||
4872 + coord_is_after_rightmost(op->u.insert.d->coord);
4873 +
4874 + if (gointo &&
4875 + op->op == COP_PASTE &&
4876 + coord_is_existing_item(op->u.insert.d->coord) &&
4877 + is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
4878 + /* paste into a solid (atomic) item, which can contain
4879 + only one unit, so we need to shift it right, to where
4880 + the insertion point is supposed to be */
4881 +
4882 + assert("edward-1444", op->u.insert.d->data->iplug ==
4883 + item_plugin_by_id(STATIC_STAT_DATA_ID));
4884 + assert("edward-1445",
4885 + op->u.insert.d->data->length >
4886 + node_plugin_by_node(coord->node)->free_space
4887 + (coord->node));
4888 +
4889 + op->u.insert.d->coord->between = BEFORE_UNIT;
4890 + }
4891 +
4892 + result = carry_shift_data(RIGHT_SIDE, coord,
4893 + reiser4_carry_real(fresh),
4894 + doing, todo, gointo);
4895 + /* if insertion point was actually moved into new node,
4896 + update carry node pointer in operation. */
4897 + node = sync_op(op, fresh);
4898 + not_enough_space = free_space_shortage(node, op);
4899 + if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4900 + /* there is not enough free space in the new node. Shift
4901 + the insertion point back to @shadow_node so that the
4902 + next new node will be inserted between
4903 + @shadow_node and @fresh.
4904 + */
4905 + coord_normalize(&coord_shadow);
4906 + coord_dup(coord, &coord_shadow);
4907 + node = coord->node;
4908 + op->node = node_shadow;
4909 + if (1 || (flags & COPI_STEP_BACK)) {
4910 + /* still not enough space?! Maybe there is
4911 + enough space in the source node (i.e., the node
4912 + the data were moved from) now.
4913 + */
4914 + not_enough_space =
4915 + free_space_shortage(node, op);
4916 + }
4917 + }
4918 + }
4919 + if (not_enough_space > 0) {
4920 + if (!(flags & COPI_DONT_ALLOCATE))
4921 + warning("nikita-948", "Cannot insert new item");
4922 + result = -E_NODE_FULL;
4923 + }
4924 + assert("nikita-1622", ergo(result == 0,
4925 + reiser4_carry_real(op->node) == coord->node));
4926 + assert("nikita-2616", coord == op->u.insert.d->coord);
4927 + if (result == 0)
4928 + result = make_space_tail(op, doing, orig_node);
4929 + return result;
4930 +}
4931 +
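In outline, make_space() above escalates through three strategies, re-measuring
the shortage after each step: shift data into the left neighbor, then into the
right neighbor, then allocate up to two fresh nodes. The stand-alone sketch
below mirrors just that control flow; the types, stubs and the 30-byte gains
are toy stand-ins for the real carry machinery, not part of the patch.

#include <stdio.h>

enum { DONT_SHIFT_LEFT = 1, DONT_SHIFT_RIGHT = 2, DONT_ALLOCATE = 4 };

static int shortage = 100;              /* bytes still missing */

static void shift_left(void)  { shortage -= 30; }  /* stub */
static void shift_right(void) { shortage -= 30; }  /* stub */
static void alloc_node(void)  { shortage -= 30; }  /* stub */

/* mirrors the escalation order of make_space(): left shift, right
   shift, then up to two freshly allocated nodes */
static int make_space_sketch(int flags)
{
	int blk_alloc;

	if (shortage > 0 && !(flags & DONT_SHIFT_LEFT))
		shift_left();
	if (shortage > 0 && !(flags & DONT_SHIFT_RIGHT))
		shift_right();
	for (blk_alloc = 0;
	     shortage > 0 && blk_alloc < 2 && !(flags & DONT_ALLOCATE);
	     ++blk_alloc)
		alloc_node();
	return shortage > 0 ? -1 : 0;   /* -1 plays the role of -E_NODE_FULL */
}

int main(void)
{
	printf("result %d, shortage left %d\n", make_space_sketch(0), shortage);
	return 0;
}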
4932 +/* insert_paste_common() - common part of insert and paste operations
4933 +
4934 + This function performs common part of COP_INSERT and COP_PASTE.
4935 +
4936 + There are three ways in which insertion/paste can be requested:
4937 +
4938 + . by directly supplying reiser4_item_data. In this case, op ->
4939 + u.insert.type is set to COPT_ITEM_DATA.
4940 +
4941 + . by supplying a pointer to the child that is to be inserted into the
4942 + parent. In this case op -> u.insert.type == COPT_CHILD.
4943 +
4944 + . by supplying the key of the new item/unit. This is currently only used
4945 + during extent insertion.
4946 +
4947 + The child-pointer form is required because when a new node is allocated
4948 + we don't know at what position the pointer to it is to be stored in the
4949 + parent. Actually, we don't even know what its parent will be: the parent
4950 + can be re-balanced concurrently and the new node re-parented, or the
4951 + parent can be full and the pointer to the new node go into some other node.
4952 +
4953 + insert_paste_common() resolves the pointer to the child node into a
4954 + position in the parent by calling find_new_child_coord(), which fills
4955 + reiser4_item_data. After this, insertion/paste proceeds uniformly.
4956 +
4957 + Another complication is finding free space during pasting. It may
4958 + happen that, while shifting items to the neighbors and newly allocated
4959 + nodes, the insertion coord is no longer in the item we wanted to paste
4960 + into. At this point, the paste becomes (morphs into) an insert. Moreover,
4961 + the free space analysis has to be repeated, because the amount of space
4962 + required for insertion differs from that for paste (item header overhead, etc.).
4963 +
4964 + This function "unifies" different insertion modes (by resolving child
4965 + pointer or key into insertion coord), and then calls make_space() to free
4966 + enough space in the node by shifting data to the left and right and by
4967 + allocating new nodes if necessary. Carry operation knows amount of space
4968 + required for its completion. After enough free space is obtained, caller of
4969 + this function (carry_{insert,paste,etc.}) performs actual insertion/paste
4970 + by calling item plugin method.
4971 +
4972 +*/
4973 +static int insert_paste_common(carry_op * op /* carry operation being
4974 + * performed */ ,
4975 + carry_level * doing /* current carry level */ ,
4976 + carry_level * todo /* next carry level */ ,
4977 + carry_insert_data * cdata /* pointer to
4978 + * cdata */ ,
4979 + coord_t *coord /* insertion/paste coord */ ,
4980 + reiser4_item_data * data /* data to be
4981 + * inserted/pasted */ )
4982 +{
4983 + assert("nikita-981", op != NULL);
4984 + assert("nikita-980", todo != NULL);
4985 + assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
4986 + || (op->op == COP_EXTENT));
4987 +
4988 + if (op->u.insert.type == COPT_PASTE_RESTARTED) {
4989 + /* nothing to do. Fall through to make_space(). */
4990 + ;
4991 + } else if (op->u.insert.type == COPT_KEY) {
4992 + node_search_result intra_node;
4993 + znode *node;
4994 + /* The problem with doing batching at the lowest level is that
4995 + operations here are given by coords where modification is
4996 + to be performed, and one modification can invalidate coords
4997 + of all following operations.
4998 +
4999 + So we implement yet another operation type that uses the
5000 + only "locator" stable across shifting of data between
5001 + nodes, etc.: a key (COPT_KEY).
5002 +
5003 + This clause resolves the key to a coord in the node.
5004 +
5005 + But the node can change too. Probably some pieces have to be
5006 + added to lock_carry_node() to lock a node by its key.
5007 +
5008 + */
5009 + /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
5010 + if you need something else. */
5011 + op->u.insert.d->coord = coord;
5012 + node = reiser4_carry_real(op->node);
5013 + intra_node = node_plugin_by_node(node)->lookup
5014 + (node, op->u.insert.d->key, FIND_EXACT,
5015 + op->u.insert.d->coord);
5016 + if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
5017 + warning("nikita-1715", "Intra node lookup failure: %i",
5018 + intra_node);
5019 + return intra_node;
5020 + }
5021 + } else if (op->u.insert.type == COPT_CHILD) {
5022 + /* if we are asked to insert pointer to the child into
5023 + internal node, first convert pointer to the child into
5024 + coord within parent node.
5025 + */
5026 + znode *child;
5027 + int result;
5028 +
5029 + op->u.insert.d = cdata;
5030 + op->u.insert.d->coord = coord;
5031 + op->u.insert.d->data = data;
5032 + op->u.insert.d->coord->node = reiser4_carry_real(op->node);
5033 + result = find_new_child_coord(op);
5034 + child = reiser4_carry_real(op->u.insert.child);
5035 + if (result != NS_NOT_FOUND) {
5036 + warning("nikita-993",
5037 + "Cannot find a place for child pointer: %i",
5038 + result);
5039 + return result;
5040 + }
5041 + /* This only happens when we did multiple insertions at
5042 + the previous level while trying to insert a single item,
5043 + and it so happened that inserting pointers to all new
5044 + nodes before this one already caused the parent node to
5045 + split (maybe several times).
5046 +
5047 + I am going to come up with a better solution.
5048 +
5049 + You are not expected to understand this.
5050 + -- v6root/usr/sys/ken/slp.c
5051 +
5052 + Basically, what happens here is the following: carry came
5053 + to the parent level and is about to insert internal item
5054 + pointing to the child node that it just inserted in the
5055 + level below. Position where internal item is to be inserted
5056 + was found by find_new_child_coord() above, but node of the
5057 + current carry operation (that is, parent node of child
5058 + inserted on the previous level), was determined earlier in
5059 + the lock_carry_level/lock_carry_node. It could so happen
5060 + that other carry operations performed on the parent level
5061 + have already split the parent node, so that the insertion
5062 + point moved into another node. Handle this by creating a new
5063 + carry node for the insertion point if necessary.
5064 + */
5065 + if (reiser4_carry_real(op->node) !=
5066 + op->u.insert.d->coord->node) {
5067 + pool_ordering direction;
5068 + znode *z1;
5069 + znode *z2;
5070 + reiser4_key k1;
5071 + reiser4_key k2;
5072 +
5073 + /*
5074 + * determine in what direction insertion point
5075 + * moved. Do this by comparing delimiting keys.
5076 + */
5077 + z1 = op->u.insert.d->coord->node;
5078 + z2 = reiser4_carry_real(op->node);
5079 + if (keyle(leftmost_key_in_node(z1, &k1),
5080 + leftmost_key_in_node(z2, &k2)))
5081 + /* insertion point moved to the left */
5082 + direction = POOLO_BEFORE;
5083 + else
5084 + /* insertion point moved to the right */
5085 + direction = POOLO_AFTER;
5086 +
5087 + op->node = reiser4_add_carry_skip(doing,
5088 + direction, op->node);
5089 + if (IS_ERR(op->node))
5090 + return PTR_ERR(op->node);
5091 + op->node->node = op->u.insert.d->coord->node;
5092 + op->node->free = 1;
5093 + result = lock_carry_node(doing, op->node);
5094 + if (result != 0)
5095 + return result;
5096 + }
5097 +
5098 + /*
5099 + * set up the key of the item being inserted: we are inserting
5100 + * an internal item, and its key is (by the very definition of
5101 + * a search tree) the leftmost key in the child node.
5102 + */
5103 + write_lock_dk(znode_get_tree(child));
5104 + op->u.insert.d->key = leftmost_key_in_node(child,
5105 + znode_get_ld_key(child));
5106 + write_unlock_dk(znode_get_tree(child));
5107 + op->u.insert.d->data->arg = op->u.insert.brother;
5108 + } else {
5109 + assert("vs-243", op->u.insert.d->coord != NULL);
5110 + op->u.insert.d->coord->node = reiser4_carry_real(op->node);
5111 + }
5112 +
5113 + /* find free space. */
5114 + return make_space(op, doing, todo);
5115 +}
5116 +
5117 +/* handle carry COP_INSERT operation.
5118 +
5119 + Insert new item into node. New item can be given in one of two ways:
5120 +
5121 + - by passing &tree_coord and &reiser4_item_data as part of @op. This is
5122 + only applicable at the leaf/twig level.
5123 +
5124 + - by passing a child node pointer to which is to be inserted by this
5125 + operation.
5126 +
5127 +*/
5128 +static int carry_insert(carry_op * op /* operation to perform */ ,
5129 + carry_level * doing /* queue of operations @op
5130 + * is part of */ ,
5131 + carry_level * todo /* queue where new operations
5132 + * are accumulated */ )
5133 +{
5134 + znode *node;
5135 + carry_insert_data cdata;
5136 + coord_t coord;
5137 + reiser4_item_data data;
5138 + carry_plugin_info info;
5139 + int result;
5140 +
5141 + assert("nikita-1036", op != NULL);
5142 + assert("nikita-1037", todo != NULL);
5143 + assert("nikita-1038", op->op == COP_INSERT);
5144 +
5145 + coord_init_zero(&coord);
5146 +
5147 + /* perform common functionality of insert and paste. */
5148 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5149 + if (result != 0)
5150 + return result;
5151 +
5152 + node = op->u.insert.d->coord->node;
5153 + assert("nikita-1039", node != NULL);
5154 + assert("nikita-1040", node_plugin_by_node(node) != NULL);
5155 +
5156 + assert("nikita-949",
5157 + space_needed_for_op(node, op) <= znode_free_space(node));
5158 +
5159 + /* ask node layout to create new item. */
5160 + info.doing = doing;
5161 + info.todo = todo;
5162 + result = node_plugin_by_node(node)->create_item
5163 + (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
5164 + &info);
5165 + doing->restartable = 0;
5166 + znode_make_dirty(node);
5167 +
5168 + return result;
5169 +}
5170 +
5171 +/*
5172 + * Flow insertion code. COP_INSERT_FLOW is special tree operation that is
5173 + * supplied with a "flow" (that is, a stream of data) and inserts it into tree
5174 + * by slicing into multiple items.
5175 + */
5176 +
5177 +#define flow_insert_point(op) ((op)->u.insert_flow.insert_point)
5178 +#define flow_insert_flow(op) ((op)->u.insert_flow.flow)
5179 +#define flow_insert_data(op) ((op)->u.insert_flow.data)
5180 +
5181 +static size_t item_data_overhead(carry_op * op)
5182 +{
5183 + if (flow_insert_data(op)->iplug->b.estimate == NULL)
5184 + return 0;
5185 + return (flow_insert_data(op)->iplug->b.
5186 + estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
5187 + flow_insert_data(op)->length);
5188 +}
5189 +
5190 +/* FIXME-VS: this is called several times during one make_flow_for_insertion
5191 + and it will always return the same result. Some optimization could be made
5192 + by calculating this value once at the beginning and passing it around. That
5193 + would reduce some flexibility in future changes.
5194 +*/
5195 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5196 +static size_t flow_insertion_overhead(carry_op * op)
5197 +{
5198 + znode *node;
5199 + size_t insertion_overhead;
5200 +
5201 + node = flow_insert_point(op)->node;
5202 + insertion_overhead = 0;
5203 + if (node->nplug->item_overhead &&
5204 + !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5205 + flow_insert_data(op)))
5206 + insertion_overhead =
5207 + node->nplug->item_overhead(node, NULL) +
5208 + item_data_overhead(op);
5209 + return insertion_overhead;
5210 +}
5211 +
5212 +/* how many bytes of the flow fit into the node */
5213 +static int what_can_fit_into_node(carry_op * op)
5214 +{
5215 + size_t free, overhead;
5216 +
5217 + overhead = flow_insertion_overhead(op);
5218 + free = znode_free_space(flow_insert_point(op)->node);
5219 + if (free <= overhead)
5220 + return 0;
5221 + free -= overhead;
5222 + /* FIXME: flow->length is loff_t only to avoid overflow in case of an
5223 + expanding truncate */
5224 + if (free < op->u.insert_flow.flow->length)
5225 + return free;
5226 + return (int)op->u.insert_flow.flow->length;
5227 +}
5228 +
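The clamping above reduces to: usable = free_space - overhead when that is
positive, capped by the remaining flow length. A tiny stand-alone illustration
with made-up numbers (the real values come from the znode accounting):

#include <stdio.h>

/* usable bytes: free space minus per-item overhead, capped by the
   remaining flow length, as in what_can_fit_into_node() */
static int fit(int free_space, int overhead, long flow_len)
{
	if (free_space <= overhead)
		return 0;
	free_space -= overhead;
	return free_space < flow_len ? free_space : (int)flow_len;
}

int main(void)
{
	printf("%d\n", fit(4096, 24, 100000)); /* 4072: node-bound */
	printf("%d\n", fit(4096, 24, 1000));   /* 1000: flow-bound */
	printf("%d\n", fit(16, 24, 1000));     /* 0: overhead eats it all */
	return 0;
}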
5229 +/* in make_space_for_flow_insertion we need to check whether the whole flow
5230 + fits into a node, or whether a minimal fraction of the flow does */
5231 +static int enough_space_for_whole_flow(carry_op * op)
5232 +{
5233 + return (unsigned)what_can_fit_into_node(op) ==
5234 + op->u.insert_flow.flow->length;
5235 +}
5236 +
5237 +#define MIN_FLOW_FRACTION 1
5238 +static int enough_space_for_min_flow_fraction(carry_op * op)
5239 +{
5240 + assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5241 +
5242 + return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5243 +}
5244 +
5245 +/* this returns 0 if the left neighbor was obtained successfully, everything
5246 + up to and including the insertion point was shifted into it, and the left
5247 + neighbor still has enough free space for a minimal fraction of the flow */
5248 +static int
5249 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5250 +{
5251 + carry_node *left;
5252 + znode *orig;
5253 +
5254 + left = find_left_neighbor(op, doing);
5255 + if (unlikely(IS_ERR(left))) {
5256 + warning("vs-899",
5257 + "make_space_by_shift_left: "
5258 + "error accessing left neighbor: %li", PTR_ERR(left));
5259 + return 1;
5260 + }
5261 + if (left == NULL)
5262 + /* left neighbor either does not exist or is unformatted
5263 + node */
5264 + return 1;
5265 +
5266 + orig = flow_insert_point(op)->node;
5267 + /* try to shift the content of node @orig from its head up to and
5268 + including the insertion point into the left neighbor */
5269 + carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5270 + reiser4_carry_real(left), doing, todo,
5271 + 1/* including insert point */);
5272 + if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5273 + /* insertion point did not move */
5274 + return 1;
5275 + }
5276 +
5277 + /* insertion point is set after last item in the node */
5278 + assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5279 +
5280 + if (!enough_space_for_min_flow_fraction(op)) {
5281 + /* the insertion point node does not have enough free space to
5282 + hold even a minimal portion of the flow; therefore, move the
5283 + insertion point back to the orig node (before the first item) */
5284 + coord_init_before_first_item(flow_insert_point(op), orig);
5285 + return 1;
5286 + }
5287 +
5288 + /* part of flow is to be written to the end of node */
5289 + op->node = left;
5290 + return 0;
5291 +}
5292 +
5293 +/* this returns 0 if the right neighbor was obtained successfully, everything
5294 + to the right of the insertion point was shifted into it, and the node got
5295 + enough free space for a minimal fraction of the flow */
5296 +static int
5297 +make_space_by_shift_right(carry_op * op, carry_level * doing,
5298 + carry_level * todo)
5299 +{
5300 + carry_node *right;
5301 +
5302 + right = find_right_neighbor(op, doing);
5303 + if (unlikely(IS_ERR(right))) {
5304 + warning("nikita-1065", "shift_right_excluding_insert_point: "
5305 + "error accessing right neighbor: %li", PTR_ERR(right));
5306 + return 1;
5307 + }
5308 + if (right) {
5309 + /* shift everything possible on the right of but excluding
5310 + insertion coord into the right neighbor */
5311 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5312 + reiser4_carry_real(right), doing, todo,
5313 + 0/* not including insert point */);
5314 + } else {
5315 + /* right neighbor either does not exist or is unformatted
5316 + node */
5317 + ;
5318 + }
5319 + if (coord_is_after_rightmost(flow_insert_point(op))) {
5320 + if (enough_space_for_min_flow_fraction(op)) {
5321 + /* part of flow is to be written to the end of node */
5322 + return 0;
5323 + }
5324 + }
5325 +
5326 + /* a new node is to be added if the insert point node did not get
5327 + enough space for the whole flow */
5328 + return 1;
5329 +}
5330 +
5331 +/* this returns 0 when the insert coord is set at the node end and a fraction
5332 + of the flow fits into that node */
5333 +static int
5334 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5335 +{
5336 + int result;
5337 + znode *node;
5338 + carry_node *new;
5339 +
5340 + node = flow_insert_point(op)->node;
5341 +
5342 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5343 + return RETERR(-E_NODE_FULL);
5344 + /* add new node after insert point node */
5345 + new = add_new_znode(node, op->node, doing, todo);
5346 + if (unlikely(IS_ERR(new)))
5347 + return PTR_ERR(new);
5348 + result = lock_carry_node(doing, new);
5349 + zput(reiser4_carry_real(new));
5350 + if (unlikely(result))
5351 + return result;
5352 + op->u.insert_flow.new_nodes++;
5353 + if (!coord_is_after_rightmost(flow_insert_point(op))) {
5354 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5355 + reiser4_carry_real(new), doing, todo,
5356 + 0/* not including insert point */);
5357 + assert("vs-901",
5358 + coord_is_after_rightmost(flow_insert_point(op)));
5359 +
5360 + if (enough_space_for_min_flow_fraction(op))
5361 + return 0;
5362 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5363 + return RETERR(-E_NODE_FULL);
5364 +
5365 + /* add one more new node */
5366 + new = add_new_znode(node, op->node, doing, todo);
5367 + if (unlikely(IS_ERR(new)))
5368 + return PTR_ERR(new);
5369 + result = lock_carry_node(doing, new);
5370 + zput(reiser4_carry_real(new));
5371 + if (unlikely(result))
5372 + return result;
5373 + op->u.insert_flow.new_nodes++;
5374 + }
5375 +
5376 + /* move insertion point to new node */
5377 + coord_init_before_first_item(flow_insert_point(op),
5378 + reiser4_carry_real(new));
5379 + op->node = new;
5380 + return 0;
5381 +}
5382 +
5383 +static int
5384 +make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5385 + carry_level * todo)
5386 +{
5387 + __u32 flags = op->u.insert_flow.flags;
5388 +
5389 + if (enough_space_for_whole_flow(op)) {
5390 + /* whole flow fits into insert point node */
5391 + return 0;
5392 + }
5393 +
5394 + if (!(flags & COPI_DONT_SHIFT_LEFT)
5395 + && (make_space_by_shift_left(op, doing, todo) == 0)) {
5396 + /* the insert point is shifted to the left neighbor of the original
5397 + insert point node and is set after the last unit in that node. It
5398 + has enough space to fit at least a minimal fraction of the flow. */
5399 + return 0;
5400 + }
5401 +
5402 + if (enough_space_for_whole_flow(op)) {
5403 + /* whole flow fits into insert point node */
5404 + return 0;
5405 + }
5406 +
5407 + if (!(flags & COPI_DONT_SHIFT_RIGHT)
5408 + && (make_space_by_shift_right(op, doing, todo) == 0)) {
5409 + /* insert point is still set to the same node, but there is
5410 + nothing to the right of insert point. */
5411 + return 0;
5412 + }
5413 +
5414 + if (enough_space_for_whole_flow(op)) {
5415 + /* whole flow fits into insert point node */
5416 + return 0;
5417 + }
5418 +
5419 + return make_space_by_new_nodes(op, doing, todo);
5420 +}
5421 +
5422 +/* implements COP_INSERT_FLOW operation */
5423 +static int
5424 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5425 +{
5426 + int result;
5427 + flow_t *f;
5428 + coord_t *insert_point;
5429 + node_plugin *nplug;
5430 + carry_plugin_info info;
5431 + znode *orig_node;
5432 + lock_handle *orig_lh;
5433 +
5434 + f = op->u.insert_flow.flow;
5435 + result = 0;
5436 +
5437 + /* carry system needs this to work */
5438 + info.doing = doing;
5439 + info.todo = todo;
5440 +
5441 + orig_node = flow_insert_point(op)->node;
5442 + orig_lh = doing->tracked;
5443 +
5444 + while (f->length) {
5445 + result = make_space_for_flow_insertion(op, doing, todo);
5446 + if (result)
5447 + break;
5448 +
5449 + insert_point = flow_insert_point(op);
5450 + nplug = node_plugin_by_node(insert_point->node);
5451 +
5452 + /* compose item data for insertion/pasting */
5453 + flow_insert_data(op)->data = f->data;
5454 + flow_insert_data(op)->length = what_can_fit_into_node(op);
5455 +
5456 + if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5457 + /* the insert point is set to an item of the file we are
5458 + writing to, and we have to append to it */
5459 + assert("vs-903", insert_point->between == AFTER_UNIT);
5460 + nplug->change_item_size(insert_point,
5461 + flow_insert_data(op)->length);
5462 + flow_insert_data(op)->iplug->b.paste(insert_point,
5463 + flow_insert_data
5464 + (op), &info);
5465 + } else {
5466 + /* new item must be inserted */
5467 + pos_in_node_t new_pos;
5468 + flow_insert_data(op)->length += item_data_overhead(op);
5469 +
5470 + /* FIXME-VS: this is because node40_create_item changes
5471 + insert_point for obscure reasons */
5472 + switch (insert_point->between) {
5473 + case AFTER_ITEM:
5474 + new_pos = insert_point->item_pos + 1;
5475 + break;
5476 + case EMPTY_NODE:
5477 + new_pos = 0;
5478 + break;
5479 + case BEFORE_ITEM:
5480 + assert("vs-905", insert_point->item_pos == 0);
5481 + new_pos = 0;
5482 + break;
5483 + default:
5484 + impossible("vs-906",
5485 + "carry_insert_flow: invalid coord");
5486 + new_pos = 0;
5487 + break;
5488 + }
5489 +
5490 + nplug->create_item(insert_point, &f->key,
5491 + flow_insert_data(op), &info);
5492 + coord_set_item_pos(insert_point, new_pos);
5493 + }
5494 + coord_init_after_item_end(insert_point);
5495 + doing->restartable = 0;
5496 + znode_make_dirty(insert_point->node);
5497 +
5498 + move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5499 + }
5500 +
5501 + if (orig_node != flow_insert_point(op)->node) {
5502 + /* move lock to new insert point */
5503 + done_lh(orig_lh);
5504 + init_lh(orig_lh);
5505 + result =
5506 + longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5507 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5508 + }
5509 +
5510 + return result;
5511 +}
5512 +
5513 +/* implements COP_DELETE operation
5514 +
5515 + Remove the pointer to @op -> u.delete.child from its parent.
5516 +
5517 + This function also handles killing of the tree root if the last pointer
5518 + was removed from it. This is complicated by our handling of the "twig"
5519 + level: a root on the twig level is never killed.
5520 +
5521 +*/
5522 +static int carry_delete(carry_op * op /* operation to be performed */ ,
5523 + carry_level * doing UNUSED_ARG /* current carry
5524 + * level */ ,
5525 + carry_level * todo/* next carry level */)
5526 +{
5527 + int result;
5528 + coord_t coord;
5529 + coord_t coord2;
5530 + znode *parent;
5531 + znode *child;
5532 + carry_plugin_info info;
5533 + reiser4_tree *tree;
5534 +
5535 + /*
5536 + * This operation is called to delete internal item pointing to the
5537 + * child node that was removed by carry from the tree on the previous
5538 + * tree level.
5539 + */
5540 +
5541 + assert("nikita-893", op != NULL);
5542 + assert("nikita-894", todo != NULL);
5543 + assert("nikita-895", op->op == COP_DELETE);
5544 +
5545 + coord_init_zero(&coord);
5546 + coord_init_zero(&coord2);
5547 +
5548 + parent = reiser4_carry_real(op->node);
5549 + child = op->u.delete.child ?
5550 + reiser4_carry_real(op->u.delete.child) : op->node->node;
5551 + tree = znode_get_tree(child);
5552 + read_lock_tree(tree);
5553 +
5554 + /*
5555 + * @parent was determined when carry entered parent level
5556 + * (lock_carry_level/lock_carry_node). Since then, the actual parent of
5557 + * the @child node could have changed due to other carry operations
5558 + * performed on the parent level. Check for this.
5559 + */
5560 +
5561 + if (znode_parent(child) != parent) {
5562 + /* NOTE-NIKITA add stat counter for this. */
5563 + parent = znode_parent(child);
5564 + assert("nikita-2581", find_carry_node(doing, parent));
5565 + }
5566 + read_unlock_tree(tree);
5567 +
5568 + assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5569 +
5570 + /* Twig level horrors: the tree should be of height at least 2. So, the
5571 + last pointer from the root at the twig level is preserved even if the
5572 + child is empty. This is ugly, but that is how it was architected.
5573 + */
5574 +
5575 + if (znode_is_root(parent) &&
5576 + znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5577 + node_num_items(parent) == 1) {
5578 + /* Delimiting key manipulations. */
5579 + write_lock_dk(tree);
5580 + znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5581 + znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5582 + ZF_SET(child, JNODE_DKSET);
5583 + write_unlock_dk(tree);
5584 +
5585 + /* @child escaped imminent death! */
5586 + ZF_CLR(child, JNODE_HEARD_BANSHEE);
5587 + return 0;
5588 + }
5589 +
5590 + /* convert child pointer to the coord_t */
5591 + result = find_child_ptr(parent, child, &coord);
5592 + if (result != NS_FOUND) {
5593 + warning("nikita-994", "Cannot find child pointer: %i", result);
5594 + print_coord_content("coord", &coord);
5595 + return result;
5596 + }
5597 +
5598 + coord_dup(&coord2, &coord);
5599 + info.doing = doing;
5600 + info.todo = todo;
5601 + {
5602 + /*
5603 + * Actually kill internal item: prepare structure with
5604 + * arguments for ->cut_and_kill() method...
5605 + */
5606 +
5607 + struct carry_kill_data kdata;
5608 + kdata.params.from = &coord;
5609 + kdata.params.to = &coord2;
5610 + kdata.params.from_key = NULL;
5611 + kdata.params.to_key = NULL;
5612 + kdata.params.smallest_removed = NULL;
5613 + kdata.params.truncate = 1;
5614 + kdata.flags = op->u.delete.flags;
5615 + kdata.inode = NULL;
5616 + kdata.left = NULL;
5617 + kdata.right = NULL;
5618 + kdata.buf = NULL;
5619 + /* ... and call it. */
5620 + result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5621 + &info);
5622 + }
5623 + doing->restartable = 0;
5624 +
5625 + /* check whether root should be killed violently */
5626 + if (znode_is_root(parent) &&
5627 + /* don't kill roots at and lower than twig level */
5628 + znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5629 + node_num_items(parent) == 1)
5630 + result = reiser4_kill_tree_root(coord.node);
5631 +
5632 + return result < 0 ? : 0;
5633 +}
5634 +
5635 +/* implements COP_CUT operation
5636 +
5637 + Cuts part or whole content of node.
5638 +
5639 +*/
5640 +static int carry_cut(carry_op * op /* operation to be performed */ ,
5641 + carry_level * doing /* current carry level */ ,
5642 + carry_level * todo/* next carry level */)
5643 +{
5644 + int result;
5645 + carry_plugin_info info;
5646 + node_plugin *nplug;
5647 +
5648 + assert("nikita-896", op != NULL);
5649 + assert("nikita-897", todo != NULL);
5650 + assert("nikita-898", op->op == COP_CUT);
5651 +
5652 + info.doing = doing;
5653 + info.todo = todo;
5654 +
5655 + nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5656 + if (op->u.cut_or_kill.is_cut)
5657 + result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5658 + else
5659 + result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5660 +
5661 + doing->restartable = 0;
5662 + return result < 0 ? : 0;
5663 +}
5664 +
5665 +/* helper function for carry_paste(): returns true if @op can be continued as
5666 + paste */
5667 +static int
5668 +can_paste(coord_t *icoord, const reiser4_key * key,
5669 + const reiser4_item_data * data)
5670 +{
5671 + coord_t circa;
5672 + item_plugin *new_iplug;
5673 + item_plugin *old_iplug;
5674 + int result = 0; /* to keep gcc shut */
5675 +
5676 + assert("", icoord->between != AT_UNIT);
5677 +
5678 + /* obviously, one cannot paste when node is empty---there is nothing
5679 + to paste into. */
5680 + if (node_is_empty(icoord->node))
5681 + return 0;
5682 + /* if insertion point is at the middle of the item, then paste */
5683 + if (!coord_is_between_items(icoord))
5684 + return 1;
5685 + coord_dup(&circa, icoord);
5686 + circa.between = AT_UNIT;
5687 +
5688 + old_iplug = item_plugin_by_coord(&circa);
5689 + new_iplug = data->iplug;
5690 +
5691 + /* check whether we can paste to the item @icoord is "at" when we
5692 + ignore ->between field */
5693 + if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data))
5694 + result = 1;
5695 + else if (icoord->between == BEFORE_UNIT
5696 + || icoord->between == BEFORE_ITEM) {
5697 + /* otherwise, try to glue to the item at the left, if any */
5698 + coord_dup(&circa, icoord);
5699 + if (coord_set_to_left(&circa)) {
5700 + result = 0;
5701 + coord_init_before_item(icoord);
5702 + } else {
5703 + old_iplug = item_plugin_by_coord(&circa);
5704 + result = (old_iplug == new_iplug)
5705 + && item_can_contain_key(icoord, key, data);
5706 + if (result) {
5707 + coord_dup(icoord, &circa);
5708 + icoord->between = AFTER_UNIT;
5709 + }
5710 + }
5711 + } else if (icoord->between == AFTER_UNIT
5712 + || icoord->between == AFTER_ITEM) {
5713 + coord_dup(&circa, icoord);
5714 + /* otherwise, try to glue to the item at the right, if any */
5715 + if (coord_set_to_right(&circa)) {
5716 + result = 0;
5717 + coord_init_after_item(icoord);
5718 + } else {
5719 + int (*cck) (const coord_t *, const reiser4_key *,
5720 + const reiser4_item_data *);
5721 +
5722 + old_iplug = item_plugin_by_coord(&circa);
5723 +
5724 + cck = old_iplug->b.can_contain_key;
5725 + if (cck == NULL)
5726 + /* item doesn't define ->can_contain_key
5727 + method? So it is not expandable. */
5728 + result = 0;
5729 + else {
5730 + result = (old_iplug == new_iplug)
5731 + && cck(&circa /*icoord */ , key, data);
5732 + if (result) {
5733 + coord_dup(icoord, &circa);
5734 + icoord->between = BEFORE_UNIT;
5735 + }
5736 + }
5737 + }
5738 + } else
5739 + impossible("nikita-2513", "Nothing works");
5740 + if (result) {
5741 + if (icoord->between == BEFORE_ITEM) {
5742 + assert("vs-912", icoord->unit_pos == 0);
5743 + icoord->between = BEFORE_UNIT;
5744 + } else if (icoord->between == AFTER_ITEM) {
5745 + coord_init_after_item_end(icoord);
5746 + }
5747 + }
5748 + return result;
5749 +}
5750 +
5751 +/* implements COP_PASTE operation
5752 +
5753 + Paste data into an existing item. This is complicated by the fact that,
5754 + after we shifted something to the left or right neighbors trying to free
5755 + some space, the item we were supposed to paste into can be in a different
5756 + node than the insertion coord. If so, we are no longer doing a paste but
5757 + an insert. See comments in insert_paste_common().
5758 +
5759 +*/
5760 +static int carry_paste(carry_op * op /* operation to be performed */ ,
5761 + carry_level * doing UNUSED_ARG /* current carry
5762 + * level */ ,
5763 + carry_level * todo/* next carry level */)
5764 +{
5765 + znode *node;
5766 + carry_insert_data cdata;
5767 + coord_t dcoord;
5768 + reiser4_item_data data;
5769 + int result;
5770 + int real_size;
5771 + item_plugin *iplug;
5772 + carry_plugin_info info;
5773 + coord_t *coord;
5774 +
5775 + assert("nikita-982", op != NULL);
5776 + assert("nikita-983", todo != NULL);
5777 + assert("nikita-984", op->op == COP_PASTE);
5778 +
5779 + coord_init_zero(&dcoord);
5780 +
5781 + result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5782 + if (result != 0)
5783 + return result;
5784 +
5785 + coord = op->u.insert.d->coord;
5786 +
5787 + /* handle the case when op -> u.insert.coord doesn't point to an item
5788 + of the required type: restart as insert. */
5789 + if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5790 + op->op = COP_INSERT;
5791 + op->u.insert.type = COPT_PASTE_RESTARTED;
5792 + result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5793 +
5794 + return result;
5795 + }
5796 +
5797 + node = coord->node;
5798 + iplug = item_plugin_by_coord(coord);
5799 + assert("nikita-992", iplug != NULL);
5800 +
5801 + assert("nikita-985", node != NULL);
5802 + assert("nikita-986", node_plugin_by_node(node) != NULL);
5803 +
5804 + assert("nikita-987",
5805 + space_needed_for_op(node, op) <= znode_free_space(node));
5806 +
5807 + assert("nikita-1286", coord_is_existing_item(coord));
5808 +
5809 + /*
5810 + * if the item is expanded as a result of this operation, we should
5811 + * first change the item size, then call the ->b.paste item method. If
5812 + * the item is shrunk, it should be done the other way around: first
5813 + * call the ->b.paste method, then reduce the item size.
5814 + */
5815 +
5816 + real_size = space_needed_for_op(node, op);
5817 + if (real_size > 0)
5818 + node->nplug->change_item_size(coord, real_size);
5819 +
5820 + doing->restartable = 0;
5821 + info.doing = doing;
5822 + info.todo = todo;
5823 +
5824 + result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5825 +
5826 + if (real_size < 0)
5827 + node->nplug->change_item_size(coord, real_size);
5828 +
5829 + /* if we pasted at the beginning of the item, update item's key. */
5830 + if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5831 + node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5832 +
5833 + znode_make_dirty(node);
5834 + return result;
5835 +}
5836 +
5837 +/* handle carry COP_EXTENT operation. */
5838 +static int carry_extent(carry_op * op /* operation to perform */ ,
5839 + carry_level * doing /* queue of operations @op
5840 + * is part of */ ,
5841 + carry_level * todo /* queue where new operations
5842 + * are accumulated */ )
5843 +{
5844 + znode *node;
5845 + carry_insert_data cdata;
5846 + coord_t coord;
5847 + reiser4_item_data data;
5848 + carry_op *delete_dummy;
5849 + carry_op *insert_extent;
5850 + int result;
5851 + carry_plugin_info info;
5852 +
5853 + assert("nikita-1751", op != NULL);
5854 + assert("nikita-1752", todo != NULL);
5855 + assert("nikita-1753", op->op == COP_EXTENT);
5856 +
5857 + /* extent insertion overview:
5858 +
5859 + extents live on the TWIG LEVEL, which is the level one above the
5860 + leaf one. This complicates extent insertion logic somewhat: it may
5861 + happen (and is going to happen all the time) that in logical key
5862 + ordering an extent has to be placed between items I1 and I2, located
5863 + at the leaf level, but I1 and I2 are in the same formatted leaf
5864 + node N1. To insert an extent one has to
5865 +
5866 + (1) reach node N1 and shift data between N1, its neighbors and
5867 + possibly newly allocated nodes until I1 and I2 fall into different
5868 + nodes. Since I1 and I2 are still neighboring items in logical key
5869 + order, they will necessarily be the utmost items in their respective
5870 + nodes.
5871 +
5872 + (2) After this, the new extent item is inserted into a node on the
5873 + twig level.
5874 +
5875 + Fortunately this process can reuse almost all code from standard
5876 + insertion procedure (viz. make_space() and insert_paste_common()),
5877 + due to the following observation: make_space() only shifts data up
5878 + to and excluding or including insertion point. It never
5879 + "over-moves" through insertion point. Thus, one can use
5880 + make_space() to perform step (1). All required for this is just to
5881 + instruct free_space_shortage() to keep make_space() shifting data
5882 + until insertion point is at the node border.
5883 +
5884 + */
5885 +
5886 + /* perform common functionality of insert and paste. */
5887 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5888 + if (result != 0)
5889 + return result;
5890 +
5891 + node = op->u.extent.d->coord->node;
5892 + assert("nikita-1754", node != NULL);
5893 + assert("nikita-1755", node_plugin_by_node(node) != NULL);
5894 + assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5895 +
5896 + /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5897 + extent fits between items. */
5898 +
5899 + info.doing = doing;
5900 + info.todo = todo;
5901 +
5902 + /* there is another complication due to the placement of extents on
5903 + the twig level: extents are "rigid" in the sense that the key-range
5904 + occupied by an extent cannot grow indefinitely to the right as it
5905 + can for formatted leaf nodes. Because of this, when a search finds
5906 + two adjacent extents on the twig level, it has to "drill" to the
5907 + leaf level, creating a new node. Here we are removing this node.
5908 + */
5909 + if (node_is_empty(node)) {
5910 + delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
5911 + if (IS_ERR(delete_dummy))
5912 + return PTR_ERR(delete_dummy);
5913 + delete_dummy->u.delete.child = NULL;
5914 + delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
5915 + ZF_SET(node, JNODE_HEARD_BANSHEE);
5916 + }
5917 +
5918 + /* proceed with inserting extent item into parent. We are definitely
5919 + inserting rather than pasting if we get that far. */
5920 + insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
5921 + if (IS_ERR(insert_extent))
5922 + /* @delete_dummy will be automatically destroyed when the level
5923 + is exited */
5924 + return PTR_ERR(insert_extent);
5925 + /* NOTE-NIKITA insertion by key is simplest option here. Another
5926 + possibility is to insert on the left or right of already existing
5927 + item.
5928 + */
5929 + insert_extent->u.insert.type = COPT_KEY;
5930 + insert_extent->u.insert.d = op->u.extent.d;
5931 + assert("nikita-1719", op->u.extent.d->key != NULL);
5932 + insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
5933 + insert_extent->u.insert.flags =
5934 + znode_get_tree(node)->carry.new_extent_flags;
5935 +
5936 + /*
5937 + * if carry was asked to track lock handle we should actually track
5938 + * lock handle on the twig node rather than on the leaf where
5939 + * operation was started from. Transfer tracked lock handle.
5940 + */
5941 + if (doing->track_type) {
5942 + assert("nikita-3242", doing->tracked != NULL);
5943 + assert("nikita-3244", todo->tracked == NULL);
5944 + todo->tracked = doing->tracked;
5945 + todo->track_type = CARRY_TRACK_NODE;
5946 + doing->tracked = NULL;
5947 + doing->track_type = 0;
5948 + }
5949 +
5950 + return 0;
5951 +}
5952 +
5953 +/* update key in @parent between pointers to @left and @right.
5954 +
5955 + Find coords of @left and @right and update the delimiting key between them.
5956 + This is a helper function called by carry_update(). It finds the position
5957 + of the internal item involved, updates the item key, and updates the
5958 + delimiting keys of the child nodes involved.
5959 +*/
5960 +static int update_delimiting_key(znode * parent /* node key is updated
5961 + * in */ ,
5962 + znode * left /* child of @parent */ ,
5963 + znode * right /* child of @parent */ ,
5964 + carry_level * doing /* current carry
5965 + * level */ ,
5966 + carry_level * todo /* parent carry
5967 + * level */ ,
5968 + const char **error_msg /* place to
5969 + * store error
5970 + * message */ )
5971 +{
5972 + coord_t left_pos;
5973 + coord_t right_pos;
5974 + int result;
5975 + reiser4_key ldkey;
5976 + carry_plugin_info info;
5977 +
5978 + assert("nikita-1177", right != NULL);
5979 + /* find position of the right child in the parent */
5980 + result = find_child_ptr(parent, right, &right_pos);
5981 + if (result != NS_FOUND) {
5982 + *error_msg = "Cannot find position of right child";
5983 + return result;
5984 + }
5985 +
5986 + if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
5987 + /* find position of the left child in a parent */
5988 + result = find_child_ptr(parent, left, &left_pos);
5989 + if (result != NS_FOUND) {
5990 + *error_msg = "Cannot find position of left child";
5991 + return result;
5992 + }
5993 + assert("nikita-1355", left_pos.node != NULL);
5994 + } else
5995 + left_pos.node = NULL;
5996 +
5997 + /* check that they are separated by exactly one key and are basically
5998 + sane */
5999 + if (REISER4_DEBUG) {
6000 + if ((left_pos.node != NULL)
6001 + && !coord_is_existing_unit(&left_pos)) {
6002 + *error_msg = "Left child is bastard";
6003 + return RETERR(-EIO);
6004 + }
6005 + if (!coord_is_existing_unit(&right_pos)) {
6006 + *error_msg = "Right child is bastard";
6007 + return RETERR(-EIO);
6008 + }
6009 + if (left_pos.node != NULL &&
6010 + !coord_are_neighbors(&left_pos, &right_pos)) {
6011 + *error_msg = "Children are not direct siblings";
6012 + return RETERR(-EIO);
6013 + }
6014 + }
6015 + *error_msg = NULL;
6016 +
6017 + info.doing = doing;
6018 + info.todo = todo;
6019 +
6020 + /*
6021 + * If child node is not empty, new key of internal item is a key of
6022 + * leftmost item in the child node. If the child is empty, take its
6023 + * right delimiting key as a new key of the internal item. Precise key
6024 + * in the latter case is not important per se, because the child (and
6025 + * the internal item) are going to be killed shortly anyway, but we
6026 + * have to preserve correct order of keys in the parent node.
6027 + */
6028 +
6029 + if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
6030 + leftmost_key_in_node(right, &ldkey);
6031 + else {
6032 + read_lock_dk(znode_get_tree(parent));
6033 + ldkey = *znode_get_rd_key(right);
6034 + read_unlock_dk(znode_get_tree(parent));
6035 + }
6036 + node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
6037 + doing->restartable = 0;
6038 + znode_make_dirty(parent);
6039 + return 0;
6040 +}
6041 +
6042 +/* implements COP_UPDATE operation
6043 +
6044 + Update delimiting keys.
6045 +
6046 +*/
6047 +static int carry_update(carry_op * op /* operation to be performed */ ,
6048 + carry_level * doing /* current carry level */ ,
6049 + carry_level * todo/* next carry level */)
6050 +{
6051 + int result;
6052 + carry_node *missing UNUSED_ARG;
6053 + znode *left;
6054 + znode *right;
6055 + carry_node *lchild;
6056 + carry_node *rchild;
6057 + const char *error_msg;
6058 + reiser4_tree *tree;
6059 +
6060 + /*
6061 + * This operation is called to update the key of an internal item. This
6062 + * is necessary when carry shifted or cut data on the child
6063 + * level. Arguments of this operation are:
6064 + *
6065 + * @right --- child node. Operation should update key of internal
6066 + * item pointing to @right.
6067 + *
6068 + * @left --- left neighbor of @right. This parameter is optional.
6069 + */
6070 +
6071 + assert("nikita-902", op != NULL);
6072 + assert("nikita-903", todo != NULL);
6073 + assert("nikita-904", op->op == COP_UPDATE);
6074 +
6075 + lchild = op->u.update.left;
6076 + rchild = op->node;
6077 +
6078 + if (lchild != NULL) {
6079 + assert("nikita-1001", lchild->parent);
6080 + assert("nikita-1003", !lchild->left);
6081 + left = reiser4_carry_real(lchild);
6082 + } else
6083 + left = NULL;
6084 +
6085 + tree = znode_get_tree(rchild->node);
6086 + read_lock_tree(tree);
6087 + right = znode_parent(rchild->node);
6088 + read_unlock_tree(tree);
6089 +
6090 + if (right != NULL) {
6091 + result = update_delimiting_key(right,
6092 + lchild ? lchild->node : NULL,
6093 + rchild->node,
6094 + doing, todo, &error_msg);
6095 + } else {
6096 + error_msg = "Cannot find node to update key in";
6097 + result = RETERR(-EIO);
6098 + }
6099 + /* operation will be reposted to the next level by the
6100 + ->update_item_key() method of node plugin, if necessary. */
6101 +
6102 + if (result != 0) {
6103 + warning("nikita-999", "Error updating delimiting key: %s (%i)",
6104 + error_msg ? : "", result);
6105 + }
6106 + return result;
6107 +}
6108 +
6109 +/* move items into @node during carry */
6110 +static int carry_shift_data(sideof side /* in what direction to move data */ ,
6111 + coord_t *insert_coord /* coord where new item
6112 + * is to be inserted */,
6113 + znode * node /* node which data are moved into */ ,
6114 + carry_level * doing /* active carry queue */ ,
6115 + carry_level * todo /* carry queue where new
6116 + * operations are to be put
6117 + * in */ ,
6118 + unsigned int including_insert_coord_p
6119 + /* true if @insertion_coord can be moved */ )
6120 +{
6121 + int result;
6122 + znode *source;
6123 + carry_plugin_info info;
6124 + node_plugin *nplug;
6125 +
6126 + source = insert_coord->node;
6127 +
6128 + info.doing = doing;
6129 + info.todo = todo;
6130 +
6131 + nplug = node_plugin_by_node(node);
6132 + result = nplug->shift(insert_coord, node,
6133 + (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
6134 + (int)including_insert_coord_p, &info);
6135 + /* the only error the ->shift() method of a node plugin can return is
6136 + -ENOMEM, due to carry node/operation allocation. */
6137 + assert("nikita-915", result >= 0 || result == -ENOMEM);
6138 + if (result > 0) {
6139 + /*
6140 + * if some number of bytes was actually shifted, mark nodes
6141 + * dirty, and carry level as non-restartable.
6142 + */
6143 + doing->restartable = 0;
6144 + znode_make_dirty(source);
6145 + znode_make_dirty(node);
6146 + }
6147 +
6148 + assert("nikita-2077", coord_check(insert_coord));
6149 + return 0;
6150 +}
6151 +
6152 +typedef carry_node *(*carry_iterator) (carry_node * node);
6153 +static carry_node *find_dir_carry(carry_node * node, carry_level * level,
6154 + carry_iterator iterator);
6155 +
6156 +static carry_node *pool_level_list_prev(carry_node *node)
6157 +{
6158 + return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
6159 +}
6160 +
6161 +/* look for the left neighbor of given carry node in a carry queue.
6162 +
6163 + This is used by find_left_neighbor(), but I am not sure that this
6164 + really gives any advantage. More statistics required.
6165 +
6166 +*/
6167 +carry_node *find_left_carry(carry_node * node /* node to find left neighbor
6168 + * of */ ,
6169 + carry_level * level/* level to scan */)
6170 +{
6171 + return find_dir_carry(node, level,
6172 + (carry_iterator) pool_level_list_prev);
6173 +}
6174 +
6175 +static carry_node *pool_level_list_next(carry_node *node)
6176 +{
6177 + return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
6178 +}
6179 +
6180 +/* look for the right neighbor of given carry node in a
6181 + carry queue.
6182 +
6183 + This is used by find_right_neighbor(), but I am not sure that this
6184 + really gives any advantage. More statistics required.
6185 +
6186 +*/
6187 +carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6188 + * of */ ,
6189 + carry_level * level/* level to scan */)
6190 +{
6191 + return find_dir_carry(node, level,
6192 + (carry_iterator) pool_level_list_next);
6193 +}
6194 +
6195 +/* look for the left or right neighbor of given carry node in a carry
6196 + queue.
6197 +
6198 + Helper function used by find_{left|right}_carry().
6199 +*/
6200 +static carry_node *find_dir_carry(carry_node * node /* node to start
6201 + * scanning from */ ,
6202 + carry_level * level /* level to scan */ ,
6203 + carry_iterator iterator /* operation to
6204 + * move to the
6205 + * next node */)
6206 +{
6207 + carry_node *neighbor;
6208 +
6209 + assert("nikita-1059", node != NULL);
6210 + assert("nikita-1060", level != NULL);
6211 +
6212 + /* scan the level's list of carry nodes dir-ward, skipping all
6213 + carry nodes referencing the same znode. */
6214 + neighbor = node;
6215 + while (1) {
6216 + neighbor = iterator(neighbor);
6217 + if (carry_node_end(level, neighbor))
6218 + /* list head is reached */
6219 + return NULL;
6220 + if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6221 + return neighbor;
6222 + }
6223 +}
6224 +
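pool_level_list_prev()/pool_level_list_next() above are built on the kernel's
list_entry() (container_of) idiom: carry nodes are linked through a list head
embedded in their header, and the iterator recovers the enclosing structure
from a pointer to that link. A minimal user-space reconstruction of the idiom
(toy structure names; the real layout lives in the pool/carry headers):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* recover the enclosing structure from a pointer to its embedded member */
#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_carry_node {
	int id;
	struct list_head level_linkage;	/* embedded link, as in carry_node */
};

int main(void)
{
	struct toy_carry_node a = { .id = 1 }, b = { .id = 2 };

	/* two-element circular list: a <-> b */
	a.level_linkage.next = &b.level_linkage;
	a.level_linkage.prev = &b.level_linkage;
	b.level_linkage.next = &a.level_linkage;
	b.level_linkage.prev = &a.level_linkage;

	/* step "dir-ward", as pool_level_list_next() does */
	struct toy_carry_node *n = list_entry(a.level_linkage.next,
					      struct toy_carry_node,
					      level_linkage);
	printf("right neighbor of %d is %d\n", a.id, n->id);
	return 0;
}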
6225 +/*
6226 + * Memory reservation estimation.
6227 + *
6228 + * Carry process proceeds through tree levels upwards. Carry assumes that it
6229 + * takes tree in consistent state (e.g., that search tree invariants hold),
6230 + * and leaves tree consistent after it finishes. This means that when some
6231 + * error occurs carry cannot simply return if there are pending carry
6232 + * operations. Generic solution for this problem is carry-undo either as
6233 + * transaction manager feature (requiring checkpoints and isolation), or
6234 + * through some carry specific mechanism.
6235 + *
6236 + * Our current approach is to panic if carry hits an error while the tree is
6237 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6238 + * this, a "memory reservation" mechanism was added.
6239 + *
6240 + * Memory reservation is implemented by perthread-pages.diff patch from
6241 + * core-patches. Its API is defined in <linux/gfp.h>
6242 + *
6243 + * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6244 + * void perthread_pages_release(int nrpages);
6245 + * int perthread_pages_count(void);
6246 + *
6247 + * carry estimates its worst case memory requirements at entry, reserves
6248 + * enough memory, and releases unused pages before returning.
6249 + *
6250 + * Code below estimates worst case memory requirements for a given carry
6251 + * queue. This is done by summing worst case memory requirements for each
6252 + * operation in the queue.
6253 + *
6254 + */
6255 +
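The reserve/use/release discipline the comment describes would look roughly
like the sketch below. It is only an outline: perthread_pages_* comes from the
out-of-tree perthread-pages patch mentioned above (not mainline), and
carry_estimate()/do_carry() are hypothetical stand-ins for the estimation and
balancing code.

/* sketch of the reservation discipline, under the assumptions above */
static int carry_with_reservation(carry_level *doing, carry_level *todo)
{
	int nrpages = carry_estimate(doing);	/* worst-case page count */
	int result;

	result = perthread_pages_reserve(nrpages, GFP_KERNEL);
	if (result != 0)
		return result;		/* fail before touching the tree */

	result = do_carry(doing, todo);	/* balancing proper */

	/* give back whatever the worst-case estimate over-reserved */
	perthread_pages_release(perthread_pages_count());
	return result;
}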
6256 +/*
6257 + * Memory requirements of many operations depend on the tree height. For
6258 + * example, item insertion requires a new node to be inserted at each tree
6259 + * level in the worst case. What tree height should be used for estimation?
6260 + * The current tree height is wrong, because the tree height can change
6261 + * between the time the estimation is done and the time the operation is
6262 + * actually performed. The maximal possible tree height
6263 + * (REISER4_MAX_ZTREE_HEIGHT) is also not desirable, because it would lead
6264 + * to huge over-estimation all the time. A plausible solution is "capped
6265 + * tree height": if the current tree height is less than some
6266 + * TREE_HEIGHT_CAP constant, the capped tree height is TREE_HEIGHT_CAP;
6267 + * otherwise it is the current tree height. The idea is that a tree of
6268 + * height TREE_HEIGHT_CAP or larger is unlikely to grow further soon.
6269 + */
6270 +#define TREE_HEIGHT_CAP (5)
6271 +
6272 +/* return capped tree height for the @tree. See comment above. */
6273 +static int cap_tree_height(reiser4_tree * tree)
6274 +{
6275 + return max_t(int, tree->height, TREE_HEIGHT_CAP);
6276 +}
6277 +
6278 +/* return capped tree height for the current tree. */
6279 +static int capped_height(void)
6280 +{
6281 + return cap_tree_height(current_tree);
6282 +}
6283 +
6284 +/* return number of pages required to store given number of bytes */
6285 +static int bytes_to_pages(int bytes)
6286 +{
6287 + return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6288 +}
6289 +
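bytes_to_pages() is the usual round-up division, ceil(bytes / PAGE_SIZE),
written with a shift. A stand-alone check, assuming a 4096-byte page purely
for illustration (PAGE_CACHE_SHIFT is architecture-dependent):

#include <stdio.h>

#define TOY_PAGE_SHIFT 12		/* assume 4096-byte pages */
#define TOY_PAGE_SIZE  (1 << TOY_PAGE_SHIFT)

static int bytes_to_pages(int bytes)
{
	return (bytes + TOY_PAGE_SIZE - 1) >> TOY_PAGE_SHIFT;
}

int main(void)
{
	printf("%d %d %d\n",
	       bytes_to_pages(1),	/* 1 */
	       bytes_to_pages(4096),	/* 1 */
	       bytes_to_pages(4097));	/* 2 */
	return 0;
}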
6290 +/* how many pages are required to allocate znodes during item insertion. */
6291 +static int carry_estimate_znodes(void)
6292 +{
6293 + /*
6294 + * Note that we have a problem here: there is no way to
6295 + * reserve pages specifically for a given slab. This means that
6296 + * these pages can be hijacked for some other end.
6297 + */
6298 +
6299 + /* in the worst case we need 3 new znodes on each tree level */
6300 + return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6301 +}
6302 +
6303 +/*
6304 + * how many pages are required to load bitmaps. One bitmap per level.
6305 + */
6306 +static int carry_estimate_bitmaps(void)
6307 +{
6308 + if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6309 + int bytes;
6310 +
6311 + bytes = capped_height() * (0 + /* bnode should be added, but
6312 + * it is private to bitmap.c,
6313 + * skip for now. */
6314 + 2 * sizeof(jnode));
6315 + /* working and commit jnodes */
6316 + return bytes_to_pages(bytes) + 2; /* and their contents */
6317 + } else
6318 + /* bitmaps were pre-loaded during mount */
6319 + return 0;
6320 +}
6321 +
6322 +/* worst case item insertion memory requirements */
6323 +static int carry_estimate_insert(carry_op * op, carry_level * level)
6324 +{
6325 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
6326 + /* new atom */
6327 + capped_height() + /* new block on each level */
6328 + 1 + /* and possibly extra new block at the leaf level */
6329 + 3; /* loading of leaves into memory */
6330 +}
6331 +
6332 +/* worst case item deletion memory requirements */
6333 +static int carry_estimate_delete(carry_op * op, carry_level * level)
6334 +{
6335 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
6336 + /* new atom */
6337 + 3; /* loading of leaves into memory */
6338 +}
6339 +
6340 +/* worst case tree cut memory requirements */
6341 +static int carry_estimate_cut(carry_op * op, carry_level * level)
6342 +{
6343 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
6344 + /* new atom */
6345 + 3; /* loading of leaves into memory */
6346 +}
6347 +
6348 +/* worst case memory requirements of pasting into item */
6349 +static int carry_estimate_paste(carry_op * op, carry_level * level)
6350 +{
6351 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
6352 + /* new atom */
6353 + capped_height() + /* new block on each level */
6354 + 1 + /* and possibly extra new block at the leaf level */
6355 + 3; /* loading of leaves into memory */
6356 +}
6357 +
6358 +/* worst case memory requirements of extent insertion */
6359 +static int carry_estimate_extent(carry_op * op, carry_level * level)
6360 +{
6361 + return carry_estimate_insert(op, level) + /* insert extent */
6362 + carry_estimate_delete(op, level); /* kill leaf */
6363 +}
6364 +
6365 +/* worst case memory requirements of key update */
6366 +static int carry_estimate_update(carry_op * op, carry_level * level)
6367 +{
6368 + return 0;
6369 +}
6370 +
6371 +/* worst case memory requirements of flow insertion */
6372 +static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6373 +{
6374 + int newnodes;
6375 +
6376 + newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6377 + CARRY_FLOW_NEW_NODES_LIMIT);
6378 + /*
6379 + * roughly estimate insert_flow as a sequence of insertions.
6380 + */
6381 + return newnodes * carry_estimate_insert(op, level);
6382 +}
6383 +
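A worked instance of the estimate above, assuming 4096-byte pages and, purely
for illustration, CARRY_FLOW_NEW_NODES_LIMIT = 20 (the constant is defined
elsewhere in this patch):

	bytes_to_pages(100000) = ceil(100000 / 4096) = 25
	newnodes               = min(25, 20)         = 20
	estimate               = 20 * carry_estimate_insert(op, level)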
6384 +/* This is dispatch table for carry operations. It can be trivially
6385 + abstracted into useful plugin: tunable balancing policy is a good
6386 + thing. */
6387 +carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6388 + [COP_INSERT] = {
6389 + .handler = carry_insert,
6390 + .estimate = carry_estimate_insert}
6391 + ,
6392 + [COP_DELETE] = {
6393 + .handler = carry_delete,
6394 + .estimate = carry_estimate_delete}
6395 + ,
6396 + [COP_CUT] = {
6397 + .handler = carry_cut,
6398 + .estimate = carry_estimate_cut}
6399 + ,
6400 + [COP_PASTE] = {
6401 + .handler = carry_paste,
6402 + .estimate = carry_estimate_paste}
6403 + ,
6404 + [COP_EXTENT] = {
6405 + .handler = carry_extent,
6406 + .estimate = carry_estimate_extent}
6407 + ,
6408 + [COP_UPDATE] = {
6409 + .handler = carry_update,
6410 + .estimate = carry_estimate_update}
6411 + ,
6412 + [COP_INSERT_FLOW] = {
6413 + .handler = carry_insert_flow,
6414 + .estimate = carry_estimate_insert_flow}
6415 +};
6416 +
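The table above is the classic designated-initializer dispatch pattern; for
example, carry_paste() re-enters through it when a paste morphs into an insert
(op_dispatch_table[COP_INSERT].handler(...)). A stand-alone miniature of the
same pattern, with toy names:

#include <stdio.h>

enum toy_op { TOY_INSERT, TOY_DELETE, TOY_LAST_OP };

typedef struct {
	int (*handler)(int arg);
	int (*estimate)(int arg);
} toy_op_handler;

static int do_insert(int arg)  { return printf("insert %d\n", arg); }
static int est_insert(int arg) { return arg + 1; }
static int do_delete(int arg)  { return printf("delete %d\n", arg); }
static int est_delete(int arg) { return 1; }

/* designated initializers keep the table in sync with the op enum */
static toy_op_handler toy_dispatch[TOY_LAST_OP] = {
	[TOY_INSERT] = { .handler = do_insert, .estimate = est_insert },
	[TOY_DELETE] = { .handler = do_delete, .estimate = est_delete },
};

int main(void)
{
	enum toy_op op = TOY_INSERT;

	/* same call shape as op_dispatch_table[op->op].handler(...) */
	printf("estimate: %d pages\n", toy_dispatch[op].estimate(3));
	return toy_dispatch[op].handler(3) < 0;
}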
6417 +/* Make Linus happy.
6418 + Local variables:
6419 + c-indentation-style: "K&R"
6420 + mode-name: "LC"
6421 + c-basic-offset: 8
6422 + tab-width: 8
6423 + fill-column: 120
6424 + scroll-step: 1
6425 + End:
6426 +*/
6427 diff -urN linux-2.6.33.orig/fs/reiser4/carry_ops.h linux-2.6.33/fs/reiser4/carry_ops.h
6428 --- linux-2.6.33.orig/fs/reiser4/carry_ops.h 1970-01-01 01:00:00.000000000 +0100
6429 +++ linux-2.6.33/fs/reiser4/carry_ops.h 2010-03-04 19:33:22.000000000 +0100
6430 @@ -0,0 +1,43 @@
6431 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
6432 + reiser4/README */
6433 +
6434 +/* implementation of carry operations. See carry_ops.c for details. */
6435 +
6436 +#if !defined(__CARRY_OPS_H__)
6437 +#define __CARRY_OPS_H__
6438 +
6439 +#include "forward.h"
6440 +#include "znode.h"
6441 +#include "carry.h"
6442 +
6443 +/* carry operation handlers */
6444 +typedef struct carry_op_handler {
6445 + /* perform operation */
6446 + int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6447 + /* estimate memory requirements for @op */
6448 + int (*estimate) (carry_op * op, carry_level * level);
6449 +} carry_op_handler;
6450 +
6451 +/* This is the dispatch table for carry operations. It could be trivially
6452 +   abstracted into a useful plugin: a tunable balancing policy is a good
6453 +   thing. */
6454 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6455 +
6456 +unsigned int space_needed(const znode * node, const coord_t *coord,
6457 + const reiser4_item_data * data, int inserting);
6458 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6459 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6460 +
6461 +/* __CARRY_OPS_H__ */
6462 +#endif
6463 +
6464 +/* Make Linus happy.
6465 + Local variables:
6466 + c-indentation-style: "K&R"
6467 + mode-name: "LC"
6468 + c-basic-offset: 8
6469 + tab-width: 8
6470 + fill-column: 120
6471 + scroll-step: 1
6472 + End:
6473 +*/
6474 diff -urN linux-2.6.33.orig/fs/reiser4/context.c linux-2.6.33/fs/reiser4/context.c
6475 --- linux-2.6.33.orig/fs/reiser4/context.c 1970-01-01 01:00:00.000000000 +0100
6476 +++ linux-2.6.33/fs/reiser4/context.c 2010-03-04 19:33:22.000000000 +0100
6477 @@ -0,0 +1,289 @@
6478 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6479 +
6480 +/* Manipulation of reiser4_context */
6481 +
6482 +/*
6483 + * global context used during a system call. A variable of this type is
6484 + * allocated on the stack at the beginning of the reiser4 part of the system
6485 + * call, and a pointer to it is stored in current->fs_context. This allows us
6486 + * to avoid passing pointers to the current transaction and current lock stack
6487 + * (both in one-to-one mapping with threads) all over the call chain.
6488 + *
6489 + * It's kind of like those global variables the prof used to tell you not to
6490 + * use in CS1, except thread-specific. ;-) Nikita, this was a good idea.
6491 + *
6492 + * In some situations it is desirable to have the ability to enter
6493 + * reiser4_context more than once for the same thread (nested contexts). For
6494 + * example, there are some functions that can be called either directly from
6495 + * VFS/VM or from an already active reiser4 context (->writepage, for example).
6496 + *
6497 + * In such situations the "child" context acts like a dummy: all activity is
6498 + * actually performed in the top level context, and get_current_context()
6499 + * always returns the top level context.
6500 + * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6501 + * nested anyway.
6502 + *
6503 + * Note that there is an important difference between the way reiser4 uses
6504 + * ->fs_context and the way other file systems use it. Other file systems
6505 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6506 + * (this is why ->fs_context was initially called ->journal_info). This means,
6507 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6508 + * to the file system, they assume that some transaction is already underway,
6509 + * and usually bail out, because starting nested transaction would most likely
6510 + * lead to the deadlock. This gives false positives with reiser4, because we
6511 + * set ->fs_context before starting transaction.
6512 + */
6513 +
6514 +#include "debug.h"
6515 +#include "super.h"
6516 +#include "context.h"
6517 +#include "vfs_ops.h" /* for reiser4_throttle_write() */
6518 +
6519 +#include <linux/writeback.h> /* for current_is_pdflush() */
6520 +#include <linux/hardirq.h>
6521 +
6522 +static void _reiser4_init_context(reiser4_context * context,
6523 + struct super_block *super)
6524 +{
6525 + memset(context, 0, sizeof(*context));
6526 +
6527 + context->super = super;
6528 + context->magic = context_magic;
6529 + context->outer = current->journal_info;
6530 + current->journal_info = (void *)context;
6531 + context->nr_children = 0;
6532 + context->gfp_mask = GFP_KERNEL;
6533 +
6534 + init_lock_stack(&context->stack);
6535 +
6536 + reiser4_txn_begin(context);
6537 +
6538 + /* initialize head of tap list */
6539 + INIT_LIST_HEAD(&context->taps);
6540 +#if REISER4_DEBUG
6541 + context->task = current;
6542 +#endif
6543 + grab_space_enable();
6544 +}
6545 +
6546 +/* initialize context and bind it to the current thread
6547 +
6548 +   This function should be called at the beginning of the reiser4 part of
6549 +   a syscall.
6550 +*/
6551 +reiser4_context * reiser4_init_context(struct super_block *super)
6552 +{
6553 + reiser4_context *context;
6554 +
6555 + assert("nikita-2662", !in_interrupt() && !in_irq());
6556 + assert("nikita-3357", super != NULL);
6557 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6558 +
6559 + context = get_current_context_check();
6560 + if (context && context->super == super) {
6561 + context = (reiser4_context *) current->journal_info;
6562 + context->nr_children++;
6563 + return context;
6564 + }
6565 +
6566 + context = kmalloc(sizeof(*context), GFP_KERNEL);
6567 + if (context == NULL)
6568 + return ERR_PTR(RETERR(-ENOMEM));
6569 +
6570 + _reiser4_init_context(context, super);
6571 + return context;
6572 +}
6573 +
6574 +/* this is used in scan_mgr, which is called with a spinlock held, and in
6575 +   reiser4_fill_super magic */
6576 +void init_stack_context(reiser4_context *context, struct super_block *super)
6577 +{
6578 + assert("nikita-2662", !in_interrupt() && !in_irq());
6579 + assert("nikita-3357", super != NULL);
6580 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6581 + assert("vs-12", !is_in_reiser4_context());
6582 +
6583 + _reiser4_init_context(context, super);
6584 + context->on_stack = 1;
6585 + return;
6586 +}
6587 +
6588 +/* cast lock stack embedded into reiser4 context up to its container */
6589 +reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6590 +{
6591 + return container_of(owner, reiser4_context, stack);
6592 +}
6593 +
6594 +/* true if there is already _any_ reiser4 context for the current thread */
6595 +int is_in_reiser4_context(void)
6596 +{
6597 + reiser4_context *ctx;
6598 +
6599 + ctx = current->journal_info;
6600 + return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6601 +}
6602 +
6603 +/*
6604 + * call balance dirty pages for the current context.
6605 + *
6606 + * The file system is expected to call balance_dirty_pages_ratelimited()
6607 + * whenever it dirties a page. reiser4 does this for unformatted nodes (that
6608 + * is, during write---this covers the vast majority of all dirty traffic), but
6609 + * we cannot do this immediately when a formatted node is dirtied, because a
6610 + * long-term lock is usually held at that time. To work around this, dirtying
6611 + * a formatted node simply increases the ->nr_marked_dirty counter in the
6612 + * current reiser4 context. When we are about to leave this context,
6613 + * balance_dirty_pages_ratelimited() is called, if necessary.
6614 + *
6615 + * This introduces another problem: sometimes we do not want to run
6616 + * balance_dirty_pages_ratelimited() when leaving a context, for example
6617 + * because some important lock (like ->i_mutex on the parent directory) is
6618 + * held. To achieve this, the ->nobalance flag can be set in the current context.
6619 + */
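+
+/*
+ * A sketch of the ->nobalance escape hatch described above: a caller that
+ * must close its handle under a directory's ->i_mutex first marks the
+ * context via context_set_commit_async() (defined in context.h), e.g.:
+ *
+ *	context_set_commit_async(ctx);	(sets ->nobalance, TXNH_DONT_COMMIT)
+ *	reiser4_exit_context(ctx);	(skips throttling; ktxnmgrd commits)
+ */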
6620 +static void reiser4_throttle_write_at(reiser4_context *context)
6621 +{
6622 + reiser4_super_info_data *sbinfo = get_super_private(context->super);
6623 +
6624 + /*
6625 + * call balance_dirty_pages_ratelimited() to process formatted nodes
6626 +	 * dirtied during this system call. Do that only if we are not in mount,
6627 +	 * there were nodes dirtied in this context, and we are not in
6628 +	 * writepage (to avoid deadlock) or in pdflush
6629 + */
6630 + if (sbinfo != NULL && sbinfo->fake != NULL &&
6631 + context->nr_marked_dirty != 0 &&
6632 + !(current->flags & PF_MEMALLOC) &&
6633 + !current_is_flush_bd_task())
6634 + /* FIXME-EDWARD: throttle with nr_marked_dirty? */
6635 + reiser4_throttle_write(sbinfo->fake, 1);
6636 +}
6637 +
6638 +/* release resources associated with context.
6639 +
6640 +   This function should be called at the end of a "session" with reiser4,
6641 +   typically just before leaving the reiser4 driver back to the VFS.
6642 +
6643 +   This is a good place to put some debugging consistency checks, like that
6644 +   the thread released all locks and closed its transcrash, etc.
6645 +
6646 +*/
6647 +static void reiser4_done_context(reiser4_context * context)
6648 + /* context being released */
6649 +{
6650 + assert("nikita-860", context != NULL);
6651 + assert("nikita-859", context->magic == context_magic);
6652 + assert("vs-646", (reiser4_context *) current->journal_info == context);
6653 + assert("zam-686", !in_interrupt() && !in_irq());
6654 +
6655 + /* only do anything when leaving top-level reiser4 context. All nested
6656 + * contexts are just dummies. */
6657 + if (context->nr_children == 0) {
6658 + assert("jmacd-673", context->trans == NULL);
6659 + assert("jmacd-1002", lock_stack_isclean(&context->stack));
6660 + assert("nikita-1936", reiser4_no_counters_are_held());
6661 + assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6662 + assert("zam-1004", ergo(get_super_private(context->super),
6663 + get_super_private(context->super)->delete_mutex_owner !=
6664 + current));
6665 +
6666 + /* release all grabbed but as yet unused blocks */
6667 + if (context->grabbed_blocks != 0)
6668 + all_grabbed2free();
6669 +
6670 + /*
6671 + * synchronize against longterm_unlock_znode():
6672 + * wake_up_requestor() wakes up requestors without holding
6673 + * zlock (otherwise they will immediately bump into that lock
6674 +		 * after wake up on another CPU). To work around a (rare)
6675 +		 * situation where a requestor has been woken up asynchronously
6676 +		 * and managed to run to completion (and destroy its
6677 +		 * context and lock stack) before wake_up_requestor() called
6678 +		 * wake_up() on it, wake_up_requestor() synchronizes on the
6679 +		 * lock stack spin lock. It has actually been observed that the
6680 +		 * spin lock _was_ locked at this point, because
6681 +		 * wake_up_requestor() took an interrupt.
6682 + */
6683 + spin_lock_stack(&context->stack);
6684 + spin_unlock_stack(&context->stack);
6685 +
6686 + assert("zam-684", context->nr_children == 0);
6687 + /* restore original ->fs_context value */
6688 + current->journal_info = context->outer;
6689 + if (context->on_stack == 0)
6690 + kfree(context);
6691 + } else {
6692 + context->nr_children--;
6693 +#if REISER4_DEBUG
6694 + assert("zam-685", context->nr_children >= 0);
6695 +#endif
6696 + }
6697 +}
6698 +
6699 +/*
6700 + * exit reiser4 context. Call reiser4_throttle_write_at() if necessary. Close
6701 + * the transaction. Call reiser4_done_context() for context book-keeping.
6702 + */
6703 +void reiser4_exit_context(reiser4_context * context)
6704 +{
6705 + assert("nikita-3021", reiser4_schedulable());
6706 +
6707 + if (context->nr_children == 0) {
6708 + if (!context->nobalance)
6709 + reiser4_throttle_write_at(context);
6710 +
6711 +		/* if the filesystem is mounted with -o sync or -o dirsync,
6712 +		   commit the transaction. FIXME: TXNH_DONT_COMMIT is used to
6713 +		   avoid committing on exit_context when an inode semaphore is
6714 +		   held, and to have ktxnmgrd do the commit instead for better
6715 +		   concurrent filesystem access. But whoever mounts with -o
6716 +		   sync cares more about reliability than about performance.
6717 +		   So, for now, we have this simple mount -o sync
6718 +		   support. */
6719 + if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6720 + txn_atom *atom;
6721 +
6722 + atom = get_current_atom_locked_nocheck();
6723 + if (atom) {
6724 + atom->flags |= ATOM_FORCE_COMMIT;
6725 + context->trans->flags &= ~TXNH_DONT_COMMIT;
6726 + spin_unlock_atom(atom);
6727 + }
6728 + }
6729 + reiser4_txn_end(context);
6730 + }
6731 + reiser4_done_context(context);
6732 +}
6733 +
6734 +void reiser4_ctx_gfp_mask_set(void)
6735 +{
6736 + reiser4_context *ctx;
6737 +
6738 + ctx = get_current_context();
6739 + if (ctx->entd == 0 &&
6740 + list_empty(&ctx->stack.locks) &&
6741 + ctx->trans->atom == NULL)
6742 + ctx->gfp_mask = GFP_KERNEL;
6743 + else
6744 + ctx->gfp_mask = GFP_NOFS;
6745 +}
6746 +
6747 +void reiser4_ctx_gfp_mask_force(gfp_t mask)
6748 +{
6749 + reiser4_context *ctx;
6750 + ctx = get_current_context();
6751 +
6752 + assert("edward-1454", ctx != NULL);
6753 +
6754 + ctx->gfp_mask = mask;
6755 +}
6756 +
6757 +/*
6758 + * Local variables:
6759 + * c-indentation-style: "K&R"
6760 + * mode-name: "LC"
6761 + * c-basic-offset: 8
6762 + * tab-width: 8
6763 + * fill-column: 120
6764 + * scroll-step: 1
6765 + * End:
6766 + */
6767 diff -urN linux-2.6.33.orig/fs/reiser4/context.h linux-2.6.33/fs/reiser4/context.h
6768 --- linux-2.6.33.orig/fs/reiser4/context.h 1970-01-01 01:00:00.000000000 +0100
6769 +++ linux-2.6.33/fs/reiser4/context.h 2010-03-04 19:33:22.000000000 +0100
6770 @@ -0,0 +1,228 @@
6771 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6772 + * reiser4/README */
6773 +
6774 +/* Reiser4 context. See context.c for details. */
6775 +
6776 +#if !defined( __REISER4_CONTEXT_H__ )
6777 +#define __REISER4_CONTEXT_H__
6778 +
6779 +#include "forward.h"
6780 +#include "debug.h"
6781 +#include "dformat.h"
6782 +#include "tap.h"
6783 +#include "lock.h"
6784 +
6785 +#include <linux/types.h> /* for __u?? */
6786 +#include <linux/fs.h> /* for struct super_block */
6787 +#include <linux/spinlock.h>
6788 +#include <linux/sched.h> /* for struct task_struct */
6789 +
6790 +/* reiser4 per-thread context */
6791 +struct reiser4_context {
6792 + /* magic constant. For identification of reiser4 contexts. */
6793 + __u32 magic;
6794 +
6795 +	/* current lock stack. See lock.[ch]. This is where the list of all
6796 +	   locks taken by the current thread is kept. This is also used in
6797 +	   deadlock detection. */
6798 + lock_stack stack;
6799 +
6800 + /* current transcrash. */
6801 + txn_handle *trans;
6802 + /* transaction handle embedded into reiser4_context. ->trans points
6803 + * here by default. */
6804 + txn_handle trans_in_ctx;
6805 +
6806 + /* super block we are working with. To get the current tree
6807 + use &get_super_private (reiser4_get_current_sb ())->tree. */
6808 + struct super_block *super;
6809 +
6810 + /* parent fs activation */
6811 + struct fs_activation *outer;
6812 +
6813 + /* per-thread grabbed (for further allocation) blocks counter */
6814 + reiser4_block_nr grabbed_blocks;
6815 +
6816 + /* list of taps currently monitored. See tap.c */
6817 + struct list_head taps;
6818 +
6819 + /* grabbing space is enabled */
6820 + unsigned int grab_enabled:1;
6821 +	/* should be set while we are writing dirty nodes to disk in jnode_flush
6822 +	 * or reiser4_write_logs() */
6823 + unsigned int writeout_mode:1;
6824 + /* true, if current thread is an ent thread */
6825 + unsigned int entd:1;
6826 + /* true, if balance_dirty_pages() should not be run when leaving this
6827 +	 * context. This is used to avoid a lengthy balance_dirty_pages()
6828 +	 * operation when holding some important resource, like a directory's
6829 +	 * ->i_mutex */
6830 + unsigned int nobalance:1;
6831 +
6832 + /* this bit is used on reiser4_done_context to decide whether context is
6833 + kmalloc-ed and has to be kfree-ed */
6834 + unsigned int on_stack:1;
6835 +
6836 + /* count non-trivial jnode_set_dirty() calls */
6837 + unsigned long nr_marked_dirty;
6838 +
6839 + /* reiser4_writeback_inodes calls (via generic_writeback_sb_inodes)
6840 +	 * reiser4_writepages for each dirty inode. reiser4_writepages
6841 +	 * captures pages. When the number of pages captured in one
6842 +	 * reiser4_sync_inodes call reaches some threshold, some atoms get
6843 +	 * flushed */
6844 + int nr_captured;
6845 + int nr_children; /* number of child contexts */
6846 +#if REISER4_DEBUG
6847 + /* debugging information about reiser4 locks held by the current
6848 + * thread */
6849 + reiser4_lock_cnt_info locks;
6850 + struct task_struct *task; /* so we can easily find owner of the stack */
6851 +
6852 + /*
6853 + * disk space grabbing debugging support
6854 + */
6855 + /* how many disk blocks were grabbed by the first call to
6856 + * reiser4_grab_space() in this context */
6857 + reiser4_block_nr grabbed_initially;
6858 +
6859 + /* list of all threads doing flush currently */
6860 + struct list_head flushers_link;
6861 + /* information about last error encountered by reiser4 */
6862 + err_site err;
6863 +#endif
6864 + void *vp;
6865 + gfp_t gfp_mask;
6866 +};
6867 +
6868 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6869 +
6870 +/* Debugging helpers. */
6871 +#if REISER4_DEBUG
6872 +extern void print_contexts(void);
6873 +#endif
6874 +
6875 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6876 +#define current_blocksize reiser4_get_current_sb()->s_blocksize
6877 +#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6878 +
6879 +extern reiser4_context *reiser4_init_context(struct super_block *);
6880 +extern void init_stack_context(reiser4_context *, struct super_block *);
6881 +extern void reiser4_exit_context(reiser4_context *);
6882 +
6883 +/* magic constant we store in a reiser4_context allocated on the stack. Used
6884 +   to catch accesses to stale or uninitialized contexts. */
6885 +#define context_magic ((__u32) 0x4b1b5d0b)
6886 +
6887 +extern int is_in_reiser4_context(void);
6888 +
6889 +/*
6890 + * return reiser4_context for the thread @tsk
6891 + */
6892 +static inline reiser4_context *get_context(const struct task_struct *tsk)
6893 +{
6894 + assert("vs-1682",
6895 + ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6896 + return (reiser4_context *) tsk->journal_info;
6897 +}
6898 +
6899 +/*
6900 + * return reiser4 context of the current thread, or NULL if there is none.
6901 + */
6902 +static inline reiser4_context *get_current_context_check(void)
6903 +{
6904 + if (is_in_reiser4_context())
6905 + return get_context(current);
6906 + else
6907 + return NULL;
6908 +}
6909 +
6910 +static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
6911 +
6912 +/* return context associated with current thread */
6913 +static inline reiser4_context *get_current_context(void)
6914 +{
6915 + return get_context(current);
6916 +}
6917 +
6918 +static inline gfp_t reiser4_ctx_gfp_mask_get(void)
6919 +{
6920 + reiser4_context *ctx;
6921 +
6922 + ctx = get_current_context_check();
6923 + return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
6924 +}
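+
+/*
+ * Allocations on reiser4 code paths are expected to honour the per-context
+ * mask instead of hard-coding GFP_KERNEL, e.g. (illustrative):
+ *
+ *	data = kmalloc(size, reiser4_ctx_gfp_mask_get());
+ *
+ * so that a thread that already holds an atom or long-term locks
+ * automatically allocates with GFP_NOFS and cannot re-enter the filesystem
+ * under memory pressure.
+ */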
6925 +
6926 +void reiser4_ctx_gfp_mask_set(void);
6927 +void reiser4_ctx_gfp_mask_force (gfp_t mask);
6928 +
6929 +/*
6930 + * true if the current thread is in write-out mode. A thread enters write-out
6931 + * mode during jnode_flush and reiser4_write_logs().
6932 + */
6933 +static inline int is_writeout_mode(void)
6934 +{
6935 + return get_current_context()->writeout_mode;
6936 +}
6937 +
6938 +/*
6939 + * enter write-out mode
6940 + */
6941 +static inline void writeout_mode_enable(void)
6942 +{
6943 + assert("zam-941", !get_current_context()->writeout_mode);
6944 + get_current_context()->writeout_mode = 1;
6945 +}
6946 +
6947 +/*
6948 + * leave write-out mode
6949 + */
6950 +static inline void writeout_mode_disable(void)
6951 +{
6952 + assert("zam-942", get_current_context()->writeout_mode);
6953 + get_current_context()->writeout_mode = 0;
6954 +}
6955 +
6956 +static inline void grab_space_enable(void)
6957 +{
6958 + get_current_context()->grab_enabled = 1;
6959 +}
6960 +
6961 +static inline void grab_space_disable(void)
6962 +{
6963 + get_current_context()->grab_enabled = 0;
6964 +}
6965 +
6966 +static inline void grab_space_set_enabled(int enabled)
6967 +{
6968 + get_current_context()->grab_enabled = enabled;
6969 +}
6970 +
6971 +static inline int is_grab_enabled(reiser4_context * ctx)
6972 +{
6973 + return ctx->grab_enabled;
6974 +}
6975 +
6976 +/* mark the transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit
6977 + * or flush is performed when it is closed. This is necessary when the handle
6978 + * has to be closed under some coarse semaphore, like the i_mutex of a
6979 + * directory. The commit will be performed by ktxnmgrd. */
6980 +static inline void context_set_commit_async(reiser4_context * context)
6981 +{
6982 + context->nobalance = 1;
6983 + context->trans->flags |= TXNH_DONT_COMMIT;
6984 +}
6985 +
6986 +/* __REISER4_CONTEXT_H__ */
6987 +#endif
6988 +
6989 +/* Make Linus happy.
6990 + Local variables:
6991 + c-indentation-style: "K&R"
6992 + mode-name: "LC"
6993 + c-basic-offset: 8
6994 + tab-width: 8
6995 + fill-column: 120
6996 + scroll-step: 1
6997 + End:
6998 +*/
6999 diff -urN linux-2.6.33.orig/fs/reiser4/coord.c linux-2.6.33/fs/reiser4/coord.c
7000 --- linux-2.6.33.orig/fs/reiser4/coord.c 1970-01-01 01:00:00.000000000 +0100
7001 +++ linux-2.6.33/fs/reiser4/coord.c 2010-03-04 19:33:22.000000000 +0100
7002 @@ -0,0 +1,928 @@
7003 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
7004 + reiser4/README */
7005 +
7006 +#include "forward.h"
7007 +#include "debug.h"
7008 +#include "dformat.h"
7009 +#include "tree.h"
7010 +#include "plugin/item/item.h"
7011 +#include "znode.h"
7012 +#include "coord.h"
7013 +
7014 +/* Internal constructor. */
7015 +static inline void
7016 +coord_init_values(coord_t *coord, const znode * node, pos_in_node_t item_pos,
7017 + pos_in_node_t unit_pos, between_enum between)
7018 +{
7019 + coord->node = (znode *) node;
7020 + coord_set_item_pos(coord, item_pos);
7021 + coord->unit_pos = unit_pos;
7022 + coord->between = between;
7023 + ON_DEBUG(coord->plug_v = 0);
7024 + ON_DEBUG(coord->body_v = 0);
7025 +
7026 + /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord,
7027 + node, item_pos, unit_pos, coord_tween_tostring (between)); */
7028 +}
7029 +
7030 +/* after shifting of node content, a coord that was previously set properly
7031 +   may become invalid; try to "normalize" it. */
7032 +void coord_normalize(coord_t *coord)
7033 +{
7034 + znode *node;
7035 +
7036 + node = coord->node;
7037 + assert("vs-683", node);
7038 +
7039 + coord_clear_iplug(coord);
7040 +
7041 + if (node_is_empty(node)) {
7042 + coord_init_first_unit(coord, node);
7043 + } else if ((coord->between == AFTER_ITEM)
7044 + || (coord->between == AFTER_UNIT)) {
7045 + return;
7046 + } else if (coord->item_pos == coord_num_items(coord)
7047 + && coord->between == BEFORE_ITEM) {
7048 + coord_dec_item_pos(coord);
7049 + coord->between = AFTER_ITEM;
7050 + } else if (coord->unit_pos == coord_num_units(coord)
7051 + && coord->between == BEFORE_UNIT) {
7052 + coord->unit_pos--;
7053 + coord->between = AFTER_UNIT;
7054 + } else if (coord->item_pos == coord_num_items(coord)
7055 + && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
7056 + coord_dec_item_pos(coord);
7057 + coord->unit_pos = 0;
7058 + coord->between = AFTER_ITEM;
7059 + }
7060 +}
7061 +
7062 +/* Copy a coordinate. */
7063 +void coord_dup(coord_t *coord, const coord_t *old_coord)
7064 +{
7065 + assert("jmacd-9800", coord_check(old_coord));
7066 + coord_dup_nocheck(coord, old_coord);
7067 +}
7068 +
7069 +/* Copy a coordinate without check. Useful when old_coord->node is not
7070 + loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
7071 +void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord)
7072 +{
7073 + coord->node = old_coord->node;
7074 + coord_set_item_pos(coord, old_coord->item_pos);
7075 + coord->unit_pos = old_coord->unit_pos;
7076 + coord->between = old_coord->between;
7077 + coord->iplugid = old_coord->iplugid;
7078 + ON_DEBUG(coord->plug_v = old_coord->plug_v);
7079 + ON_DEBUG(coord->body_v = old_coord->body_v);
7080 +}
7081 +
7082 +/* Initialize an invalid coordinate. */
7083 +void coord_init_invalid(coord_t *coord, const znode * node)
7084 +{
7085 + coord_init_values(coord, node, 0, 0, INVALID_COORD);
7086 +}
7087 +
7088 +void coord_init_first_unit_nocheck(coord_t *coord, const znode * node)
7089 +{
7090 + coord_init_values(coord, node, 0, 0, AT_UNIT);
7091 +}
7092 +
7093 +/* Initialize a coordinate to point at the first unit of the first item. If the
7094 + node is empty, it is positioned at the EMPTY_NODE. */
7095 +void coord_init_first_unit(coord_t *coord, const znode * node)
7096 +{
7097 + int is_empty = node_is_empty(node);
7098 +
7099 + coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
7100 +
7101 + assert("jmacd-9801", coord_check(coord));
7102 +}
7103 +
7104 +/* Initialize a coordinate to point at the last unit of the last item. If the
7105 + node is empty, it is positioned at the EMPTY_NODE. */
7106 +void coord_init_last_unit(coord_t *coord, const znode * node)
7107 +{
7108 + int is_empty = node_is_empty(node);
7109 +
7110 + coord_init_values(coord, node,
7111 + (is_empty ? 0 : node_num_items(node) - 1), 0,
7112 + (is_empty ? EMPTY_NODE : AT_UNIT));
7113 + if (!is_empty)
7114 + coord->unit_pos = coord_last_unit_pos(coord);
7115 + assert("jmacd-9802", coord_check(coord));
7116 +}
7117 +
7118 +/* Initialize a coordinate to before the first item. If the node is empty, it is
7119 + positioned at the EMPTY_NODE. */
7120 +void coord_init_before_first_item(coord_t *coord, const znode * node)
7121 +{
7122 + int is_empty = node_is_empty(node);
7123 +
7124 + coord_init_values(coord, node, 0, 0,
7125 + (is_empty ? EMPTY_NODE : BEFORE_UNIT));
7126 +
7127 + assert("jmacd-9803", coord_check(coord));
7128 +}
7129 +
7130 +/* Initialize a coordinate to after the last item. If the node is empty, it is
7131 + positioned at the EMPTY_NODE. */
7132 +void coord_init_after_last_item(coord_t *coord, const znode * node)
7133 +{
7134 + int is_empty = node_is_empty(node);
7135 +
7136 + coord_init_values(coord, node,
7137 + (is_empty ? 0 : node_num_items(node) - 1), 0,
7138 + (is_empty ? EMPTY_NODE : AFTER_ITEM));
7139 +
7140 + assert("jmacd-9804", coord_check(coord));
7141 +}
7142 +
7143 +/* Initialize a coordinate to after the last unit in the item. Coord must
7144 +   already be set to an existing item */
7145 +void coord_init_after_item_end(coord_t *coord)
7146 +{
7147 + coord->between = AFTER_UNIT;
7148 + coord->unit_pos = coord_last_unit_pos(coord);
7149 +}
7150 +
7151 +/* Initialize a coordinate to before the item. Coord must already be set to
7152 +   an existing item */
7153 +void coord_init_before_item(coord_t *coord)
7154 +{
7155 + coord->unit_pos = 0;
7156 + coord->between = BEFORE_ITEM;
7157 +}
7158 +
7159 +/* Initialize a coordinate to after the item. Coord must already be set to
7160 +   an existing item */
7161 +void coord_init_after_item(coord_t *coord)
7162 +{
7163 + coord->unit_pos = 0;
7164 + coord->between = AFTER_ITEM;
7165 +}
7166 +
7167 +/* Initialize a coordinate with zeros. Used in places where init_coord was
7168 +   used and it was not clear how it was actually supposed to be set up */
7169 +void coord_init_zero(coord_t *coord)
7170 +{
7171 + memset(coord, 0, sizeof(*coord));
7172 +}
7173 +
7174 +/* Return the number of units at the present item.
7175 + Asserts coord_is_existing_item(). */
7176 +unsigned coord_num_units(const coord_t *coord)
7177 +{
7178 + assert("jmacd-9806", coord_is_existing_item(coord));
7179 +
7180 + return item_plugin_by_coord(coord)->b.nr_units(coord);
7181 +}
7182 +
7183 +/* Returns true if the coord was initialized by coord_init_invalid(). */
7184 +/* Audited by: green(2002.06.15) */
7185 +int coord_is_invalid(const coord_t *coord)
7186 +{
7187 + return coord->between == INVALID_COORD;
7188 +}
7189 +
7190 +/* Returns true if the coordinate is positioned at an existing item, not before
7191 + or after an item. It may be placed at, before, or after any unit within the
7192 + item, whether existing or not. */
7193 +int coord_is_existing_item(const coord_t *coord)
7194 +{
7195 + switch (coord->between) {
7196 + case EMPTY_NODE:
7197 + case BEFORE_ITEM:
7198 + case AFTER_ITEM:
7199 + case INVALID_COORD:
7200 + return 0;
7201 +
7202 + case BEFORE_UNIT:
7203 + case AT_UNIT:
7204 + case AFTER_UNIT:
7205 + return coord->item_pos < coord_num_items(coord);
7206 + }
7207 +
7208 + impossible("jmacd-9900", "unreachable coord: %p", coord);
7209 + return 0;
7210 +}
7211 +
7212 +/* Returns true if the coordinate is positioned at an existing unit, not before
7213 + or after a unit. */
7214 +/* Audited by: green(2002.06.15) */
7215 +int coord_is_existing_unit(const coord_t *coord)
7216 +{
7217 + switch (coord->between) {
7218 + case EMPTY_NODE:
7219 + case BEFORE_UNIT:
7220 + case AFTER_UNIT:
7221 + case BEFORE_ITEM:
7222 + case AFTER_ITEM:
7223 + case INVALID_COORD:
7224 + return 0;
7225 +
7226 + case AT_UNIT:
7227 + return (coord->item_pos < coord_num_items(coord)
7228 + && coord->unit_pos < coord_num_units(coord));
7229 + }
7230 +
7231 + impossible("jmacd-9902", "unreachable");
7232 + return 0;
7233 +}
7234 +
7235 +/* Returns true if the coordinate is positioned at the first unit of the first
7236 + item. Not true for empty nodes nor coordinates positioned before the first
7237 + item. */
7238 +/* Audited by: green(2002.06.15) */
7239 +int coord_is_leftmost_unit(const coord_t *coord)
7240 +{
7241 + return (coord->between == AT_UNIT && coord->item_pos == 0
7242 + && coord->unit_pos == 0);
7243 +}
7244 +
7245 +#if REISER4_DEBUG
7246 +/* For assertions only, checks for a valid coordinate. */
7247 +int coord_check(const coord_t *coord)
7248 +{
7249 + if (coord->node == NULL)
7250 + return 0;
7251 + if (znode_above_root(coord->node))
7252 + return 1;
7253 +
7254 + switch (coord->between) {
7255 + default:
7256 + case INVALID_COORD:
7257 + return 0;
7258 + case EMPTY_NODE:
7259 + if (!node_is_empty(coord->node))
7260 + return 0;
7261 + return coord->item_pos == 0 && coord->unit_pos == 0;
7262 +
7263 + case BEFORE_UNIT:
7264 + case AFTER_UNIT:
7265 + if (node_is_empty(coord->node) && (coord->item_pos == 0)
7266 + && (coord->unit_pos == 0))
7267 + return 1;
7268 +	case AT_UNIT: /* BEFORE_UNIT/AFTER_UNIT fall through to here */
7269 + break;
7270 + case AFTER_ITEM:
7271 + case BEFORE_ITEM:
7272 + /* before/after item should not set unit_pos. */
7273 + if (coord->unit_pos != 0)
7274 + return 0;
7275 + break;
7276 + }
7277 +
7278 + if (coord->item_pos >= node_num_items(coord->node))
7279 + return 0;
7280 +
7281 + /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7282 +	   between is set to either AFTER_ITEM or BEFORE_ITEM */
7283 + if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7284 + return 1;
7285 +
7286 + if (coord_is_iplug_set(coord) &&
7287 + coord->unit_pos >
7288 + item_plugin_by_coord(coord)->b.nr_units(coord) - 1)
7289 + return 0;
7290 + return 1;
7291 +}
7292 +#endif
7293 +
7294 +/* Adjust coordinate boundaries based on the number of items prior to
7295 +   coord_next/prev. Returns 1 if the new position does not exist. */
7296 +static int coord_adjust_items(coord_t *coord, unsigned items, int is_next)
7297 +{
7298 + /* If the node is invalid, leave it. */
7299 + if (coord->between == INVALID_COORD)
7300 + return 1;
7301 +
7302 + /* If the node is empty, set it appropriately. */
7303 + if (items == 0) {
7304 + coord->between = EMPTY_NODE;
7305 + coord_set_item_pos(coord, 0);
7306 + coord->unit_pos = 0;
7307 + return 1;
7308 + }
7309 +
7310 + /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7311 + if (coord->between == EMPTY_NODE) {
7312 + coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7313 + coord_set_item_pos(coord, 0);
7314 + coord->unit_pos = 0;
7315 + return 0;
7316 + }
7317 +
7318 +	/* If the item_pos is out-of-range, set it appropriately. */
7319 + if (coord->item_pos >= items) {
7320 + coord->between = AFTER_ITEM;
7321 + coord_set_item_pos(coord, items - 1);
7322 + coord->unit_pos = 0;
7323 + /* If is_next, return 1 (can't go any further). */
7324 + return is_next;
7325 + }
7326 +
7327 + return 0;
7328 +}
7329 +
7330 +/* Advances the coordinate by one unit to the right. If empty, no change. If
7331 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new
7332 + position is an existing unit. */
7333 +int coord_next_unit(coord_t *coord)
7334 +{
7335 + unsigned items = coord_num_items(coord);
7336 +
7337 + if (coord_adjust_items(coord, items, 1) == 1)
7338 + return 1;
7339 +
7340 + switch (coord->between) {
7341 + case BEFORE_UNIT:
7342 + /* Now it is positioned at the same unit. */
7343 + coord->between = AT_UNIT;
7344 + return 0;
7345 +
7346 + case AFTER_UNIT:
7347 + case AT_UNIT:
7348 + /* If it was at or after a unit and there are more units in this
7349 + item, advance to the next one. */
7350 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7351 + coord->unit_pos += 1;
7352 + coord->between = AT_UNIT;
7353 + return 0;
7354 + }
7355 +
7356 + /* Otherwise, it is crossing an item boundary and treated as if
7357 + it was after the current item. */
7358 + coord->between = AFTER_ITEM;
7359 + coord->unit_pos = 0;
7360 + /* FALLTHROUGH */
7361 +
7362 + case AFTER_ITEM:
7363 + /* Check for end-of-node. */
7364 + if (coord->item_pos == items - 1)
7365 + return 1;
7366 +
7367 + coord_inc_item_pos(coord);
7368 + coord->unit_pos = 0;
7369 + coord->between = AT_UNIT;
7370 + return 0;
7371 +
7372 + case BEFORE_ITEM:
7373 + /* The adjust_items checks ensure that we are valid here. */
7374 + coord->unit_pos = 0;
7375 + coord->between = AT_UNIT;
7376 + return 0;
7377 +
7378 + case INVALID_COORD:
7379 + case EMPTY_NODE:
7380 + /* Handled in coord_adjust_items(). */
7381 + break;
7382 + }
7383 +
7384 + impossible("jmacd-9902", "unreachable");
7385 + return 0;
7386 +}
7387 +
7388 +/* Advances the coordinate by one item to the right. If empty, no change. If
7389 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new
7390 + position is an existing item. */
7391 +int coord_next_item(coord_t *coord)
7392 +{
7393 + unsigned items = coord_num_items(coord);
7394 +
7395 + if (coord_adjust_items(coord, items, 1) == 1)
7396 + return 1;
7397 +
7398 + switch (coord->between) {
7399 + case AFTER_UNIT:
7400 + case AT_UNIT:
7401 + case BEFORE_UNIT:
7402 + case AFTER_ITEM:
7403 + /* Check for end-of-node. */
7404 + if (coord->item_pos == items - 1) {
7405 + coord->between = AFTER_ITEM;
7406 + coord->unit_pos = 0;
7407 + coord_clear_iplug(coord);
7408 + return 1;
7409 + }
7410 +
7411 + /* Anywhere in an item, go to the next one. */
7412 + coord->between = AT_UNIT;
7413 + coord_inc_item_pos(coord);
7414 + coord->unit_pos = 0;
7415 + return 0;
7416 +
7417 + case BEFORE_ITEM:
7418 + /* The out-of-range check ensures that we are valid here. */
7419 + coord->unit_pos = 0;
7420 + coord->between = AT_UNIT;
7421 + return 0;
7422 + case INVALID_COORD:
7423 + case EMPTY_NODE:
7424 + /* Handled in coord_adjust_items(). */
7425 + break;
7426 + }
7427 +
7428 + impossible("jmacd-9903", "unreachable");
7429 + return 0;
7430 +}
7431 +
7432 +/* Advances the coordinate by one unit to the left. If empty, no change. If
7433 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new
7434 + position is an existing unit. */
7435 +int coord_prev_unit(coord_t *coord)
7436 +{
7437 + unsigned items = coord_num_items(coord);
7438 +
7439 + if (coord_adjust_items(coord, items, 0) == 1)
7440 + return 1;
7441 +
7442 + switch (coord->between) {
7443 + case AT_UNIT:
7444 + case BEFORE_UNIT:
7445 + if (coord->unit_pos > 0) {
7446 + coord->unit_pos -= 1;
7447 + coord->between = AT_UNIT;
7448 + return 0;
7449 + }
7450 +
7451 + if (coord->item_pos == 0) {
7452 + coord->between = BEFORE_ITEM;
7453 + return 1;
7454 + }
7455 +
7456 + coord_dec_item_pos(coord);
7457 + coord->unit_pos = coord_last_unit_pos(coord);
7458 + coord->between = AT_UNIT;
7459 + return 0;
7460 +
7461 + case AFTER_UNIT:
7462 + /* What if unit_pos is out-of-range? */
7463 + assert("jmacd-5442",
7464 + coord->unit_pos <= coord_last_unit_pos(coord));
7465 + coord->between = AT_UNIT;
7466 + return 0;
7467 +
7468 + case BEFORE_ITEM:
7469 + if (coord->item_pos == 0)
7470 + return 1;
7471 +
7472 + coord_dec_item_pos(coord);
7473 + /* FALLTHROUGH */
7474 +
7475 + case AFTER_ITEM:
7476 + coord->between = AT_UNIT;
7477 + coord->unit_pos = coord_last_unit_pos(coord);
7478 + return 0;
7479 +
7480 + case INVALID_COORD:
7481 + case EMPTY_NODE:
7482 + break;
7483 + }
7484 +
7485 + impossible("jmacd-9904", "unreachable");
7486 + return 0;
7487 +}
7488 +
7489 +/* Advances the coordinate by one item to the left. If empty, no change. If
7490 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new
7491 + position is an existing item. */
7492 +int coord_prev_item(coord_t *coord)
7493 +{
7494 + unsigned items = coord_num_items(coord);
7495 +
7496 + if (coord_adjust_items(coord, items, 0) == 1)
7497 + return 1;
7498 +
7499 + switch (coord->between) {
7500 + case AT_UNIT:
7501 + case AFTER_UNIT:
7502 + case BEFORE_UNIT:
7503 + case BEFORE_ITEM:
7504 +
7505 + if (coord->item_pos == 0) {
7506 + coord->between = BEFORE_ITEM;
7507 + coord->unit_pos = 0;
7508 + return 1;
7509 + }
7510 +
7511 + coord_dec_item_pos(coord);
7512 + coord->unit_pos = 0;
7513 + coord->between = AT_UNIT;
7514 + return 0;
7515 +
7516 + case AFTER_ITEM:
7517 + coord->between = AT_UNIT;
7518 + coord->unit_pos = 0;
7519 + return 0;
7520 +
7521 + case INVALID_COORD:
7522 + case EMPTY_NODE:
7523 + break;
7524 + }
7525 +
7526 + impossible("jmacd-9905", "unreachable");
7527 + return 0;
7528 +}
7529 +
7530 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on
7531 +   the sideof argument. */
7532 +void coord_init_sideof_unit(coord_t *coord, const znode * node, sideof dir)
7533 +{
7534 + assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7535 + if (dir == LEFT_SIDE) {
7536 + coord_init_first_unit(coord, node);
7537 + } else {
7538 + coord_init_last_unit(coord, node);
7539 + }
7540 +}
7541 +
7542 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending
7543 +   on the sideof argument. */
7544 +/* Audited by: green(2002.06.15) */
7545 +int coord_is_after_sideof_unit(coord_t *coord, sideof dir)
7546 +{
7547 + assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7548 + if (dir == LEFT_SIDE) {
7549 + return coord_is_before_leftmost(coord);
7550 + } else {
7551 + return coord_is_after_rightmost(coord);
7552 + }
7553 +}
7554 +
7555 +/* Calls either coord_next_unit or coord_prev_unit depending on the sideof
7556 +   argument. */
7557 +/* Audited by: green(2002.06.15) */
7558 +int coord_sideof_unit(coord_t *coord, sideof dir)
7559 +{
7560 + assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7561 + if (dir == LEFT_SIDE) {
7562 + return coord_prev_unit(coord);
7563 + } else {
7564 + return coord_next_unit(coord);
7565 + }
7566 +}
7567 +
7568 +#if REISER4_DEBUG
7569 +int coords_equal(const coord_t *c1, const coord_t *c2)
7570 +{
7571 + assert("nikita-2840", c1 != NULL);
7572 + assert("nikita-2841", c2 != NULL);
7573 +
7574 + return
7575 + c1->node == c2->node &&
7576 + c1->item_pos == c2->item_pos &&
7577 + c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7578 +}
7579 +#endif /* REISER4_DEBUG */
7580 +
7581 +/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if
7582 +   coord_is_before_leftmost return COORD_ON_THE_LEFT, otherwise return
7583 +   COORD_INSIDE. */
7584 +/* Audited by: green(2002.06.15) */
7585 +coord_wrt_node coord_wrt(const coord_t *coord)
7586 +{
7587 + if (coord_is_before_leftmost(coord))
7588 + return COORD_ON_THE_LEFT;
7589 +
7590 + if (coord_is_after_rightmost(coord))
7591 + return COORD_ON_THE_RIGHT;
7592 +
7593 + return COORD_INSIDE;
7594 +}
7595 +
7596 +/* Returns true if the coordinate is positioned after the last item, after the
7597 +   last unit of the last item, or at an empty node. */
7598 +/* Audited by: green(2002.06.15) */
7599 +int coord_is_after_rightmost(const coord_t *coord)
7600 +{
7601 + assert("jmacd-7313", coord_check(coord));
7602 +
7603 + switch (coord->between) {
7604 + case INVALID_COORD:
7605 + case AT_UNIT:
7606 + case BEFORE_UNIT:
7607 + case BEFORE_ITEM:
7608 + return 0;
7609 +
7610 + case EMPTY_NODE:
7611 + return 1;
7612 +
7613 + case AFTER_ITEM:
7614 + return (coord->item_pos == node_num_items(coord->node) - 1);
7615 +
7616 + case AFTER_UNIT:
7617 + return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7618 + coord->unit_pos == coord_last_unit_pos(coord));
7619 + }
7620 +
7621 + impossible("jmacd-9908", "unreachable");
7622 + return 0;
7623 +}
7624 +
7625 +/* Returns true if the coordinate is positioned before the first item or at an
7626 +   empty node. */
7627 +int coord_is_before_leftmost(const coord_t *coord)
7628 +{
7629 + /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7630 + necessary to check if coord is set before leftmost
7631 + assert ("jmacd-7313", coord_check (coord)); */
7632 + switch (coord->between) {
7633 + case INVALID_COORD:
7634 + case AT_UNIT:
7635 + case AFTER_ITEM:
7636 + case AFTER_UNIT:
7637 + return 0;
7638 +
7639 + case EMPTY_NODE:
7640 + return 1;
7641 +
7642 + case BEFORE_ITEM:
7643 + case BEFORE_UNIT:
7644 + return (coord->item_pos == 0) && (coord->unit_pos == 0);
7645 + }
7646 +
7647 + impossible("jmacd-9908", "unreachable");
7648 + return 0;
7649 +}
7650 +
7651 +/* Returns true if the coordinate is positioned after an item, before an item,
7652 + after the last unit of an item, before the first unit of an item, or at an
7653 + empty node. */
7654 +/* Audited by: green(2002.06.15) */
7655 +int coord_is_between_items(const coord_t *coord)
7656 +{
7657 + assert("jmacd-7313", coord_check(coord));
7658 +
7659 + switch (coord->between) {
7660 + case INVALID_COORD:
7661 + case AT_UNIT:
7662 + return 0;
7663 +
7664 + case AFTER_ITEM:
7665 + case BEFORE_ITEM:
7666 + case EMPTY_NODE:
7667 + return 1;
7668 +
7669 + case BEFORE_UNIT:
7670 + return coord->unit_pos == 0;
7671 +
7672 + case AFTER_UNIT:
7673 + return coord->unit_pos == coord_last_unit_pos(coord);
7674 + }
7675 +
7676 + impossible("jmacd-9908", "unreachable");
7677 + return 0;
7678 +}
7679 +
7680 +#if REISER4_DEBUG
7681 +/* Returns true if the coordinates are positioned at adjacent units, regardless
7682 + of before-after or item boundaries. */
7683 +int coord_are_neighbors(coord_t *c1, coord_t *c2)
7684 +{
7685 + coord_t *left;
7686 + coord_t *right;
7687 +
7688 + assert("nikita-1241", c1 != NULL);
7689 + assert("nikita-1242", c2 != NULL);
7690 + assert("nikita-1243", c1->node == c2->node);
7691 + assert("nikita-1244", coord_is_existing_unit(c1));
7692 + assert("nikita-1245", coord_is_existing_unit(c2));
7693 +
7694 + left = right = NULL;
7695 + switch (coord_compare(c1, c2)) {
7696 + case COORD_CMP_ON_LEFT:
7697 + left = c1;
7698 + right = c2;
7699 + break;
7700 + case COORD_CMP_ON_RIGHT:
7701 + left = c2;
7702 + right = c1;
7703 + break;
7704 + case COORD_CMP_SAME:
7705 + return 0;
7706 + default:
7707 + wrong_return_value("nikita-1246", "compare_coords()");
7708 + }
7709 + assert("vs-731", left && right);
7710 + if (left->item_pos == right->item_pos) {
7711 + return left->unit_pos + 1 == right->unit_pos;
7712 + } else if (left->item_pos + 1 == right->item_pos) {
7713 + return (left->unit_pos == coord_last_unit_pos(left))
7714 + && (right->unit_pos == 0);
7715 + } else {
7716 + return 0;
7717 + }
7718 +}
7719 +#endif /* REISER4_DEBUG */
7720 +
7721 +/* Assuming two coordinates are positioned in the same node, return
7722 + COORD_CMP_ON_RIGHT, COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's
7723 + position relative to c2. */
7724 +/* Audited by: green(2002.06.15) */
7725 +coord_cmp coord_compare(coord_t *c1, coord_t *c2)
7726 +{
7727 + assert("vs-209", c1->node == c2->node);
7728 + assert("vs-194", coord_is_existing_unit(c1)
7729 + && coord_is_existing_unit(c2));
7730 +
7731 + if (c1->item_pos > c2->item_pos)
7732 + return COORD_CMP_ON_RIGHT;
7733 + if (c1->item_pos < c2->item_pos)
7734 + return COORD_CMP_ON_LEFT;
7735 + if (c1->unit_pos > c2->unit_pos)
7736 + return COORD_CMP_ON_RIGHT;
7737 + if (c1->unit_pos < c2->unit_pos)
7738 + return COORD_CMP_ON_LEFT;
7739 + return COORD_CMP_SAME;
7740 +}
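+
+/*
+ * For example (hypothetical positions in one node): a coord at item 2,
+ * unit 0 compares COORD_CMP_ON_RIGHT against a coord at item 1, unit 3,
+ * because item_pos is compared first and unit_pos only breaks ties.
+ */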
7741 +
7742 +/* If the coordinate is between items, shifts it to the right. Returns 0 on
7743 + success and non-zero if there is no position to the right. */
7744 +int coord_set_to_right(coord_t *coord)
7745 +{
7746 + unsigned items = coord_num_items(coord);
7747 +
7748 + if (coord_adjust_items(coord, items, 1) == 1)
7749 + return 1;
7750 +
7751 + switch (coord->between) {
7752 + case AT_UNIT:
7753 + return 0;
7754 +
7755 + case BEFORE_ITEM:
7756 + case BEFORE_UNIT:
7757 + coord->between = AT_UNIT;
7758 + return 0;
7759 +
7760 + case AFTER_UNIT:
7761 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7762 + coord->unit_pos += 1;
7763 + coord->between = AT_UNIT;
7764 + return 0;
7765 + } else {
7766 +
7767 + coord->unit_pos = 0;
7768 +
7769 + if (coord->item_pos == items - 1) {
7770 + coord->between = AFTER_ITEM;
7771 + return 1;
7772 + }
7773 +
7774 + coord_inc_item_pos(coord);
7775 + coord->between = AT_UNIT;
7776 + return 0;
7777 + }
7778 +
7779 + case AFTER_ITEM:
7780 + if (coord->item_pos == items - 1)
7781 + return 1;
7782 +
7783 + coord_inc_item_pos(coord);
7784 + coord->unit_pos = 0;
7785 + coord->between = AT_UNIT;
7786 + return 0;
7787 +
7788 + case EMPTY_NODE:
7789 + return 1;
7790 +
7791 + case INVALID_COORD:
7792 + break;
7793 + }
7794 +
7795 + impossible("jmacd-9920", "unreachable");
7796 + return 0;
7797 +}
7798 +
7799 +/* If the coordinate is between items, shifts it to the left. Returns 0 on
7800 + success and non-zero if there is no position to the left. */
7801 +int coord_set_to_left(coord_t *coord)
7802 +{
7803 + unsigned items = coord_num_items(coord);
7804 +
7805 + if (coord_adjust_items(coord, items, 0) == 1)
7806 + return 1;
7807 +
7808 + switch (coord->between) {
7809 + case AT_UNIT:
7810 + return 0;
7811 +
7812 + case AFTER_UNIT:
7813 + coord->between = AT_UNIT;
7814 + return 0;
7815 +
7816 + case AFTER_ITEM:
7817 + coord->between = AT_UNIT;
7818 + coord->unit_pos = coord_last_unit_pos(coord);
7819 + return 0;
7820 +
7821 + case BEFORE_UNIT:
7822 + if (coord->unit_pos > 0) {
7823 + coord->unit_pos -= 1;
7824 + coord->between = AT_UNIT;
7825 + return 0;
7826 + } else {
7827 +
7828 + if (coord->item_pos == 0) {
7829 + coord->between = BEFORE_ITEM;
7830 + return 1;
7831 + }
7832 +
7833 + coord->unit_pos = coord_last_unit_pos(coord);
7834 + coord_dec_item_pos(coord);
7835 + coord->between = AT_UNIT;
7836 + return 0;
7837 + }
7838 +
7839 + case BEFORE_ITEM:
7840 + if (coord->item_pos == 0)
7841 + return 1;
7842 +
7843 + coord_dec_item_pos(coord);
7844 + coord->unit_pos = coord_last_unit_pos(coord);
7845 + coord->between = AT_UNIT;
7846 + return 0;
7847 +
7848 + case EMPTY_NODE:
7849 + return 1;
7850 +
7851 + case INVALID_COORD:
7852 + break;
7853 + }
7854 +
7855 + impossible("jmacd-9920", "unreachable");
7856 + return 0;
7857 +}
7858 +
7859 +static const char *coord_tween_tostring(between_enum n)
7860 +{
7861 + switch (n) {
7862 + case BEFORE_UNIT:
7863 + return "before unit";
7864 + case BEFORE_ITEM:
7865 + return "before item";
7866 + case AT_UNIT:
7867 + return "at unit";
7868 + case AFTER_UNIT:
7869 + return "after unit";
7870 + case AFTER_ITEM:
7871 + return "after item";
7872 + case EMPTY_NODE:
7873 + return "empty node";
7874 + case INVALID_COORD:
7875 + return "invalid";
7876 + default:
7877 + {
7878 + static char buf[30];
7879 +
7880 + sprintf(buf, "unknown: %i", n);
7881 + return buf;
7882 + }
7883 + }
7884 +}
7885 +
7886 +void print_coord(const char *mes, const coord_t *coord, int node)
7887 +{
7888 + if (coord == NULL) {
7889 + printk("%s: null\n", mes);
7890 + return;
7891 + }
7892 + printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7893 + mes, coord->item_pos, coord->unit_pos,
7894 + coord_tween_tostring(coord->between), coord->iplugid);
7895 +}
7896 +
7897 +int
7898 +item_utmost_child_real_block(const coord_t *coord, sideof side,
7899 + reiser4_block_nr * blk)
7900 +{
7901 + return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7902 + side,
7903 + blk);
7904 +}
7905 +
7906 +int item_utmost_child(const coord_t *coord, sideof side, jnode ** child)
7907 +{
7908 + return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
7909 +}
7910 +
7911 +/* @count bytes of flow @f got written, update correspondingly f->length,
7912 + f->data and f->key */
7913 +void move_flow_forward(flow_t *f, unsigned count)
7914 +{
7915 + if (f->data)
7916 + f->data += count;
7917 + f->length -= count;
7918 + set_key_offset(&f->key, get_key_offset(&f->key) + count);
7919 +}
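+
+/*
+ * Example (illustrative): for a flow that starts at key offset 0 with
+ * length 8192, move_flow_forward(f, 4096) leaves f->length == 4096 and
+ * f->key at offset 4096, advancing f->data by the same amount when the
+ * flow carries a buffer.
+ */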
7920 +
7921 +/*
7922 + Local variables:
7923 + c-indentation-style: "K&R"
7924 + mode-name: "LC"
7925 + c-basic-offset: 8
7926 + tab-width: 8
7927 + fill-column: 120
7928 + scroll-step: 1
7929 + End:
7930 +*/
7931 diff -urN linux-2.6.33.orig/fs/reiser4/coord.h linux-2.6.33/fs/reiser4/coord.h
7932 --- linux-2.6.33.orig/fs/reiser4/coord.h 1970-01-01 01:00:00.000000000 +0100
7933 +++ linux-2.6.33/fs/reiser4/coord.h 2010-03-04 19:33:22.000000000 +0100
7934 @@ -0,0 +1,399 @@
7935 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
7936 + reiser4/README */
7937 +
7938 +/* Coords */
7939 +
7940 +#if !defined(__REISER4_COORD_H__)
7941 +#define __REISER4_COORD_H__
7942 +
7943 +#include "forward.h"
7944 +#include "debug.h"
7945 +#include "dformat.h"
7946 +#include "key.h"
7947 +
7948 +/* insertions happen between coords in the tree, so we need some means
7949 + of specifying the sense of betweenness. */
7950 +typedef enum {
7951 +	BEFORE_UNIT, /* Note: init_coord depends on this value being zero. */
7952 + AT_UNIT,
7953 + AFTER_UNIT,
7954 + BEFORE_ITEM,
7955 + AFTER_ITEM,
7956 + INVALID_COORD,
7957 + EMPTY_NODE,
7958 +} between_enum;
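+
+/*
+ * Example (hypothetical two-unit item at item_pos 0): an insertion point
+ * between its units can be expressed as { item_pos 0, unit_pos 1,
+ * BEFORE_UNIT } or, equivalently, { item_pos 0, unit_pos 0, AFTER_UNIT };
+ * EMPTY_NODE and INVALID_COORD describe coords that address no unit at all.
+ */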
7959 +
7960 +/* location of coord w.r.t. its node */
7961 +typedef enum {
7962 + COORD_ON_THE_LEFT = -1,
7963 + COORD_ON_THE_RIGHT = +1,
7964 + COORD_INSIDE = 0
7965 +} coord_wrt_node;
7966 +
7967 +typedef enum {
7968 + COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
7969 +} coord_cmp;
7970 +
7971 +struct coord {
7972 + /* node in a tree */
7973 + /* 0 */ znode *node;
7974 +
7975 + /* position of item within node */
7976 + /* 4 */ pos_in_node_t item_pos;
7977 + /* position of unit within item */
7978 + /* 6 */ pos_in_node_t unit_pos;
7979 + /* optimization: plugin of item is stored in coord_t. Until this was
7980 + implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
7981 + is invalidated (set to 0xff) on each modification of ->item_pos,
7982 + and all such modifications are funneled through coord_*_item_pos()
7983 + functions below.
7984 + */
7985 + /* 8 */ char iplugid;
7986 + /* position of coord w.r.t. to neighboring items and/or units.
7987 + Values are taken from &between_enum above.
7988 + */
7989 + /* 9 */ char between;
7990 + /* padding. It will be added by the compiler anyway to conform to the
7991 + * C language alignment requirements. We keep it here to be on the
7992 + * safe side and to have a clear picture of the memory layout of this
7993 + * structure. */
7994 + /* 10 */ __u16 pad;
7995 + /* 12 */ int offset;
7996 +#if REISER4_DEBUG
7997 + unsigned long plug_v;
7998 + unsigned long body_v;
7999 +#endif
8000 +};
8001 +
8002 +#define INVALID_PLUGID ((char)((1 << 8) - 1))
8003 +#define INVALID_OFFSET -1
8004 +
8005 +static inline void coord_clear_iplug(coord_t *coord)
8006 +{
8007 + assert("nikita-2835", coord != NULL);
8008 + coord->iplugid = INVALID_PLUGID;
8009 + coord->offset = INVALID_OFFSET;
8010 +}
8011 +
8012 +static inline int coord_is_iplug_set(const coord_t *coord)
8013 +{
8014 + assert("nikita-2836", coord != NULL);
8015 + return coord->iplugid != INVALID_PLUGID;
8016 +}
8017 +
8018 +static inline void coord_set_item_pos(coord_t *coord, pos_in_node_t pos)
8019 +{
8020 + assert("nikita-2478", coord != NULL);
8021 + coord->item_pos = pos;
8022 + coord_clear_iplug(coord);
8023 +}
8024 +
8025 +static inline void coord_dec_item_pos(coord_t *coord)
8026 +{
8027 + assert("nikita-2480", coord != NULL);
8028 + --coord->item_pos;
8029 + coord_clear_iplug(coord);
8030 +}
8031 +
8032 +static inline void coord_inc_item_pos(coord_t *coord)
8033 +{
8034 + assert("nikita-2481", coord != NULL);
8035 + ++coord->item_pos;
8036 + coord_clear_iplug(coord);
8037 +}
8038 +
8039 +static inline void coord_add_item_pos(coord_t *coord, int delta)
8040 +{
8041 + assert("nikita-2482", coord != NULL);
8042 + coord->item_pos += delta;
8043 + coord_clear_iplug(coord);
8044 +}
8045 +
8046 +static inline void coord_invalid_item_pos(coord_t *coord)
8047 +{
8048 + assert("nikita-2832", coord != NULL);
8049 + coord->item_pos = (unsigned short)~0;
8050 + coord_clear_iplug(coord);
8051 +}
8052 +
8053 +/* Reverse a direction. */
8054 +static inline sideof sideof_reverse(sideof side)
8055 +{
8056 + return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
8057 +}
8058 +
8059 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
8060 +
8061 + "first" and "last"
8062 + "next" and "prev"
8063 + "before" and "after"
8064 + "leftmost" and "rightmost"
8065 +
8066 + But I think the chosen names are decent the way they are.
8067 +*/
8068 +
8069 +/* COORD INITIALIZERS */
8070 +
8071 +/* Initialize an invalid coordinate. */
8072 +extern void coord_init_invalid(coord_t *coord, const znode * node);
8073 +
8074 +extern void coord_init_first_unit_nocheck(coord_t *coord, const znode * node);
8075 +
8076 +/* Initialize a coordinate to point at the first unit of the first item. If the
8077 + node is empty, it is positioned at the EMPTY_NODE. */
8078 +extern void coord_init_first_unit(coord_t *coord, const znode * node);
8079 +
8080 +/* Initialize a coordinate to point at the last unit of the last item. If the
8081 + node is empty, it is positioned at the EMPTY_NODE. */
8082 +extern void coord_init_last_unit(coord_t *coord, const znode * node);
8083 +
8084 +/* Initialize a coordinate to before the first item. If the node is empty, it is
8085 + positioned at the EMPTY_NODE. */
8086 +extern void coord_init_before_first_item(coord_t *coord, const znode * node);
8087 +
8088 +/* Initialize a coordinate to after the last item. If the node is empty, it is
8089 + positioned at the EMPTY_NODE. */
8090 +extern void coord_init_after_last_item(coord_t *coord, const znode * node);
8091 +
8092 +/* Initialize a coordinate to after the last unit in the item. Coord must
8093 +   already be set to an existing item */
8094 +void coord_init_after_item_end(coord_t *coord);
8095 +
8096 +/* Initialize a coordinate to before the item. Coord must already be set to
8097 +   an existing item */
8098 +void coord_init_before_item(coord_t *);
8099 +/* Initialize a coordinate to after the item. Coord must already be set to
8100 +   an existing item */
8101 +void coord_init_after_item(coord_t *);
8102 +
8103 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on
8104 +   the sideof argument. */
8105 +extern void coord_init_sideof_unit(coord_t *coord, const znode * node,
8106 + sideof dir);
8107 +
8108 +/* Initialize a coordinate by 0s. Used in places where init_coord was used and
8109 + it was not clear how actually
8110 + FIXME-VS: added by vs (2002, june, 8) */
8111 +extern void coord_init_zero(coord_t *coord);
8112 +
8113 +/* COORD METHODS */
8114 +
8115 +/* after shifting of node content, a coord that was previously set properly
8116 +   may become invalid; try to "normalize" it. */
8117 +void coord_normalize(coord_t *coord);
8118 +
8119 +/* Copy a coordinate. */
8120 +extern void coord_dup(coord_t *coord, const coord_t *old_coord);
8121 +
8122 +/* Copy a coordinate without check. */
8123 +void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord);
8124 +
8125 +unsigned coord_num_units(const coord_t *coord);
8126 +
8127 +/* Return the last valid unit number at the present item (i.e.,
8128 + coord_num_units() - 1). */
8129 +static inline unsigned coord_last_unit_pos(const coord_t *coord)
8130 +{
8131 + return coord_num_units(coord) - 1;
8132 +}
8133 +
8134 +#if REISER4_DEBUG
8135 +/* For assertions only, checks for a valid coordinate. */
8136 +extern int coord_check(const coord_t *coord);
8137 +
8138 +extern unsigned long znode_times_locked(const znode * z);
8139 +
8140 +static inline void coord_update_v(coord_t *coord)
8141 +{
8142 + coord->plug_v = coord->body_v = znode_times_locked(coord->node);
8143 +}
8144 +#endif
8145 +
8146 +extern int coords_equal(const coord_t *c1, const coord_t *c2);
8147 +
8148 +extern void print_coord(const char *mes, const coord_t *coord, int print_node);
8149 +
8150 +/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if
8151 +   coord_is_before_leftmost return COORD_ON_THE_LEFT, otherwise return
8152 +   COORD_INSIDE. */
8153 +extern coord_wrt_node coord_wrt(const coord_t *coord);
8154 +
8155 +/* Returns true if the coordinates are positioned at adjacent units, regardless
8156 + of before-after or item boundaries. */
8157 +extern int coord_are_neighbors(coord_t *c1, coord_t *c2);
8158 +
8159 +/* Assuming two coordinates are positioned in the same node, return
8160 + NCOORD_CMP_ON_RIGHT, NCOORD_CMP_ON_LEFT, or NCOORD_CMP_SAME depending on c1's
8161 + position relative to c2. */
8162 +extern coord_cmp coord_compare(coord_t *c1, coord_t *c2);
8163 +
8164 +/* COORD PREDICATES */
8165 +
8166 +/* Returns true if the coord was initialized by coord_init_invalid(). */
8167 +extern int coord_is_invalid(const coord_t *coord);
8168 +
8169 +/* Returns true if the coordinate is positioned at an existing item, not before
8170 + or after an item. It may be placed at, before, or after any unit within the
8171 + item, whether existing or not. If this is true you can call methods of the
8172 + item plugin. */
8173 +extern int coord_is_existing_item(const coord_t *coord);
8174 +
8175 +/* Returns true if the coordinate is positioned after an item, before an item,
8176 + after the last unit of an item, before the first unit of an item, or at an
8177 + empty node. */
8178 +extern int coord_is_between_items(const coord_t *coord);
8179 +
8180 +/* Returns true if the coordinate is positioned at an existing unit, not before
8181 + or after a unit. */
8182 +extern int coord_is_existing_unit(const coord_t *coord);
8183 +
8184 +/* Returns true if the coordinate is positioned at an empty node. */
8185 +extern int coord_is_empty(const coord_t *coord);
8186 +
8187 +/* Returns true if the coordinate is positioned at the first unit of the first
8188 + item. Not true for empty nodes nor coordinates positioned before the first
8189 + item. */
8190 +extern int coord_is_leftmost_unit(const coord_t *coord);
8191 +
8192 +/* Returns true if the coordinate is positioned after the last item or after the
8193 + last unit of the last item or it is an empty node. */
8194 +extern int coord_is_after_rightmost(const coord_t *coord);
8195 +
8196 +/* Returns true if the coordinate is positioned before the first item or it is
8197 + an empty node. */
8198 +extern int coord_is_before_leftmost(const coord_t *coord);
8199 +
8200 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending
8201 + on sideof argument. */
8202 +extern int coord_is_after_sideof_unit(coord_t *coord, sideof dir);
8203 +
8204 +/* COORD MODIFIERS */
8205 +
8206 +/* Advances the coordinate by one unit to the right. If empty, no change. If
8207 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new
8208 + position is an existing unit. */
8209 +extern int coord_next_unit(coord_t *coord);
8210 +
8211 +/* Advances the coordinate by one item to the right. If empty, no change. If
8212 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new
8213 + position is an existing item. */
8214 +extern int coord_next_item(coord_t *coord);
8215 +
8216 +/* Advances the coordinate by one unit to the left. If empty, no change. If
8217 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new
8218 + position is an existing unit. */
8219 +extern int coord_prev_unit(coord_t *coord);
8220 +
8221 +/* Advances the coordinate by one item to the left. If empty, no change. If
8222 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new
8223 + position is an existing item. */
8224 +extern int coord_prev_item(coord_t *coord);
8225 +
8226 +/* If the coordinate is between items, shifts it to the right. Returns 0 on
8227 + success and non-zero if there is no position to the right. */
8228 +extern int coord_set_to_right(coord_t *coord);
8229 +
8230 +/* If the coordinate is between items, shifts it to the left. Returns 0 on
8231 + success and non-zero if there is no position to the left. */
8232 +extern int coord_set_to_left(coord_t *coord);
8233 +
8234 +/* If the coordinate is at an existing unit, set to after that unit. Returns 0
8235 + on success and non-zero if the unit did not exist. */
8236 +extern int coord_set_after_unit(coord_t *coord);
8237 +
8238 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof
8239 + argument. */
8240 +extern int coord_sideof_unit(coord_t *coord, sideof dir);
8241 +
8242 +/* iterate over all units in @node */
8243 +#define for_all_units(coord, node) \
8244 + for (coord_init_before_first_item((coord), (node)) ; \
8245 + coord_next_unit(coord) == 0 ;)
8246 +
8247 +/* iterate over all items in @node */
8248 +#define for_all_items(coord, node) \
8249 + for (coord_init_before_first_item((coord), (node)) ; \
8250 + coord_next_item(coord) == 0 ;)
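A minimal usage sketch (not part of the patch; the function name is invented): counting the items of a node with the iteration macro above. It assumes the caller already holds whatever lock is needed to read @node.

	static int count_items(znode *node)
	{
		coord_t coord;
		int nr = 0;

		for_all_items(&coord, node)
			++nr;
		return nr;
	}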
8251 +
8252 +/* COORD/ITEM METHODS */
8253 +
8254 +extern int item_utmost_child_real_block(const coord_t *coord, sideof side,
8255 + reiser4_block_nr * blk);
8256 +extern int item_utmost_child(const coord_t *coord, sideof side,
8257 + jnode ** child);
8258 +
8259 +/* A flow is a sequence of bytes being written to or read from the tree. The
8260 + tree will slice the flow into items while storing it into nodes, but all of
8261 + that is hidden from anything outside the tree. */
8262 +
8263 +struct flow {
8264 + reiser4_key key; /* key of start of flow's sequence of bytes */
8265 + loff_t length; /* length of flow's sequence of bytes */
8266 + char *data; /* start of flow's sequence of bytes */
8267 + int user; /* if 1 data is user space, 0 - kernel space */
8268 + rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8269 +};
8270 +
8271 +void move_flow_forward(flow_t *f, unsigned count);
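A hedged sketch of how a caller might fill in a flow for a kernel-space write. The helper name is hypothetical, and WRITE_OP is assumed to be one of the rw_op enumerators declared elsewhere in the patch:

	static void build_write_flow(flow_t *f, const reiser4_key *key,
				     char *buf, loff_t count)
	{
		f->key = *key;		/* key of the first byte of the flow */
		f->length = count;	/* number of bytes in the flow */
		f->data = buf;		/* kernel-space buffer ... */
		f->user = 0;		/* ... so ->user is 0 */
		f->op = WRITE_OP;	/* assumed rw_op enumerator */
	}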
8272 +
8273 +/* &reiser4_item_data - description of data to be inserted or pasted
8274 +
8275 + Q: articulate the reasons for the difference between this and flow.
8276 +
8277 + A: Besides flows we insert other things into the tree: stat data,
8278 + directory entries, etc. To insert them into the tree one has to provide
8279 + this structure. To insert a flow one can use insert_flow(), for which
8280 + this structure does not have to be created.
8281 +*/
8282 +struct reiser4_item_data {
8283 + /* actual data to be inserted. If NULL, ->create_item() will not
8284 + do xmemcpy itself, leaving this up to the caller. This can
8285 + save some amount of unnecessary memory copying, for example,
8286 + during insertion of stat data.
8287 +
8288 + */
8289 + char *data;
8290 + /* 1 if 'char * data' contains pointer to user space and 0 if it is
8291 + kernel space */
8292 + int user;
8293 + /* amount of data we are going to insert or paste */
8294 + int length;
8295 + /* "Arg" is opaque data that is passed down to the
8296 + ->create_item() method of node layout, which in turn
8297 + hands it to the ->create_hook() of item being created. This
8298 + arg is currently used by:
8299 +
8300 + . ->create_hook() of internal item
8301 + (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8302 + . ->paste() method of directory item.
8303 + . ->create_hook() of extent item
8304 +
8305 + For an internal item, this is the left "brother" of the new node
8306 + being inserted; it is used to add the new node into the sibling
8307 + list after the pointer to it has been inserted into the parent.
8308 +
8309 + While ->arg does look like a somewhat unnecessary complication,
8310 + it actually saves a lot of headaches in many places, because
8311 + all data necessary to insert or paste new data into the tree are
8312 + collected in one place, and this eliminates a lot of extra
8313 + argument passing and storing everywhere.
8314 +
8315 + */
8316 + void *arg;
8317 + /* plugin of item we are inserting */
8318 + item_plugin *iplug;
8319 +};
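For illustration, a sketch (the helper name is made up) of preparing this structure for an insertion where the item plugin fills in the body itself, as described for ->data == NULL above:

	static void init_item_data(struct reiser4_item_data *data,
				   item_plugin *iplug, int length)
	{
		data->data = NULL;	/* let ->create_item() skip the copy */
		data->user = 0;		/* not a user-space buffer */
		data->length = length;	/* bytes the item will occupy */
		data->arg = NULL;	/* no ->create_hook() argument */
		data->iplug = iplug;	/* plugin of the item being inserted */
	}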
8320 +
8321 +/* __REISER4_COORD_H__ */
8322 +#endif
8323 +
8324 +/* Make Linus happy.
8325 + Local variables:
8326 + c-indentation-style: "K&R"
8327 + mode-name: "LC"
8328 + c-basic-offset: 8
8329 + tab-width: 8
8330 + fill-column: 120
8331 + scroll-step: 1
8332 + End:
8333 +*/
8334 diff -urN linux-2.6.33.orig/fs/reiser4/debug.c linux-2.6.33/fs/reiser4/debug.c
8335 --- linux-2.6.33.orig/fs/reiser4/debug.c 1970-01-01 01:00:00.000000000 +0100
8336 +++ linux-2.6.33/fs/reiser4/debug.c 2010-03-04 19:33:22.000000000 +0100
8337 @@ -0,0 +1,308 @@
8338 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8339 + * reiser4/README */
8340 +
8341 +/* Debugging facilities. */
8342 +
8343 +/*
8344 + * This file contains generic debugging functions used by reiser4. Roughly,
8345 + * the following:
8346 + *
8347 + * panicking: reiser4_do_panic(), reiser4_print_prefix().
8348 + *
8349 + * locking:
8350 + * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8351 + * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8352 + *
8353 + * error code monitoring (see comment before RETERR macro):
8354 + * reiser4_return_err(), reiser4_report_err().
8355 + *
8356 + * stack back-tracing: fill_backtrace()
8357 + *
8358 + * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8359 + * reiser4_debugtrap().
8360 + *
8361 + */
8362 +
8363 +#include "reiser4.h"
8364 +#include "context.h"
8365 +#include "super.h"
8366 +#include "txnmgr.h"
8367 +#include "znode.h"
8368 +
8369 +#include <linux/sysfs.h>
8370 +#include <linux/slab.h>
8371 +#include <linux/types.h>
8372 +#include <linux/fs.h>
8373 +#include <linux/spinlock.h>
8374 +#include <linux/kallsyms.h>
8375 +#include <linux/vmalloc.h>
8376 +#include <linux/ctype.h>
8377 +#include <linux/sysctl.h>
8378 +#include <linux/hardirq.h>
8379 +
8380 +#if 0
8381 +#if REISER4_DEBUG
8382 +static void reiser4_report_err(void);
8383 +#else
8384 +#define reiser4_report_err() noop
8385 +#endif
8386 +#endif /* 0 */
8387 +
8388 +/*
8389 + * global buffer where message given to reiser4_panic is formatted.
8390 + */
8391 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8392 +
8393 +/*
8394 + * lock protecting consistency of panic_buf under concurrent panics
8395 + */
8396 +static DEFINE_SPINLOCK(panic_guard);
8397 +
8398 +/* Your best friend. Call it on each occasion. This is called by
8399 + fs/reiser4/debug.h:reiser4_panic(). */
8400 +void reiser4_do_panic(const char *format/* format string */ , ... /* rest */)
8401 +{
8402 + static int in_panic = 0;
8403 + va_list args;
8404 +
8405 + /*
8406 + * check for recursive panic.
8407 + */
8408 + if (in_panic == 0) {
8409 + in_panic = 1;
8410 +
8411 + spin_lock(&panic_guard);
8412 + va_start(args, format);
8413 + vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8414 + va_end(args);
8415 + printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8416 + spin_unlock(&panic_guard);
8417 +
8418 + /*
8419 + * if kernel debugger is configured---drop in. Early dropping
8420 + * into kgdb is not always convenient, because the panic message
8421 + * has not been printed yet most of the time. But:
8422 + *
8423 + * (1) message can be extracted from printk_buf[]
8424 + * (declared static inside of printk()), and
8425 + *
8426 + * (2) sometimes serial/kgdb combo dies while printing
8427 + * long panic message, so it's more prudent to break into
8428 + * debugger earlier.
8429 + *
8430 + */
8431 + DEBUGON(1);
8432 + }
8433 + /* to make gcc happy about noreturn attribute */
8434 + panic("%s", panic_buf);
8435 +}
8436 +
8437 +#if 0
8438 +void
8439 +reiser4_print_prefix(const char *level, int reperr, const char *mid,
8440 + const char *function, const char *file, int lineno)
8441 +{
8442 + const char *comm;
8443 + int pid;
8444 +
8445 + if (unlikely(in_interrupt() || in_irq())) {
8446 + comm = "interrupt";
8447 + pid = 0;
8448 + } else {
8449 + comm = current->comm;
8450 + pid = current->pid;
8451 + }
8452 + printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8453 + level, comm, pid, function, file, lineno, mid);
8454 + if (reperr)
8455 + reiser4_report_err();
8456 +}
8457 +#endif /* 0 */
8458 +
8459 +/* Preemption point: this should be called periodically during long-running
8460 + operations (carry, allocate, and squeeze are best examples) */
8461 +int reiser4_preempt_point(void)
8462 +{
8463 + assert("nikita-3008", reiser4_schedulable());
8464 + cond_resched();
8465 + return signal_pending(current);
8466 +}
8467 +
8468 +#if REISER4_DEBUG
8469 +/* Debugging aid: return struct where information about locks taken by current
8470 + thread is accumulated. This can be used to formulate lock ordering
8471 + constraints and various assertions.
8472 +
8473 +*/
8474 +reiser4_lock_cnt_info *reiser4_lock_counters(void)
8475 +{
8476 + reiser4_context *ctx = get_current_context();
8477 + assert("jmacd-1123", ctx != NULL);
8478 + return &ctx->locks;
8479 +}
8480 +
8481 +/*
8482 + * print human-readable information about locks held by the reiser4 context.
8483 + */
8484 +static void print_lock_counters(const char *prefix,
8485 + const reiser4_lock_cnt_info * info)
8486 +{
8487 + printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8488 + "jload: %i, "
8489 + "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8490 + "ktxnmgrd: %i, fq: %i\n"
8491 + "inode: %i, "
8492 + "cbk_cache: %i (r:%i,w%i), "
8493 + "eflush: %i, "
8494 + "zlock: %i,\n"
8495 + "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8496 + "d: %i, x: %i, t: %i\n", prefix,
8497 + info->spin_locked_jnode,
8498 + info->rw_locked_tree, info->read_locked_tree,
8499 + info->write_locked_tree,
8500 + info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8501 + info->spin_locked_jload,
8502 + info->spin_locked_txnh,
8503 + info->spin_locked_atom, info->spin_locked_stack,
8504 + info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8505 + info->spin_locked_fq,
8506 + info->spin_locked_inode,
8507 + info->rw_locked_cbk_cache,
8508 + info->read_locked_cbk_cache,
8509 + info->write_locked_cbk_cache,
8510 + info->spin_locked_super_eflush,
8511 + info->spin_locked_zlock,
8512 + info->spin_locked,
8513 + info->long_term_locked_znode,
8514 + info->inode_sem_r, info->inode_sem_w,
8515 + info->d_refs, info->x_refs, info->t_refs);
8516 +}
8517 +
8518 +/* check that no spinlocks are held */
8519 +int reiser4_schedulable(void)
8520 +{
8521 + if (get_current_context_check() != NULL) {
8522 + if (!LOCK_CNT_NIL(spin_locked)) {
8523 + print_lock_counters("in atomic", reiser4_lock_counters());
8524 + return 0;
8525 + }
8526 + }
8527 + might_sleep();
8528 + return 1;
8529 +}
8530 +/*
8531 + * return true iff no locks are held.
8532 + */
8533 +int reiser4_no_counters_are_held(void)
8534 +{
8535 + reiser4_lock_cnt_info *counters;
8536 +
8537 + counters = reiser4_lock_counters();
8538 + return
8539 + (counters->spin_locked_zlock == 0) &&
8540 + (counters->spin_locked_jnode == 0) &&
8541 + (counters->rw_locked_tree == 0) &&
8542 + (counters->read_locked_tree == 0) &&
8543 + (counters->write_locked_tree == 0) &&
8544 + (counters->rw_locked_dk == 0) &&
8545 + (counters->read_locked_dk == 0) &&
8546 + (counters->write_locked_dk == 0) &&
8547 + (counters->spin_locked_txnh == 0) &&
8548 + (counters->spin_locked_atom == 0) &&
8549 + (counters->spin_locked_stack == 0) &&
8550 + (counters->spin_locked_txnmgr == 0) &&
8551 + (counters->spin_locked_inode == 0) &&
8552 + (counters->spin_locked == 0) &&
8553 + (counters->long_term_locked_znode == 0) &&
8554 + (counters->inode_sem_r == 0) &&
8555 + (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8556 +}
8557 +
8558 +/*
8559 + * return true iff transaction commit can be done under locks held by the
8560 + * current thread.
8561 + */
8562 +int reiser4_commit_check_locks(void)
8563 +{
8564 + reiser4_lock_cnt_info *counters;
8565 + int inode_sem_r;
8566 + int inode_sem_w;
8567 + int result;
8568 +
8569 + /*
8570 + * inode's read/write semaphore is the only reiser4 lock that can be
8571 + * held during commit.
8572 + */
8573 +
8574 + counters = reiser4_lock_counters();
8575 + inode_sem_r = counters->inode_sem_r;
8576 + inode_sem_w = counters->inode_sem_w;
8577 +
8578 + counters->inode_sem_r = counters->inode_sem_w = 0;
8579 + result = reiser4_no_counters_are_held();
8580 + counters->inode_sem_r = inode_sem_r;
8581 + counters->inode_sem_w = inode_sem_w;
8582 + return result;
8583 +}
8584 +
8585 +/*
8586 + * fill "error site" in the current reiser4 context. See comment before RETERR
8587 + * macro for more details.
8588 + */
8589 +void reiser4_return_err(int code, const char *file, int line)
8590 +{
8591 + if (code < 0 && is_in_reiser4_context()) {
8592 + reiser4_context *ctx = get_current_context();
8593 +
8594 + if (ctx != NULL) {
8595 + ctx->err.code = code;
8596 + ctx->err.file = file;
8597 + ctx->err.line = line;
8598 + }
8599 + }
8600 +}
8601 +
8602 +#if 0
8603 +/*
8604 + * report error information recorded by reiser4_return_err().
8605 + */
8606 +static void reiser4_report_err(void)
8607 +{
8608 + reiser4_context *ctx = get_current_context_check();
8609 +
8610 + if (ctx != NULL) {
8611 + if (ctx->err.code != 0) {
8612 + printk("code: %i at %s:%i\n",
8613 + ctx->err.code, ctx->err.file, ctx->err.line);
8614 + }
8615 + }
8616 +}
8617 +#endif /* 0 */
8618 +
8619 +#endif /* REISER4_DEBUG */
8620 +
8621 +#if KERNEL_DEBUGGER
8622 +
8623 +/*
8624 + * this function just drops into the kernel debugger. It is a convenient place
8625 + * to put a breakpoint in.
8626 + */
8627 +void reiser4_debugtrap(void)
8628 +{
8629 + /* do nothing. Put break point here. */
8630 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8631 + extern void kgdb_breakpoint(void);
8632 + kgdb_breakpoint();
8633 +#endif
8634 +}
8635 +#endif
8636 +
8637 +/* Make Linus happy.
8638 + Local variables:
8639 + c-indentation-style: "K&R"
8640 + mode-name: "LC"
8641 + c-basic-offset: 8
8642 + tab-width: 8
8643 + fill-column: 120
8644 + End:
8645 +*/
8646 diff -urN linux-2.6.33.orig/fs/reiser4/debug.h linux-2.6.33/fs/reiser4/debug.h
8647 --- linux-2.6.33.orig/fs/reiser4/debug.h 1970-01-01 01:00:00.000000000 +0100
8648 +++ linux-2.6.33/fs/reiser4/debug.h 2010-03-04 19:33:22.000000000 +0100
8649 @@ -0,0 +1,351 @@
8650 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8651 + reiser4/README */
8652 +
8653 +/* Declarations of debug macros. */
8654 +
8655 +#if !defined(__FS_REISER4_DEBUG_H__)
8656 +#define __FS_REISER4_DEBUG_H__
8657 +
8658 +#include "forward.h"
8659 +#include "reiser4.h"
8660 +
8661 +/* generic function to produce formatted output, decorating it with
8662 + whatever standard prefixes/postfixes we want. "Fun" is a function
8663 + that will actually be called; it can be printk, panic, etc.
8664 + This is for use by other debugging macros, not by users. */
8665 +#define DCALL(lev, fun, reperr, label, format, ...) \
8666 +({ \
8667 + fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8668 + current->comm, current->pid, __FUNCTION__, \
8669 + __FILE__, __LINE__, label, ## __VA_ARGS__); \
8670 +})
8671 +
8672 +/*
8673 + * cause kernel to crash
8674 + */
8675 +#define reiser4_panic(mid, format, ...) \
8676 + DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8677 +
8678 +/* print message with indication of current process, file, line and
8679 + function */
8680 +#define reiser4_log(label, format, ...) \
8681 + DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8682 +
8683 +/* Assertion checked during compilation.
8684 + If "cond" is false (0) we get duplicate case label in switch.
8685 + Use this to check something like famous
8686 + cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8687 + in 3.x journal.c. If cassertion fails you get compiler error,
8688 + so no "maintainer-id".
8689 +*/
8690 +#define cassert(cond) ({ switch (-1) { case (cond): case 0: break; } })
8691 +
8692 +#define noop do {; } while (0)
8693 +
8694 +#if REISER4_DEBUG
8695 +/* version of info that only actually prints anything when _d_ebugging
8696 + is on */
8697 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8698 +/* macro to catch logical errors. Put it into `default' clause of
8699 + switch() statement. */
8700 +#define impossible(label, format, ...) \
8701 + reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8702 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
8703 + called. Use this for checking logical consistency and _never_ call
8704 + this to check correctness of external data: disk blocks and user-input . */
8705 +#define assert(label, cond) \
8706 +({ \
8707 + /* call_on_each_assert(); */ \
8708 + if (cond) { \
8709 + /* put negated check to avoid using !(cond) that would lose \
8710 + * warnings for things like assert(a = b); */ \
8711 + ; \
8712 + } else { \
8713 + DEBUGON(1); \
8714 + reiser4_panic(label, "assertion failed: %s", #cond); \
8715 + } \
8716 +})
8717 +
8718 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8719 +#define check_me(label, expr) assert(label, (expr))
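For example (the labels are invented), assert disappears entirely when REISER4_DEBUG is off, while check_me still evaluates its expression for the side effect:

	static void example(jnode *node)
	{
		assert("xxx-0001", node != NULL);	/* compiled out without debug */
		check_me("xxx-0002", jload(node) == 0);	/* jload() always runs */
	}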
8720 +
8721 +#define ON_DEBUG(exp) exp
8722 +
8723 +extern int reiser4_schedulable(void);
8724 +extern void call_on_each_assert(void);
8725 +
8726 +#else
8727 +
8728 +#define dinfo(format, args...) noop
8729 +#define impossible(label, format, args...) noop
8730 +#define assert(label, cond) noop
8731 +#define check_me(label, expr) ((void) (expr))
8732 +#define ON_DEBUG(exp)
8733 +#define reiser4_schedulable() might_sleep()
8734 +
8735 +/* REISER4_DEBUG */
8736 +#endif
8737 +
8738 +#if REISER4_DEBUG
8739 +/* per-thread information about locks acquired by this thread. Used by lock
8740 + * ordering checking in spin_macros.h */
8741 +typedef struct reiser4_lock_cnt_info {
8742 + int rw_locked_tree;
8743 + int read_locked_tree;
8744 + int write_locked_tree;
8745 +
8746 + int rw_locked_dk;
8747 + int read_locked_dk;
8748 + int write_locked_dk;
8749 +
8750 + int rw_locked_cbk_cache;
8751 + int read_locked_cbk_cache;
8752 + int write_locked_cbk_cache;
8753 +
8754 + int spin_locked_zlock;
8755 + int spin_locked_jnode;
8756 + int spin_locked_jload;
8757 + int spin_locked_txnh;
8758 + int spin_locked_atom;
8759 + int spin_locked_stack;
8760 + int spin_locked_txnmgr;
8761 + int spin_locked_ktxnmgrd;
8762 + int spin_locked_fq;
8763 + int spin_locked_inode;
8764 + int spin_locked_super_eflush;
8765 + int spin_locked;
8766 + int long_term_locked_znode;
8767 +
8768 + int inode_sem_r;
8769 + int inode_sem_w;
8770 +
8771 + int d_refs;
8772 + int x_refs;
8773 + int t_refs;
8774 +} reiser4_lock_cnt_info;
8775 +
8776 +extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
8777 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8778 +
8779 +/* increment lock-counter @counter, if present */
8780 +#define LOCK_CNT_INC(counter) \
8781 + IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
8782 +
8783 +/* decrement lock-counter @counter, if present */
8784 +#define LOCK_CNT_DEC(counter) \
8785 + IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
8786 +
8787 +/* check that lock-counter is zero. This is for use in assertions */
8788 +#define LOCK_CNT_NIL(counter) \
8789 + IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
8790 +
8791 +/* check that lock-counter is greater than zero. This is for use in
8792 + * assertions */
8793 +#define LOCK_CNT_GTZ(counter) \
8794 + IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
8795 +#define LOCK_CNT_LT(counter,n) \
8796 + IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
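An illustrative pairing (labels invented) of these counters with the lock they track, in the style used throughout the patch:

	static void lock_jnode_guard(spinlock_t *guard)
	{
		spin_lock(guard);
		LOCK_CNT_INC(spin_locked_jnode);
	}

	static void unlock_jnode_guard(spinlock_t *guard)
	{
		assert("xxx-0003", LOCK_CNT_GTZ(spin_locked_jnode));
		LOCK_CNT_DEC(spin_locked_jnode);
		spin_unlock(guard);
	}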
8797 +
8798 +#else /* REISER4_DEBUG */
8799 +
8800 +/* no-op versions on the above */
8801 +
8802 +typedef struct reiser4_lock_cnt_info {
8803 +} reiser4_lock_cnt_info;
8804 +
8805 +#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
8806 +#define LOCK_CNT_INC(counter) noop
8807 +#define LOCK_CNT_DEC(counter) noop
8808 +#define LOCK_CNT_NIL(counter) (1)
8809 +#define LOCK_CNT_GTZ(counter) (1)
8810 +#define LOCK_CNT_LT(counter, n) (1)
8811 +
8812 +#endif /* REISER4_DEBUG */
8813 +
8814 +#define assert_spin_not_locked(lock) BUG_ON(0)
8815 +#define assert_rw_write_locked(lock) BUG_ON(0)
8816 +#define assert_rw_read_locked(lock) BUG_ON(0)
8817 +#define assert_rw_locked(lock) BUG_ON(0)
8818 +#define assert_rw_not_write_locked(lock) BUG_ON(0)
8819 +#define assert_rw_not_read_locked(lock) BUG_ON(0)
8820 +#define assert_rw_not_locked(lock) BUG_ON(0)
8821 +
8822 +/* flags controlling debugging behavior. They are set through the debug_flags=N
8823 + mount option. */
8824 +typedef enum {
8825 + /* print a lot of information during panic. When this is on all jnodes
8826 + * are listed. This can be *very* large output. Usually you don't want
8827 + * this. Especially over serial line. */
8828 + REISER4_VERBOSE_PANIC = 0x00000001,
8829 + /* print a lot of information during umount */
8830 + REISER4_VERBOSE_UMOUNT = 0x00000002,
8831 + /* print gathered statistics on umount */
8832 + REISER4_STATS_ON_UMOUNT = 0x00000004,
8833 + /* check node consistency */
8834 + REISER4_CHECK_NODE = 0x00000008
8835 +} reiser4_debug_flags;
8836 +
8837 +extern int is_in_reiser4_context(void);
8838 +
8839 +/*
8840 + * evaluate expression @e only if within a reiser4 context
8841 + */
8842 +#define ON_CONTEXT(e) do { \
8843 + if (is_in_reiser4_context()) { \
8844 + e; \
8845 + } } while (0)
8846 +
8847 +/*
8848 + * evaluate expression @e only when within reiser4_context and debugging is
8849 + * on.
8850 + */
8851 +#define ON_DEBUG_CONTEXT(e) ON_DEBUG(ON_CONTEXT(e))
8852 +
8853 +/*
8854 + * complain about unexpected function result and crash. Used in "default"
8855 + * branches of switch statements and the like to assert that invalid results are
8856 + * not silently ignored.
8857 + */
8858 +#define wrong_return_value(label, function) \
8859 + impossible(label, "wrong return value from " function)
8860 +
8861 +/* Issue different types of reiser4 messages to the console */
8862 +#define warning(label, format, ...) \
8863 + DCALL(KERN_WARNING, \
8864 + printk, 1, label, "WARNING: " format , ## __VA_ARGS__)
8865 +#define notice(label, format, ...) \
8866 + DCALL(KERN_NOTICE, \
8867 + printk, 1, label, "NOTICE: " format , ## __VA_ARGS__)
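For example (the label and the local variables are made up), the following emits a single KERN_WARNING line prefixed with the current process, function, file and line:

	warning("xxx-0004", "failed to load block %llu: %i",
		(unsigned long long)blocknr, ret);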
8868 +
8869 +/* mark not yet implemented functionality */
8870 +#define not_yet(label, format, ...) \
8871 + reiser4_panic(label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__)
8872 +
8873 +extern void reiser4_do_panic(const char *format, ...)
8874 + __attribute__ ((noreturn, format(printf, 1, 2)));
8875 +
8876 +extern int reiser4_preempt_point(void);
8877 +extern void reiser4_print_stats(void);
8878 +
8879 +#if REISER4_DEBUG
8880 +extern int reiser4_no_counters_are_held(void);
8881 +extern int reiser4_commit_check_locks(void);
8882 +#else
8883 +#define reiser4_no_counters_are_held() (1)
8884 +#define reiser4_commit_check_locks() (1)
8885 +#endif
8886 +
8887 +/* true if @i is a power of two. Useful for rate-limited warnings, etc. */
8888 +#define IS_POW(i) \
8889 +({ \
8890 + typeof(i) __i; \
8891 + \
8892 + __i = (i); \
8893 + !(__i & (__i - 1)); \
8894 +})
8895 +
8896 +#define KERNEL_DEBUGGER (1)
8897 +
8898 +#if KERNEL_DEBUGGER
8899 +
8900 +extern void reiser4_debugtrap(void);
8901 +
8902 +/*
8903 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8904 + * kgdb is not compiled in, do nothing.
8905 + */
8906 +#define DEBUGON(cond) \
8907 +({ \
8908 + if (unlikely(cond)) \
8909 + reiser4_debugtrap(); \
8910 +})
8911 +#else
8912 +#define DEBUGON(cond) noop
8913 +#endif
8914 +
8915 +/*
8916 + * Error code tracing facility. (Idea is borrowed from XFS code.)
8917 + *
8918 + * Suppose some strange and/or unexpected code is returned from some function
8919 + * (for example, write(2) returns -EEXIST). It is possible to place a
8920 + * breakpoint in the reiser4_write(), but it is too late here. How to find out
8921 + * in what particular place -EEXIST was generated first?
8922 + *
8923 + * In reiser4 all places where actual error codes are produced (that is,
8924 + * statements of the form
8925 + *
8926 + * return -EFOO; // (1), or
8927 + *
8928 + * result = -EFOO; // (2)
8929 + *
8930 + * are replaced with
8931 + *
8932 + * return RETERR(-EFOO); // (1a), and
8933 + *
8934 + * result = RETERR(-EFOO); // (2a) respectively
8935 + *
8936 + * The RETERR() macro records the error site in the reiser4_context. This
8937 + * information is printed in error and warning messages. Moreover, it is
8938 + * possible to put a conditional breakpoint in reiser4_return_err() (the
8939 + * low-level function called by RETERR() to do the actual work) to break into
8940 + * the debugger immediately when a particular error happens.
8941 + *
8942 + */
8943 +
8944 +#if REISER4_DEBUG
8945 +
8946 +/*
8947 + * data-type to store information about where error happened ("error site").
8948 + */
8949 +typedef struct err_site {
8950 + int code; /* error code */
8951 + const char *file; /* source file, filled by __FILE__ */
8952 + int line; /* source file line, filled by __LINE__ */
8953 +} err_site;
8954 +
8955 +extern void reiser4_return_err(int code, const char *file, int line);
8956 +
8957 +/*
8958 + * fill &get_current_context()->err_site with error information.
8959 + */
8960 +#define RETERR(code) \
8961 +({ \
8962 + typeof(code) __code; \
8963 + \
8964 + __code = (code); \
8965 + reiser4_return_err(__code, __FILE__, __LINE__); \
8966 + __code; \
8967 +})
8968 +
8969 +#else
8970 +
8971 +/*
8972 + * no-op versions of the above
8973 + */
8974 +
8975 +typedef struct err_site {
8976 +} err_site;
8977 +#define RETERR(code) code
8978 +#endif
8979 +
8980 +#if REISER4_LARGE_KEY
8981 +/*
8982 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
8983 + */
8984 +#define ON_LARGE_KEY(...) __VA_ARGS__
8985 +#else
8986 +#define ON_LARGE_KEY(...)
8987 +#endif
8988 +
8989 +/* __FS_REISER4_DEBUG_H__ */
8990 +#endif
8991 +
8992 +/* Make Linus happy.
8993 + Local variables:
8994 + c-indentation-style: "K&R"
8995 + mode-name: "LC"
8996 + c-basic-offset: 8
8997 + tab-width: 8
8998 + fill-column: 120
8999 + End:
9000 +*/
9001 diff -urN linux-2.6.33.orig/fs/reiser4/dformat.h linux-2.6.33/fs/reiser4/dformat.h
9002 --- linux-2.6.33.orig/fs/reiser4/dformat.h 1970-01-01 01:00:00.000000000 +0100
9003 +++ linux-2.6.33/fs/reiser4/dformat.h 2010-03-04 19:33:22.000000000 +0100
9004 @@ -0,0 +1,71 @@
9005 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9006 + reiser4/README */
9007 +
9008 +/* Formats of on-disk data and conversion functions. */
9009 +
9010 +/* Put all item formats in the files describing the particular items. Our
9011 + model is: everything you need to do to add an item to reiser4 (except the
9012 + changes to the plugin that uses the item, which go into the file defining
9013 + that plugin) goes into one file. */
9014 +/* Data on disk are stored in little-endian format.
9015 + To declare fields of on-disk structures, use d8, d16, d32 and d64.
9016 + d??tocpu() and cputod??() to convert. */
9017 +
9018 +#if !defined(__FS_REISER4_DFORMAT_H__)
9019 +#define __FS_REISER4_DFORMAT_H__
9020 +
9021 +#include <asm/byteorder.h>
9022 +#include <asm/unaligned.h>
9023 +#include <linux/types.h>
9024 +
9025 +typedef __u8 d8;
9026 +typedef __le16 d16;
9027 +typedef __le32 d32;
9028 +typedef __le64 d64;
9029 +
9030 +#define PACKED __attribute__((packed))
9031 +
9032 +/* data-type for block number */
9033 +typedef __u64 reiser4_block_nr;
9034 +
9035 +/* data-type for block number on disk, disk format */
9036 +typedef __le64 reiser4_dblock_nr;
9037 +
9038 +/**
9039 + * disk_addr_eq - compare disk addresses
9040 + * @b1: pointer to block number to compare
9041 + * @b2: pointer to block number to compare
9042 + *
9043 + * Returns true if the disk addresses are the same
9044 + */
9045 +static inline int disk_addr_eq(const reiser4_block_nr * b1,
9046 + const reiser4_block_nr * b2)
9047 +{
9048 + assert("nikita-1033", b1 != NULL);
9049 + assert("nikita-1266", b2 != NULL);
9050 +
9051 + return !memcmp(b1, b2, sizeof *b1);
9052 +}
9053 +
9054 +/* structure of master reiser4 super block */
9055 +typedef struct reiser4_master_sb {
9056 + char magic[16]; /* "ReIsEr4" */
9057 + __le16 disk_plugin_id; /* id of disk layout plugin */
9058 + __le16 blocksize;
9059 + char uuid[16]; /* unique id */
9060 + char label[16]; /* filesystem label */
9061 + __le64 diskmap; /* location of the diskmap. 0 if not present */
9062 +} reiser4_master_sb;
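A sketch of how a disk-layout plugin might sanity-check a master super block read from disk; the function name and the 512-byte floor are assumptions, not taken from the patch:

	static int master_sb_looks_valid(const reiser4_master_sb *master)
	{
		/* "ReIsEr4" is the magic documented in the struct above */
		if (memcmp(master->magic, "ReIsEr4", sizeof("ReIsEr4") - 1))
			return 0;
		/* blocksize is little-endian, like the rest of the disk format */
		return le16_to_cpu(master->blocksize) >= 512;
	}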
9063 +
9064 +/* __FS_REISER4_DFORMAT_H__ */
9065 +#endif
9066 +
9067 +/*
9068 + * Local variables:
9069 + * c-indentation-style: "K&R"
9070 + * mode-name: "LC"
9071 + * c-basic-offset: 8
9072 + * tab-width: 8
9073 + * fill-column: 79
9074 + * End:
9075 + */
9076 diff -urN linux-2.6.33.orig/fs/reiser4/dscale.c linux-2.6.33/fs/reiser4/dscale.c
9077 --- linux-2.6.33.orig/fs/reiser4/dscale.c 1970-01-01 01:00:00.000000000 +0100
9078 +++ linux-2.6.33/fs/reiser4/dscale.c 2010-03-04 19:33:22.000000000 +0100
9079 @@ -0,0 +1,192 @@
9080 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9081 + * reiser4/README */
9082 +
9083 +/* Scalable on-disk integers */
9084 +
9085 +/*
9086 + * Various on-disk structures contain integer-like structures. Stat-data
9087 + * contain [yes, "data" is plural, check the dictionary] file size, link
9088 + * count; an extent unit contains the extent width, etc. To accommodate the
9089 + * general case, enough space is reserved to keep the largest possible value:
9090 + * 64 bits in all cases above. But in the overwhelming majority of cases the
9091 + * numbers actually stored in these fields will be comparatively small, and
9092 + * reserving 8 bytes is a waste of precious disk bandwidth.
9093 + *
9094 + * Scalable integers are one way to solve this problem. dscale_write()
9095 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
9096 + * depending on the magnitude of the value supplied. dscale_read() reads value
9097 + * previously stored by dscale_write().
9098 + *
9099 + * dscale_write() produces a format not completely unlike UTF-8: the two
9100 + * highest bits of the first byte are used to store a "tag". One of 4 possible
9101 + * tag values is chosen depending on the number being encoded:
9102 + *
9103 + * 0 ... 0x3f => 0 [table 1]
9104 + * 0x40 ... 0x3fff => 1
9105 + * 0x4000 ... 0x3fffffff => 2
9106 + * 0x40000000 ... 0xffffffffffffffff => 3
9107 + *
9108 + * (see dscale_range() function)
9109 + *
9110 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
9111 + * to be stored, so in this case there is no room in the first byte to store
9112 + * the tag. For such values the tag is stored in an extra, 9th byte.
9113 + *
9114 + * As the _highest_ bits are used for the tag (which is natural), scaled
9115 + * integers are stored in BIG-ENDIAN format, in contrast with the rest of
9116 + * reiser4, which uses LITTLE-ENDIAN.
9117 + *
9118 + */
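To make the encoding concrete, a worked example derived by hand from table 1: the value 0x1234 lies in the range 0x40 ... 0x3fff, so it gets tag 1 and occupies two bytes. Stored big-endian with the tag in the two top bits, this is 0x1234 | (1 << 14) == 0x5234, i.e. the bytes 0x52 0x34 on disk. Decoding reads the tag back as 0x52 >> 6 == 1, loads a __be16 giving 0x5234, and clears the two tag bits at offset 14 to recover 0x1234.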
9119 +
9120 +#include "debug.h"
9121 +#include "dscale.h"
9122 +
9123 +/* return tag of scaled integer stored at @address */
9124 +static int gettag(const unsigned char *address)
9125 +{
9126 + /* tag is stored in two highest bits */
9127 + return (*address) >> 6;
9128 +}
9129 +
9130 +/* Clear the tag embedded into @value. */
9131 +static void cleartag(__u64 *value, int tag)
9132 +{
9133 + /*
9134 + * W-w-what ?!
9135 + *
9136 + * Actually, this is rather simple: @value passed here was read by
9137 + * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
9138 + * zeroes. Tag is still stored in the highest (arithmetically)
9139 + * non-zero bits of @value, but relative position of tag within __u64
9140 + * depends on @tag.
9141 + *
9142 + * For example, if @tag is 0, it is stored in the two highest bits of the
9143 + * lowest byte, and its offset (counting from the lowest bit) is 8 - 2 == 6 bits.
9144 + *
9145 + * If @tag is 1, it is stored in the two highest bits of the 2nd lowest byte,
9146 + * and its offset is (2 * 8) - 2 == 14 bits.
9147 + *
9148 + * See table 1 above for details.
9149 + *
9150 + * All these cases are captured by the formula:
9151 + */
9152 + *value &= ~(3 << (((1 << tag) << 3) - 2));
9153 + /*
9154 + * That is, clear two (3 == 0t11) bits at the offset
9155 + *
9156 + * 8 * (2 ^ tag) - 2,
9157 + *
9158 + * that is, two highest bits of (2 ^ tag)-th byte of @value.
9159 + */
9160 +}
9161 +
9162 +/* return tag for @value. See table 1 above for details. */
9163 +static int dscale_range(__u64 value)
9164 +{
9165 + if (value > 0x3fffffff)
9166 + return 3;
9167 + if (value > 0x3fff)
9168 + return 2;
9169 + if (value > 0x3f)
9170 + return 1;
9171 + return 0;
9172 +}
9173 +
9174 +/* restore value stored at @address by dscale_write() and return number of
9175 + * bytes consumed */
9176 +int dscale_read(unsigned char *address, __u64 *value)
9177 +{
9178 + int tag;
9179 +
9180 + /* read tag */
9181 + tag = gettag(address);
9182 + switch (tag) {
9183 + case 3:
9184 + /* In this case tag is stored in an extra byte, skip this byte
9185 + * and decode value stored in the next 8 bytes.*/
9186 + *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
9187 + /* worst case: 8 bytes for value itself plus one byte for
9188 + * tag. */
9189 + return 9;
9190 + case 0:
9191 + *value = get_unaligned(address);
9192 + break;
9193 + case 1:
9194 + *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9195 + break;
9196 + case 2:
9197 + *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9198 + break;
9199 + default:
9200 + return RETERR(-EIO);
9201 + }
9202 + /* clear tag embedded into @value */
9203 + cleartag(value, tag);
9204 + /* number of bytes consumed is (2 ^ tag)---see table 1. */
9205 + return 1 << tag;
9206 +}
9207 +
9208 +/* return the number of bytes occupied by the value stored at @address */
9209 +int dscale_bytes_to_read(unsigned char *address)
9210 +{
9211 + int tag;
9212 +
9213 + tag = gettag(address);
9214 + switch (tag) {
9215 + case 0:
9216 + case 1:
9217 + case 2:
9218 + return 1 << tag;
9219 + case 3:
9220 + return 9;
9221 + default:
9222 + return RETERR(-EIO);
9223 + }
9224 +}
9225 +
9226 +/* store @value at @address and return number of bytes consumed */
9227 +int dscale_write(unsigned char *address, __u64 value)
9228 +{
9229 + int tag;
9230 + int shift;
9231 + __be64 v;
9232 + unsigned char *valarr;
9233 +
9234 + tag = dscale_range(value);
9235 + v = __cpu_to_be64(value);
9236 + valarr = (unsigned char *)&v;
9237 + shift = (tag == 3) ? 1 : 0;
9238 + memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9239 + *address |= (tag << 6);
9240 + return shift + (1 << tag);
9241 +}
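A hedged self-test sketch (not part of the patch; the labels are invented) that round-trips a value through the two functions above. With the worked example value 0x1234 both calls should report two bytes consumed:

	static void dscale_selftest(void)
	{
		unsigned char buf[9];	/* worst case: 8 value bytes + tag byte */
		__u64 out;

		check_me("xxx-0005", dscale_write(buf, 0x1234ULL) == 2);
		check_me("xxx-0006", dscale_read(buf, &out) == 2);
		assert("xxx-0007", out == 0x1234ULL);
	}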
9242 +
9243 +/* number of bytes required to store @value */
9244 +int dscale_bytes_to_write(__u64 value)
9245 +{
9246 + int bytes;
9247 +
9248 + bytes = 1 << dscale_range(value);
9249 + if (bytes == 8)
9250 + ++bytes;
9251 + return bytes;
9252 +}
9253 +
9254 +/* returns true if @value and @other require the same number of bytes to be
9255 + * stored. Used to detect when a data structure (like stat-data) has to be
9256 + * expanded or contracted. */
9257 +int dscale_fit(__u64 value, __u64 other)
9258 +{
9259 + return dscale_range(value) == dscale_range(other);
9260 +}
9261 +
9262 +/* Make Linus happy.
9263 + Local variables:
9264 + c-indentation-style: "K&R"
9265 + mode-name: "LC"
9266 + c-basic-offset: 8
9267 + tab-width: 8
9268 + fill-column: 120
9269 + scroll-step: 1
9270 + End:
9271 +*/
9272 diff -urN linux-2.6.33.orig/fs/reiser4/dscale.h linux-2.6.33/fs/reiser4/dscale.h
9273 --- linux-2.6.33.orig/fs/reiser4/dscale.h 1970-01-01 01:00:00.000000000 +0100
9274 +++ linux-2.6.33/fs/reiser4/dscale.h 2010-03-04 19:33:22.000000000 +0100
9275 @@ -0,0 +1,28 @@
9276 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9277 + * reiser4/README */
9278 +
9279 +/* Scalable on-disk integers. See dscale.c for details. */
9280 +
9281 +#if !defined(__FS_REISER4_DSCALE_H__)
9282 +#define __FS_REISER4_DSCALE_H__
9283 +
9284 +#include "dformat.h"
9285 +
9286 +extern int dscale_read(unsigned char *address, __u64 *value);
9287 +extern int dscale_write(unsigned char *address, __u64 value);
9288 +extern int dscale_bytes_to_read(unsigned char *address);
9289 +extern int dscale_bytes_to_write(__u64 value);
9290 +extern int dscale_fit(__u64 value, __u64 other);
9291 +
9292 +/* __FS_REISER4_DSCALE_H__ */
9293 +#endif
9294 +
9295 +/* Make Linus happy.
9296 + Local variables:
9297 + c-indentation-style: "K&R"
9298 + mode-name: "LC"
9299 + c-basic-offset: 8
9300 + tab-width: 8
9301 + fill-column: 120
9302 + End:
9303 +*/
9304 diff -urN linux-2.6.33.orig/fs/reiser4/entd.c linux-2.6.33/fs/reiser4/entd.c
9305 --- linux-2.6.33.orig/fs/reiser4/entd.c 1970-01-01 01:00:00.000000000 +0100
9306 +++ linux-2.6.33/fs/reiser4/entd.c 2010-03-04 19:33:22.000000000 +0100
9307 @@ -0,0 +1,338 @@
9308 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9309 + * reiser4/README */
9310 +
9311 +/* Ent daemon. */
9312 +
9313 +#include "debug.h"
9314 +#include "txnmgr.h"
9315 +#include "tree.h"
9316 +#include "entd.h"
9317 +#include "super.h"
9318 +#include "context.h"
9319 +#include "reiser4.h"
9320 +#include "vfs_ops.h"
9321 +#include "page_cache.h"
9322 +#include "inode.h"
9323 +
9324 +#include <linux/sched.h> /* struct task_struct */
9325 +#include <linux/suspend.h>
9326 +#include <linux/kernel.h>
9327 +#include <linux/writeback.h>
9328 +#include <linux/time.h> /* INITIAL_JIFFIES */
9329 +#include <linux/backing-dev.h> /* bdi_write_congested */
9330 +#include <linux/wait.h>
9331 +#include <linux/kthread.h>
9332 +#include <linux/freezer.h>
9333 +
9334 +#define DEF_PRIORITY 12
9335 +#define MAX_ENTD_ITERS 10
9336 +
9337 +static void entd_flush(struct super_block *, struct wbq *);
9338 +static int entd(void *arg);
9339 +
9340 +/*
9341 + * set ->comm field of the ent thread to make its state visible to the user level
9342 + */
9343 +#define entd_set_comm(state) \
9344 + snprintf(current->comm, sizeof(current->comm), \
9345 + "ent:%s%s", super->s_id, (state))
9346 +
9347 +/**
9348 + * reiser4_init_entd - initialize entd context and start kernel daemon
9349 + * @super: super block to start ent thread for
9350 + *
9351 + * Creates the entd context, starts the kernel thread and waits until it
9352 + * initializes.
9353 + */
9354 +int reiser4_init_entd(struct super_block *super)
9355 +{
9356 + entd_context *ctx;
9357 +
9358 + assert("nikita-3104", super != NULL);
9359 +
9360 + ctx = get_entd_context(super);
9361 +
9362 + memset(ctx, 0, sizeof *ctx);
9363 + spin_lock_init(&ctx->guard);
9364 + init_waitqueue_head(&ctx->wait);
9365 +#if REISER4_DEBUG
9366 + INIT_LIST_HEAD(&ctx->flushers_list);
9367 +#endif
9368 + /* lists of writepage requests */
9369 + INIT_LIST_HEAD(&ctx->todo_list);
9370 + INIT_LIST_HEAD(&ctx->done_list);
9371 + /* start entd */
9372 + ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9373 + if (IS_ERR(ctx->tsk))
9374 + return PTR_ERR(ctx->tsk);
9375 + return 0;
9376 +}
9377 +
9378 +static void put_wbq(struct wbq *rq)
9379 +{
9380 + iput(rq->mapping->host);
9381 + complete(&rq->completion);
9382 +}
9383 +
9384 +/* ent should be locked */
9385 +static struct wbq *__get_wbq(entd_context * ent)
9386 +{
9387 + struct wbq *wbq;
9388 +
9389 + if (list_empty(&ent->todo_list))
9390 + return NULL;
9391 +
9392 + ent->nr_todo_reqs--;
9393 + wbq = list_entry(ent->todo_list.next, struct wbq, link);
9394 + list_del_init(&wbq->link);
9395 + return wbq;
9396 +}
9397 +
9398 +/* ent thread function */
9399 +static int entd(void *arg)
9400 +{
9401 + struct super_block *super;
9402 + entd_context *ent;
9403 + int done = 0;
9404 +
9405 + super = arg;
9406 + /* do_fork() just copies task_struct into the new
9407 + thread. ->fs_context shouldn't be copied of course. This shouldn't
9408 + be a problem for the rest of the code though.
9409 + */
9410 + current->journal_info = NULL;
9411 +
9412 + ent = get_entd_context(super);
9413 +
9414 + while (!done) {
9415 + try_to_freeze();
9416 +
9417 + spin_lock(&ent->guard);
9418 + while (ent->nr_todo_reqs != 0) {
9419 + struct wbq *rq;
9420 +
9421 + assert("", list_empty(&ent->done_list));
9422 +
9423 + /* take request from the queue head */
9424 + rq = __get_wbq(ent);
9425 + assert("", rq != NULL);
9426 + ent->cur_request = rq;
9427 + spin_unlock(&ent->guard);
9428 +
9429 + entd_set_comm("!");
9430 + entd_flush(super, rq);
9431 +
9432 + put_wbq(rq);
9433 +
9434 + /*
9435 + * wakeup all requestors and iput their inodes
9436 + */
9437 + spin_lock(&ent->guard);
9438 + while (!list_empty(&ent->done_list)) {
9439 + rq = list_entry(ent->done_list.next, struct wbq, link);
9440 + list_del_init(&rq->link);
9441 + ent->nr_done_reqs--;
9442 + spin_unlock(&ent->guard);
9443 + assert("", rq->written == 1);
9444 + put_wbq(rq);
9445 + spin_lock(&ent->guard);
9446 + }
9447 + }
9448 + spin_unlock(&ent->guard);
9449 +
9450 + entd_set_comm(".");
9451 +
9452 + {
9453 + DEFINE_WAIT(__wait);
9454 +
9455 + do {
9456 + prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9457 + if (kthread_should_stop()) {
9458 + done = 1;
9459 + break;
9460 + }
9461 + if (ent->nr_todo_reqs != 0)
9462 + break;
9463 + schedule();
9464 + } while (0);
9465 + finish_wait(&ent->wait, &__wait);
9466 + }
9467 + }
9468 + BUG_ON(ent->nr_todo_reqs != 0);
9469 + return 0;
9470 +}
9471 +
9472 +/**
9473 + * reiser4_done_entd - stop entd kernel thread
9474 + * @super: super block to stop ent thread for
9475 + *
9476 + * It is called on umount. Sends a stop signal to entd and waits until it is
9477 + * handled.
9478 + */
9479 +void reiser4_done_entd(struct super_block *super)
9480 +{
9481 + entd_context *ent;
9482 +
9483 + assert("nikita-3103", super != NULL);
9484 +
9485 + ent = get_entd_context(super);
9486 + assert("zam-1055", ent->tsk != NULL);
9487 + kthread_stop(ent->tsk);
9488 +}
9489 +
9490 +/* called at the beginning of jnode_flush to register the flusher thread with
9491 + * the ent daemon */
9492 +void reiser4_enter_flush(struct super_block *super)
9493 +{
9494 + entd_context *ent;
9495 +
9496 + assert("zam-1029", super != NULL);
9497 + ent = get_entd_context(super);
9498 +
9499 + assert("zam-1030", ent != NULL);
9500 +
9501 + spin_lock(&ent->guard);
9502 + ent->flushers++;
9503 +#if REISER4_DEBUG
9504 + list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9505 +#endif
9506 + spin_unlock(&ent->guard);
9507 +}
9508 +
9509 +/* called at the end of jnode_flush */
9510 +void reiser4_leave_flush(struct super_block *super)
9511 +{
9512 + entd_context *ent;
9513 + int wake_up_ent;
9514 +
9515 + assert("zam-1027", super != NULL);
9516 + ent = get_entd_context(super);
9517 +
9518 + assert("zam-1028", ent != NULL);
9519 +
9520 + spin_lock(&ent->guard);
9521 + ent->flushers--;
9522 + wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9523 +#if REISER4_DEBUG
9524 + list_del_init(&get_current_context()->flushers_link);
9525 +#endif
9526 + spin_unlock(&ent->guard);
9527 + if (wake_up_ent)
9528 + wake_up_process(ent->tsk);
9529 +}
9530 +
9531 +#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9532 +
9533 +static void entd_flush(struct super_block *super, struct wbq *rq)
9534 +{
9535 + reiser4_context ctx;
9536 + int tmp;
9537 +
9538 + init_stack_context(&ctx, super);
9539 + ctx.entd = 1;
9540 + ctx.gfp_mask = GFP_NOFS;
9541 +
9542 + rq->wbc->range_start = page_offset(rq->page);
9543 + rq->wbc->range_end = rq->wbc->range_start +
9544 + (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9545 + tmp = rq->wbc->nr_to_write;
9546 +
9547 + assert("edward-1561", super == rq->wbc->sb);
9548 +
9549 + rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9550 +
9551 + if (rq->wbc->nr_to_write > 0) {
9552 + rq->wbc->range_start = 0;
9553 + rq->wbc->range_end = LLONG_MAX;
9554 + writeback_inodes_wbc(rq->wbc);
9555 + }
9556 + rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9557 +
9558 + reiser4_writeout(super, rq->wbc);
9559 + context_set_commit_async(&ctx);
9560 + reiser4_exit_context(&ctx);
9561 +}
9562 +
9563 +/**
9564 + * write_page_by_ent - ask entd thread to flush this page as part of slum
9565 + * @page: page to be written
9566 + * @wbc: writeback control passed to reiser4_writepage
9567 + *
9568 + * Creates a request, puts it on the entd list of requests, wakes up entd if
9569 + * necessary, and waits until entd completes the request.
9570 + */
9571 +int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9572 +{
9573 + struct super_block *sb;
9574 + struct inode *inode;
9575 + entd_context *ent;
9576 + struct wbq rq;
9577 +
9578 + assert("", PageLocked(page));
9579 + assert("", page->mapping != NULL);
9580 +
9581 + sb = page->mapping->host->i_sb;
9582 + ent = get_entd_context(sb);
9583 + assert("", ent && ent->done == 0);
9584 +
9585 + /*
9586 + * we are going to unlock the page and ask the ent thread to write it.
9587 + * Re-dirty the page before unlocking so that if the ent thread fails
9588 + * to write it, it will remain dirty
9589 + */
9590 + set_page_dirty_notag(page);
9591 +
9592 + /*
9593 + * pin the inode in memory and unlock the page; entd_flush will iput. We
9594 + * cannot iput here because we cannot allow delete_inode to be called here
9595 + */
9596 + inode = igrab(page->mapping->host);
9597 + unlock_page(page);
9598 + if (inode == NULL)
9599 + /* inode is getting freed */
9600 + return 0;
9601 +
9602 + /* init wbq */
9603 + INIT_LIST_HEAD(&rq.link);
9604 + rq.magic = WBQ_MAGIC;
9605 + rq.wbc = wbc;
9606 + rq.page = page;
9607 + rq.mapping = inode->i_mapping;
9608 + rq.node = NULL;
9609 + rq.written = 0;
9610 + init_completion(&rq.completion);
9611 +
9612 + /* add request to entd's list of writepage requests */
9613 + spin_lock(&ent->guard);
9614 + ent->nr_todo_reqs++;
9615 + list_add_tail(&rq.link, &ent->todo_list);
9616 + if (ent->nr_todo_reqs == 1)
9617 + wake_up_process(ent->tsk);
9618 +
9619 + spin_unlock(&ent->guard);
9620 +
9621 + /* wait until entd finishes */
9622 + wait_for_completion(&rq.completion);
9623 +
9624 + if (rq.written)
9625 + /* Eventually ENTD has written the page to disk. */
9626 + return 0;
9627 + return 0;
9628 +}
9629 +
9630 +int wbq_available(void)
9631 +{
9632 + struct super_block *sb = reiser4_get_current_sb();
9633 + entd_context *ent = get_entd_context(sb);
9634 + return ent->nr_todo_reqs;
9635 +}
9636 +
9637 +/*
9638 + * Local variables:
9639 + * c-indentation-style: "K&R"
9640 + * mode-name: "LC"
9641 + * c-basic-offset: 8
9642 + * tab-width: 8
9643 + * fill-column: 79
9644 + * End:
9645 + */
9646 diff -urN linux-2.6.33.orig/fs/reiser4/entd.h linux-2.6.33/fs/reiser4/entd.h
9647 --- linux-2.6.33.orig/fs/reiser4/entd.h 1970-01-01 01:00:00.000000000 +0100
9648 +++ linux-2.6.33/fs/reiser4/entd.h 2010-03-04 19:33:22.000000000 +0100
9649 @@ -0,0 +1,90 @@
9650 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9651 +
9652 +/* Ent daemon. */
9653 +
9654 +#ifndef __ENTD_H__
9655 +#define __ENTD_H__
9656 +
9657 +#include "context.h"
9658 +
9659 +#include <linux/fs.h>
9660 +#include <linux/completion.h>
9661 +#include <linux/wait.h>
9662 +#include <linux/spinlock.h>
9663 +#include <linux/sched.h> /* for struct task_struct */
9664 +
9665 +#define WBQ_MAGIC 0x7876dc76
9666 +
9667 +/* write-back request. */
9668 +struct wbq {
9669 + int magic;
9670 + struct list_head link; /* list head of this list is in entd context */
9671 + struct writeback_control *wbc;
9672 + struct page *page;
9673 + struct address_space *mapping;
9674 + struct completion completion;
9675 + jnode *node; /* set if ent thread captured requested page */
9676 + int written; /* set if ent thread wrote requested page */
9677 +};
9678 +
9679 +/* ent-thread context. This is used to synchronize starting/stopping ent
9680 + * threads. */
9681 +typedef struct entd_context {
9682 + /* wait queue that ent thread waits on for more work. It's
9683 + * signaled by write_page_by_ent(). */
9684 + wait_queue_head_t wait;
9685 + /* spinlock protecting other fields */
9686 + spinlock_t guard;
9687 + /* ent thread */
9688 + struct task_struct *tsk;
9689 + /* set to indicate that ent thread should leave. */
9690 + int done;
9691 + /* counter of active flushers */
9692 + int flushers;
9693 + /*
9694 + * when reiser4_writepage asks entd to write a page - it adds struct
9695 + * wbq to this list
9696 + */
9697 + struct list_head todo_list;
9698 + /* number of elements on the above list */
9699 + int nr_todo_reqs;
9700 +
9701 + struct wbq *cur_request;
9702 + /*
9703 + * when entd writes a page it moves write-back request from todo_list
9704 + * to done_list. This list is used at the end of entd iteration to
9705 + * wakeup requestors and iput inodes.
9706 + */
9707 + struct list_head done_list;
9708 + /* number of elements on the above list */
9709 + int nr_done_reqs;
9710 +
9711 +#if REISER4_DEBUG
9712 + /* list of all active flushers */
9713 + struct list_head flushers_list;
9714 +#endif
9715 +} entd_context;
9716 +
9717 +extern int reiser4_init_entd(struct super_block *);
9718 +extern void reiser4_done_entd(struct super_block *);
9719 +
9720 +extern void reiser4_enter_flush(struct super_block *);
9721 +extern void reiser4_leave_flush(struct super_block *);
9722 +
9723 +extern int write_page_by_ent(struct page *, struct writeback_control *);
9724 +extern int wbq_available(void);
9725 +extern void ent_writes_page(struct super_block *, struct page *);
9726 +
9727 +extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9728 +/* __ENTD_H__ */
9729 +#endif
9730 +
9731 +/* Make Linus happy.
9732 + Local variables:
9733 + c-indentation-style: "K&R"
9734 + mode-name: "LC"
9735 + c-basic-offset: 8
9736 + tab-width: 8
9737 + fill-column: 120
9738 + End:
9739 +*/
9740 diff -urN linux-2.6.33.orig/fs/reiser4/eottl.c linux-2.6.33/fs/reiser4/eottl.c
9741 --- linux-2.6.33.orig/fs/reiser4/eottl.c 1970-01-01 01:00:00.000000000 +0100
9742 +++ linux-2.6.33/fs/reiser4/eottl.c 2010-03-04 19:33:22.000000000 +0100
9743 @@ -0,0 +1,510 @@
9744 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9745 + reiser4/README */
9746 +
9747 +#include "forward.h"
9748 +#include "debug.h"
9749 +#include "key.h"
9750 +#include "coord.h"
9751 +#include "plugin/item/item.h"
9752 +#include "plugin/node/node.h"
9753 +#include "znode.h"
9754 +#include "block_alloc.h"
9755 +#include "tree_walk.h"
9756 +#include "tree_mod.h"
9757 +#include "carry.h"
9758 +#include "tree.h"
9759 +#include "super.h"
9760 +
9761 +#include <linux/types.h> /* for __u?? */
9762 +
9763 +/*
9764 + * Extents on the twig level (EOTTL) handling.
9765 + *
9766 + * EOTTL poses some problems for tree traversal that are best explained
9767 + * by example.
9768 + *
9769 + * Suppose we have block B1 on the twig level with the following items:
9770 + *
9771 + * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9772 + * offset)
9773 + * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9774 + * 2. internal item I2 with key (10:0:0:0)
9775 + *
9776 + * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
9777 + * then intra-node lookup is done. This lookup finished on the E1, because the
9778 + * key we are looking for is larger than the key of E1 and is smaller than key
9779 + * the of I2.
9780 + *
9781 + * Here search is stuck.
9782 + *
9783 + * After some thought it is clear what is wrong here: extents on the twig level
9784 + * break some basic property of the *search* tree (on the pretext that they
9785 + * restore a property of the balanced tree).
9786 + *
9787 + * Said property is the following: if in the internal node of the search tree
9788 + * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9789 + * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9790 + * through the Pointer.
9791 + *
9792 + * This is not true when Pointer is an Extent-Pointer, simply because an extent
9793 + * cannot expand indefinitely to the right to include any item with
9794 + *
9795 + * Key1 <= Key <= Key2.
9796 + *
9797 + * For example, our E1 extent is only responsible for the data with keys
9798 + *
9799 + * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9800 + *
9801 + * so, key range
9802 + *
9803 + * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9804 + *
9805 + * is orphaned: there is no way to get there from the tree root.
9806 + *
9807 + * In other words, extent pointers are different from normal child pointers as
9808 + * far as the search tree is concerned, and this creates the problems above.
9809 + *
9810 + * A possible solution for this problem is to insert our item into the node
9811 + * pointed to by I2. There are some problems though:
9812 + *
9813 + * (1) I2 can be in a different node.
9814 + * (2) E1 can be immediately followed by another extent E2.
9815 + *
9816 + * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9817 + * for locks/coords as necessary.
9818 + *
9819 + * (2) is more complex. Solution here is to insert new empty leaf node and
9820 + * insert internal item between E1 and E2 pointing to said leaf node. This is
9821 + * further complicated by possibility that E2 is in a different node, etc.
9822 + *
9823 + * Problems:
9824 + *
9825 + * (1) if there is an internal item I2 immediately to the right of an extent E1
9826 + * and we decide to insert a new item S1 into the node N2 pointed to by I2,
9827 + * then the key of S1 will be less than the smallest key in N2. Normally, the
9828 + * search checks that the key we are looking for is in the range of keys
9829 + * covered by the node being searched. To work around this situation, while
9830 + * preserving the useful consistency check, a new flag CBK_TRUST_DK was added
9831 + * to the cbk flags bitmask. This flag is automatically set on entrance to the
9832 + * coord_by_key() and is only cleared when we are about to enter the situation
9833 + * described above.
9834 + *
9835 + * (2) If extent E1 is immediately followed by another extent E2 and we are
9836 + * searching for a key that is between E1 and E2, we only have to insert a new
9837 + * empty leaf node when coord_by_key was called for insertion, rather than
9838 + * just for lookup. To distinguish these cases, a new flag CBK_FOR_INSERT was
9839 + * added to the cbk flags bitmask. This flag is automatically set by
9840 + * coord_by_key calls performed by insert_by_key() and friends.
9841 + *
9842 + * (3) Insertion of a new empty leaf node (possibly) requires balancing. In
9843 + * any case it requires modification of node content, which is only possible
9844 + * under a write lock. It may well happen that we only have a read lock on the
9845 + * node where the new internal pointer is to be inserted (common case: lookup
9846 + * of non-existent stat-data that falls between two extents). If only a read
9847 + * lock is held, tree traversal is restarted with lock_level modified so that
9848 + * next time we hit this problem, a write lock will be held. Once we have a
9849 + * write lock, balancing will be performed.
9850 + */
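To make the restart described in (3) concrete, here is a pseudo-code sketch in the style of the example comments used elsewhere in this patch. traverse() is a hypothetical stand-in for the coord_by_key() machinery; handle_eottl() below is what actually sets this state before returning LOOKUP_REST.

/*
 * Sketch (pseudo-code): retry the traversal, escalating to a write lock
 * on the twig level once EOTTL handling asks for a restart:
 *
 *	lock_level = LEAF_LEVEL;
 *	lock_mode = ZNODE_READ_LOCK;
 *	do {
 *		outcome = traverse(tree, key, lock_mode, lock_level);
 *		if (outcome == LOOKUP_REST) {
 *			lock_level = TWIG_LEVEL;
 *			lock_mode = ZNODE_WRITE_LOCK;
 *		}
 *	} while (outcome == LOOKUP_REST);
 */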
9851 +
9852 +/**
9853 + * is_next_item_internal - check whether next item is internal
9854 + * @coord: coordinate of extent item in twig node
9855 + * @key: search key
9856 + * @lh: twig node lock handle
9857 + *
9858 + * Looks at the unit next to @coord. If it is an internal one, 1 is returned
9859 + * and @coord is set to that unit. If that unit is in the right neighbor, @lh
9860 + * is moved to that node and @coord is set to its first unit. If the next item
9861 + * is not internal or does not exist, 0 is returned and @coord and @lh are
9862 + * left unchanged. 2 is returned if a search restart has to be done.
9863 + */
9864 +static int
9865 +is_next_item_internal(coord_t *coord, const reiser4_key * key,
9866 + lock_handle * lh)
9867 +{
9868 + coord_t next;
9869 + lock_handle rn;
9870 + int result;
9871 +
9872 + coord_dup(&next, coord);
9873 + if (coord_next_unit(&next) == 0) {
9874 + /* next unit is in this node */
9875 + if (item_is_internal(&next)) {
9876 + coord_dup(coord, &next);
9877 + return 1;
9878 + }
9879 + assert("vs-3", item_is_extent(&next));
9880 + return 0;
9881 + }
9882 +
9883 + /*
9884 + * next unit either does not exist or is in the right neighbor. If it
9885 + * is in the right neighbor we have to check the right delimiting key
9886 + * because a concurrent thread could get there first and insert an item
9887 + * with a key smaller than @key
9888 + */
9889 + read_lock_dk(current_tree);
9890 + result = keycmp(key, znode_get_rd_key(coord->node));
9891 + read_unlock_dk(current_tree);
9892 + assert("vs-6", result != EQUAL_TO);
9893 + if (result == GREATER_THAN)
9894 + return 2;
9895 +
9896 + /* lock right neighbor */
9897 + init_lh(&rn);
9898 + result = reiser4_get_right_neighbor(&rn, coord->node,
9899 + znode_is_wlocked(coord->node) ?
9900 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9901 + GN_CAN_USE_UPPER_LEVELS);
9902 + if (result == -E_NO_NEIGHBOR) {
9903 + /* we are on the rightmost edge of the tree */
9904 + done_lh(&rn);
9905 + return 0;
9906 + }
9907 +
9908 + if (result) {
9909 + assert("vs-4", result < 0);
9910 + done_lh(&rn);
9911 + return result;
9912 + }
9913 +
9914 + /*
9915 + * check whether a concurrent thread managed to insert an item with
9916 + * a key smaller than @key
9917 + */
9918 + read_lock_dk(current_tree);
9919 + result = keycmp(key, znode_get_ld_key(rn.node));
9920 + read_unlock_dk(current_tree);
9921 + assert("vs-6", result != EQUAL_TO);
9922 + if (result == GREATER_THAN) {
9923 + done_lh(&rn);
9924 + return 2;
9925 + }
9926 +
9927 + result = zload(rn.node);
9928 + if (result) {
9929 + assert("vs-5", result < 0);
9930 + done_lh(&rn);
9931 + return result;
9932 + }
9933 +
9934 + coord_init_first_unit(&next, rn.node);
9935 + if (item_is_internal(&next)) {
9936 + /*
9937 + * next unit is in the right neighbor and is a unit of an internal
9938 + * item. Unlock coord->node. Move @lh to the right neighbor. @coord
9939 + * is set to its first unit.
9940 + */
9941 + coord_dup(coord, &next);
9942 + zrelse(rn.node);
9943 + done_lh(lh);
9944 + move_lh(lh, &rn);
9945 + return 1;
9946 + }
9947 +
9948 + /*
9949 + * next unit is a unit of an extent item. Return without changing
9950 + * @lh and @coord.
9951 + */
9952 + assert("vs-6", item_is_extent(&next));
9953 + zrelse(rn.node);
9954 + done_lh(&rn);
9955 + return 0;
9956 +}
9957 +
9958 +/**
9959 + * rd_key - calculate key of an item next to the given one
9960 + * @coord: position in a node
9961 + * @key: storage for result key
9962 + *
9963 + * @coord is set between items or after the last item in a node. Calculate key
9964 + * of item to the right of @coord.
9965 + */
9966 +static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
9967 +{
9968 + coord_t dup;
9969 +
9970 + assert("nikita-2281", coord_is_between_items(coord));
9971 + coord_dup(&dup, coord);
9972 +
9973 + if (coord_set_to_right(&dup) == 0)
9974 + /* next item is in this node. Return its key. */
9975 + unit_key_by_coord(&dup, key);
9976 + else {
9977 + /*
9978 + * next item either does not exist or is in right
9979 + * neighbor. Return znode's right delimiting key.
9980 + */
9981 + read_lock_dk(current_tree);
9982 + *key = *znode_get_rd_key(coord->node);
9983 + read_unlock_dk(current_tree);
9984 + }
9985 + return key;
9986 +}
9987 +
9988 +/**
9989 + * add_empty_leaf - insert empty leaf between two extents
9990 + * @insert_coord: position in twig node between two extents
9991 + * @lh: twig node lock handle
9992 + * @key: left delimiting key of new node
9993 + * @rdkey: right delimiting key of new node
9994 + *
9995 + * Inserts empty leaf node between two extent items. It is necessary when we
9996 + * have to insert an item on leaf level between two extents (items on the twig
9997 + * level).
9998 + */
9999 +static int
10000 +add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
10001 + const reiser4_key *key, const reiser4_key *rdkey)
10002 +{
10003 + int result;
10004 + carry_pool *pool;
10005 + carry_level *todo;
10006 + reiser4_item_data *item;
10007 + carry_insert_data *cdata;
10008 + carry_op *op;
10009 + znode *node;
10010 + reiser4_tree *tree;
10011 +
10012 + assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
10013 + tree = znode_get_tree(insert_coord->node);
10014 + node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
10015 + if (IS_ERR(node))
10016 + return PTR_ERR(node);
10017 +
10018 + /* setup delimiting keys for node being inserted */
10019 + write_lock_dk(tree);
10020 + znode_set_ld_key(node, key);
10021 + znode_set_rd_key(node, rdkey);
10022 + ON_DEBUG(node->creator = current);
10023 + ON_DEBUG(node->first_key = *key);
10024 + write_unlock_dk(tree);
10025 +
10026 + ZF_SET(node, JNODE_ORPHAN);
10027 +
10028 + /*
10029 + * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
10030 + * carry_insert_data
10031 + */
10032 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
10033 + sizeof(*item) + sizeof(*cdata));
10034 + if (IS_ERR(pool))
10035 + return PTR_ERR(pool);
10036 + todo = (carry_level *) (pool + 1);
10037 + init_carry_level(todo, pool);
10038 +
10039 + item = (reiser4_item_data *) (todo + 3);
10040 + cdata = (carry_insert_data *) (item + 1);
10041 +
10042 + op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
10043 + if (!IS_ERR(op)) {
10044 + cdata->coord = insert_coord;
10045 + cdata->key = key;
10046 + cdata->data = item;
10047 + op->u.insert.d = cdata;
10048 + op->u.insert.type = COPT_ITEM_DATA;
10049 + build_child_ptr_data(node, item);
10050 + item->arg = NULL;
10051 + /* have @insert_coord be set at the inserted item after
10052 + insertion is done */
10053 + todo->track_type = CARRY_TRACK_CHANGE;
10054 + todo->tracked = lh;
10055 +
10056 + result = reiser4_carry(todo, NULL);
10057 + if (result == 0) {
10058 + /*
10059 + * pin node in memory. This is necessary for
10060 + * znode_make_dirty() below.
10061 + */
10062 + result = zload(node);
10063 + if (result == 0) {
10064 + lock_handle local_lh;
10065 +
10066 + /*
10067 + * if we inserted new child into tree we have
10068 + * to mark it dirty so that flush will be able
10069 + * to process it.
10070 + */
10071 + init_lh(&local_lh);
10072 + result = longterm_lock_znode(&local_lh, node,
10073 + ZNODE_WRITE_LOCK,
10074 + ZNODE_LOCK_LOPRI);
10075 + if (result == 0) {
10076 + znode_make_dirty(node);
10077 +
10078 + /*
10079 + * when internal item pointing to @node
10080 + * was inserted into twig node
10081 + * create_hook_internal did not connect
10082 + * it properly because its right
10083 + * neighbor was not known. Do it
10084 + * here
10085 + */
10086 + write_lock_tree(tree);
10087 + assert("nikita-3312",
10088 + znode_is_right_connected(node));
10089 + assert("nikita-2984",
10090 + node->right == NULL);
10091 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
10092 + write_unlock_tree(tree);
10093 + result =
10094 + connect_znode(insert_coord, node);
10095 + ON_DEBUG(if (result == 0) check_dkeys(node););
10096 +
10097 + done_lh(lh);
10098 + move_lh(lh, &local_lh);
10099 + assert("vs-1676", node_is_empty(node));
10100 + coord_init_first_unit(insert_coord,
10101 + node);
10102 + } else {
10103 + warning("nikita-3136",
10104 + "Cannot lock child");
10105 + }
10106 + done_lh(&local_lh);
10107 + zrelse(node);
10108 + }
10109 + }
10110 + } else
10111 + result = PTR_ERR(op);
10112 + zput(node);
10113 + done_carry_pool(pool);
10114 + return result;
10115 +}
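The pointer casts in add_empty_leaf() rely on everything living inside the single init_carry_pool() allocation; a sketch of that layout (field widths illustrative):

/*
 *  [ carry_pool | todo[0] | todo[1] | todo[2] | reiser4_item_data | carry_insert_data ]
 *    ^pool        ^todo  = (carry_level *)(pool + 1)
 *                                           ^item  = (reiser4_item_data *)(todo + 3)
 *                                                               ^cdata = (carry_insert_data *)(item + 1)
 */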
10116 +
10117 +/**
10118 + * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
10119 + * @h: search handle
10120 + * @outcome: flag saying whether search has to restart or is done
10121 + *
10122 + * Handles search on the twig level. If this function completes the search
10123 + * itself, it returns 1. If the search has to go one level down, 0 is
10124 + * returned. If an error happens, LOOKUP_DONE is returned via @outcome and
10125 + * the error code is saved in @h->result.
10126 + */
10127 +int handle_eottl(cbk_handle *h, int *outcome)
10128 +{
10129 + int result;
10130 + reiser4_key key;
10131 + coord_t *coord;
10132 +
10133 + coord = h->coord;
10134 +
10135 + if (h->level != TWIG_LEVEL ||
10136 + (coord_is_existing_item(coord) && item_is_internal(coord))) {
10137 + /* Continue to traverse tree downward. */
10138 + return 0;
10139 + }
10140 +
10141 + /*
10142 + * make sure that @h->coord is set to a twig node and that it is
10143 + * either set to an extent item or positioned after an extent item
10144 + */
10145 + assert("vs-356", h->level == TWIG_LEVEL);
10146 + assert("vs-357", ({
10147 + coord_t lcoord;
10148 + coord_dup(&lcoord, coord);
10149 + check_me("vs-733", coord_set_to_left(&lcoord) == 0);
10150 + item_is_extent(&lcoord);
10151 + }
10152 + ));
10153 +
10154 + if (*outcome == NS_FOUND) {
10155 + /* we have found desired key on twig level in extent item */
10156 + h->result = CBK_COORD_FOUND;
10157 + *outcome = LOOKUP_DONE;
10158 + return 1;
10159 + }
10160 +
10161 + if (!(h->flags & CBK_FOR_INSERT)) {
10162 + /* tree traversal is not for insertion. Just return
10163 + CBK_COORD_NOTFOUND. */
10164 + h->result = CBK_COORD_NOTFOUND;
10165 + *outcome = LOOKUP_DONE;
10166 + return 1;
10167 + }
10168 +
10169 + /* take a look at the item to the right of h->coord */
10170 + result = is_next_item_internal(coord, h->key, h->active_lh);
10171 + if (unlikely(result < 0)) {
10172 + h->error = "get_right_neighbor failed";
10173 + h->result = result;
10174 + *outcome = LOOKUP_DONE;
10175 + return 1;
10176 + }
10177 + if (result == 0) {
10178 + /*
10179 + * item to the right is also an extent one. Allocate a new node
10180 + * and insert a pointer to it after item h->coord.
10181 + *
10182 + * This is a result of extents being located at the twig
10183 + * level. For explanation, see the comment just above
10184 + * is_next_item_internal().
10185 + */
10186 + znode *loaded;
10187 +
10188 + if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
10189 + /*
10190 + * we got node read locked, restart coord_by_key to
10191 + * have write lock on twig level
10192 + */
10193 + h->lock_level = TWIG_LEVEL;
10194 + h->lock_mode = ZNODE_WRITE_LOCK;
10195 + *outcome = LOOKUP_REST;
10196 + return 1;
10197 + }
10198 +
10199 + loaded = coord->node;
10200 + result =
10201 + add_empty_leaf(coord, h->active_lh, h->key,
10202 + rd_key(coord, &key));
10203 + if (result) {
10204 + h->error = "could not add empty leaf";
10205 + h->result = result;
10206 + *outcome = LOOKUP_DONE;
10207 + return 1;
10208 + }
10209 + /* the added empty leaf is locked (h->active_lh), its parent
10210 + node is unlocked, and h->coord is set to EMPTY_NODE */
10211 + assert("vs-13", coord->between == EMPTY_NODE);
10212 + assert("vs-14", znode_is_write_locked(coord->node));
10213 + assert("vs-15",
10214 + WITH_DATA(coord->node, node_is_empty(coord->node)));
10215 + assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10216 + assert("vs-17", coord->node == h->active_lh->node);
10217 + *outcome = LOOKUP_DONE;
10218 + h->result = CBK_COORD_NOTFOUND;
10219 + return 1;
10220 + } else if (result == 1) {
10221 + /*
10222 + * this is the special case mentioned in the comment on
10223 + * tree.h:cbk_flags. We have found an internal item immediately
10224 + * on the right of an extent, and we are going to insert a new
10225 + * item there. The key of the item we are going to insert is
10226 + * smaller than the leftmost key in the node pointed to by said
10227 + * internal item (otherwise search wouldn't have come to the
10228 + * extent in the first place).
10229 + *
10230 + * This is a result of extents being located at the twig
10231 + * level. For explanation, see comment just above
10232 + * is_next_item_internal().
10233 + */
10234 + h->flags &= ~CBK_TRUST_DK;
10235 + } else {
10236 + assert("vs-8", result == 2);
10237 + *outcome = LOOKUP_REST;
10238 + return 1;
10239 + }
10240 + assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10241 + return 0;
10242 +}
10243 +
10244 +/*
10245 + * Local variables:
10246 + * c-indentation-style: "K&R"
10247 + * mode-name: "LC"
10248 + * c-basic-offset: 8
10249 + * tab-width: 8
10250 + * fill-column: 120
10251 + * scroll-step: 1
10252 + * End:
10253 + */
10254 diff -urN linux-2.6.33.orig/fs/reiser4/estimate.c linux-2.6.33/fs/reiser4/estimate.c
10255 --- linux-2.6.33.orig/fs/reiser4/estimate.c 1970-01-01 01:00:00.000000000 +0100
10256 +++ linux-2.6.33/fs/reiser4/estimate.c 2010-03-04 19:33:22.000000000 +0100
10257 @@ -0,0 +1,129 @@
10258 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
10259 + reiser4/README */
10260 +
10261 +#include "debug.h"
10262 +#include "dformat.h"
10263 +#include "tree.h"
10264 +#include "carry.h"
10265 +#include "inode.h"
10266 +#include "plugin/cluster.h"
10267 +#include "plugin/item/ctail.h"
10268 +
10269 +/* This returns how many nodes might get dirty and how many might get added
10270 +   if @children nodes are dirtied.
10271 +
10272 +   The number of internal nodes which will get dirty or allocated we estimate
10273 +   as roughly 10% of the children (see ten_percent below) + 1 balancing. 1
10274 +   balancing is 2 neighbours, 2 new blocks and the current block on the leaf
10275 +   level, 2 neighbour nodes + the current (or 1 neighbour and 1 new and the
10276 +   current) on the twig level, 2 neighbour nodes on upper levels and 1 for a
10277 +   new root. So 5 for the leaf level, 3 for the twig level, 2 on upper levels
10278 +   + 1 for the root.
10279 +
10280 +   Do not count the current node of the lowest level here - that is overhead
10281 +   only.
10282 +
10283 +   @children is almost always 1 here. The exception is flow insertion. */
10284 +static reiser4_block_nr
10285 +max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
10286 +{
10287 + reiser4_block_nr ten_percent;
10288 +
10289 + ten_percent = ((103 * children) >> 10);
10290 +
10291 + /* If too many balancings happen at the same time, the tree height can
10292 + grow by more than 1. Assume that once tree_height is at least 5, it
10293 + can grow by 1 only. */
10294 + return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10295 +}
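To make the arithmetic concrete, here is a minimal user-space copy of the function above (the typedefs are stand-ins for the kernel types; a sketch, not part of the patch), showing the value for the common case of one dirtied child in a height-4 tree:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t reiser4_block_nr;	/* stand-in for the kernel type */
typedef int tree_level;			/* stand-in for the kernel type */

static reiser4_block_nr
max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
{
	reiser4_block_nr ten_percent = (103 * children) >> 10;

	return (tree_height < 5 ? 5 : tree_height) * 2 + 4 + ten_percent;
}

int main(void)
{
	/* one child, height 4: (5 * 2) + 4 + 0 = 14 blocks of overhead,
	   so one item insertion is estimated at 1 + 14 = 15 blocks */
	printf("%llu\n",
	       (unsigned long long)(1 + max_balance_overhead(1, 4)));
	return 0;
}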
10296 +
10297 +/* this returns maximal possible number of nodes which can be modified plus
10298 + number of new nodes which can be required to perform insertion of one item
10299 + into the tree */
10300 +/* it is only called when tree height changes, or gets initialized */
10301 +reiser4_block_nr calc_estimate_one_insert(tree_level height)
10302 +{
10303 + return 1 + max_balance_overhead(1, height);
10304 +}
10305 +
10306 +reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10307 +{
10308 + return tree->estimate_one_insert;
10309 +}
10310 +
10311 +/* this returns maximal possible number of nodes which can be modified plus
10312 + number of new nodes which can be required to perform insertion of one unit
10313 + into an item in the tree */
10314 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10315 +{
10316 + /* estimate insert into item just like item insertion */
10317 + return tree->estimate_one_insert;
10318 +}
10319 +
10320 +reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10321 +{
10322 + /* on item removal reiser4 does not try to pack nodes more compactly,
10323 + so only one node may be dirtied on the leaf level */
10324 + return tree->estimate_one_insert;
10325 +}
10326 +
10327 +/* on the leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes
10328 + and dirty 3 existing nodes (the insert point and both its neighbors).
10329 + max_balance_overhead() estimates the number of blocks which may change or
10330 + get added on internal levels */
10331 +reiser4_block_nr estimate_insert_flow(tree_level height)
10332 +{
10333 + return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10334 + CARRY_FLOW_NEW_NODES_LIMIT,
10335 + height);
10336 +}
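For instance, assuming CARRY_FLOW_NEW_NODES_LIMIT == 20 and a tree of height 10 (both values assumed here purely for illustration):

/*
 *   estimate_insert_flow(10) = 3 + 20 + max_balance_overhead(23, 10)
 *                            = 23 + (10 * 2 + 4 + ((103 * 23) >> 10))
 *                            = 23 + (20 + 4 + 2)
 *                            = 49
 */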
10337 +
10338 +/* returns the maximal number of nodes that can be occupied by a disk cluster */
10339 +static reiser4_block_nr estimate_cluster(struct inode *inode, int unprepped)
10340 +{
10341 + int per_cluster;
10342 + per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10343 + return 3 + per_cluster +
10344 + max_balance_overhead(3 + per_cluster,
10345 + REISER4_MAX_ZTREE_HEIGHT);
10346 +}
10347 +
10348 +/* how many nodes might get dirty and added
10349 + during insertion of a disk cluster */
10350 +reiser4_block_nr estimate_insert_cluster(struct inode *inode)
10351 +{
10352 + return estimate_cluster(inode, 1); /* 24 */
10353 +}
10354 +
10355 +/* how many nodes might get dirty and added
10356 + during update of a (prepped or unprepped) disk cluster */
10357 +reiser4_block_nr estimate_update_cluster(struct inode *inode)
10358 +{
10359 + return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10360 +}
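The /* 44 */ figure above can be reproduced by hand, assuming 4K pages (so a 64K logical cluster gives cluster_nrpages() == 16) and REISER4_MAX_ZTREE_HEIGHT == 10:

/*
 *   estimate_update_cluster = 3 + 16 + max_balance_overhead(3 + 16, 10)
 *                           = 19 + (10 * 2 + 4 + ((103 * 19) >> 10))
 *                           = 19 + (20 + 4 + 1)
 *                           = 44
 */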
10361 +
10362 +/* How many nodes occupied by a disk cluster might get dirty.
10363 + Note that this estimation is not precise (i.e. a disk cluster
10364 + can occupy more nodes).
10365 + Q: Why don't we use a precise estimation?
10366 + A: 1. Because a precise estimation is fairly bad: 65536 nodes
10367 + for a 64K logical cluster would mean 256M of dead space on
10368 + a partition.
10369 + 2. It is a very rare case when a disk cluster occupies more
10370 + nodes than this estimation returns.
10371 +*/
10372 +reiser4_block_nr estimate_dirty_cluster(struct inode *inode)
10373 +{
10374 + return cluster_nrpages(inode) + 4;
10375 +}
10376 +
10377 +/* Make Linus happy.
10378 + Local variables:
10379 + c-indentation-style: "K&R"
10380 + mode-name: "LC"
10381 + c-basic-offset: 8
10382 + tab-width: 8
10383 + fill-column: 120
10384 + scroll-step: 1
10385 + End:
10386 +*/
10387 diff -urN linux-2.6.33.orig/fs/reiser4/export_ops.c linux-2.6.33/fs/reiser4/export_ops.c
10388 --- linux-2.6.33.orig/fs/reiser4/export_ops.c 1970-01-01 01:00:00.000000000 +0100
10389 +++ linux-2.6.33/fs/reiser4/export_ops.c 2010-03-04 19:33:22.000000000 +0100
10390 @@ -0,0 +1,328 @@
10391 +/* Copyright 2005 by Hans Reiser, licensing governed by
10392 + * reiser4/README */
10393 +
10394 +#include "inode.h"
10395 +#include "plugin/plugin.h"
10396 +
10397 +/*
10398 + * Supported file-handle types
10399 + */
10400 +typedef enum {
10401 + FH_WITH_PARENT = 0x10, /* file handle with parent */
10402 + FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10403 +} reiser4_fhtype;
10404 +
10405 +#define NFSERROR (255)
10406 +
10407 +/* initialize place-holder for object */
10408 +static void object_on_wire_init(reiser4_object_on_wire *o)
10409 +{
10410 + o->plugin = NULL;
10411 +}
10412 +
10413 +/* finish with @o */
10414 +static void object_on_wire_done(reiser4_object_on_wire *o)
10415 +{
10416 + if (o->plugin != NULL)
10417 + o->plugin->wire.done(o);
10418 +}
10419 +
10420 +/*
10421 + * read serialized object identity from @addr and store information about
10422 + * object in @obj. This is dual to encode_inode().
10423 + */
10424 +static char *decode_inode(struct super_block *s, char *addr,
10425 + reiser4_object_on_wire * obj)
10426 +{
10427 + file_plugin *fplug;
10428 +
10429 + /* identifier of object plugin is stored in the first two bytes,
10430 + * followed by... */
10431 + fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10432 + if (fplug != NULL) {
10433 + addr += sizeof(d16);
10434 + obj->plugin = fplug;
10435 + assert("nikita-3520", fplug->wire.read != NULL);
10436 + /* plugin specific encoding of object identity. */
10437 + addr = fplug->wire.read(addr, obj);
10438 + } else
10439 + addr = ERR_PTR(RETERR(-EINVAL));
10440 + return addr;
10441 +}
10442 +
10443 +static struct dentry *reiser4_get_dentry(struct super_block *super,
10444 + void *data);
10445 +/**
10446 + * reiser4_decode_fh: decode on-wire object - helper function
10447 + * for fh_to_dentry, fh_to_parent export operations;
10448 + * @super: super block;
10449 + * @addr: on-wire object to be decoded;
10450 + *
10451 + * Returns dentry referring to the object being decoded.
10452 + */
10453 +static struct dentry *reiser4_decode_fh(struct super_block * super,
10454 + char * addr)
10455 +{
10456 + reiser4_object_on_wire object;
10457 +
10458 + object_on_wire_init(&object);
10459 +
10460 + addr = decode_inode(super, addr, &object);
10461 + if (!IS_ERR(addr)) {
10462 + struct dentry *d;
10463 + d = reiser4_get_dentry(super, &object);
10464 + if (d != NULL && !IS_ERR(d))
10465 + /* FIXME check for -ENOMEM */
10466 + reiser4_get_dentry_fsdata(d)->stateless = 1;
10467 + addr = (char *)d;
10468 + }
10469 + object_on_wire_done(&object);
10470 + return (void *)addr;
10471 +}
10472 +
10473 +static struct dentry *reiser4_fh_to_dentry(struct super_block *sb,
10474 + struct fid *fid,
10475 + int fh_len, int fh_type)
10476 +{
10477 + reiser4_context *ctx;
10478 + struct dentry *d;
10479 +
10480 + assert("edward-1536",
10481 + fh_type == FH_WITH_PARENT || fh_type == FH_WITHOUT_PARENT);
10482 +
10483 + ctx = reiser4_init_context(sb);
10484 + if (IS_ERR(ctx))
10485 + return (struct dentry *)ctx;
10486 +
10487 + d = reiser4_decode_fh(sb, (char *)fid->raw);
10488 +
10489 + reiser4_exit_context(ctx);
10490 + return d;
10491 +}
10492 +
10493 +static struct dentry *reiser4_fh_to_parent(struct super_block *sb,
10494 + struct fid *fid,
10495 + int fh_len, int fh_type)
10496 +{
10497 + char * addr;
10498 + struct dentry * d;
10499 + reiser4_context *ctx;
10500 + file_plugin *fplug;
10501 +
10502 + if (fh_type == FH_WITHOUT_PARENT)
10503 + return NULL;
10504 + assert("edward-1537", fh_type == FH_WITH_PARENT);
10505 +
10506 + ctx = reiser4_init_context(sb);
10507 + if (IS_ERR(ctx))
10508 + return (struct dentry *)ctx;
10509 + addr = (char *)fid->raw;
10510 + /* extract 2-bytes file plugin id */
10511 + fplug = file_plugin_by_disk_id(reiser4_get_tree(sb), (d16 *)addr);
10512 + if (fplug == NULL) {
10513 + d = ERR_PTR(RETERR(-EINVAL));
10514 + goto exit;
10515 + }
10516 + addr += sizeof(d16);
10517 + /* skip previously encoded object */
10518 + addr = fplug->wire.read(addr, NULL /* skip */);
10519 + if (IS_ERR(addr)) {
10520 + d = (struct dentry *)addr;
10521 + goto exit;
10522 + }
10523 + /* extract and decode the parent object */
10524 + d = reiser4_decode_fh(sb, addr);
10525 + exit:
10526 + reiser4_exit_context(ctx);
10527 + return d;
10528 +}
10529 +
10530 +/*
10531 + * Object serialization support.
10532 + *
10533 + * To support knfsd file system provides export_operations that are used to
10534 + * construct and interpret NFS file handles. As a generalization of this,
10535 + * reiser4 object plugins have serialization support: it provides methods to
10536 + * create on-wire representation of identity of reiser4 object, and
10537 + * re-create/locate object given its on-wire identity.
10538 + *
10539 + */
10540 +
10541 +/*
10542 + * return number of bytes that on-wire representation of @inode's identity
10543 + * consumes.
10544 + */
10545 +static int encode_inode_size(struct inode *inode)
10546 +{
10547 + assert("nikita-3514", inode != NULL);
10548 + assert("nikita-3515", inode_file_plugin(inode) != NULL);
10549 + assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10550 +
10551 + return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10552 +}
10553 +
10554 +/*
10555 + * store on-wire representation of @inode's identity at the area beginning at
10556 + * @start.
10557 + */
10558 +static char *encode_inode(struct inode *inode, char *start)
10559 +{
10560 + assert("nikita-3517", inode != NULL);
10561 + assert("nikita-3518", inode_file_plugin(inode) != NULL);
10562 + assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10563 +
10564 + /*
10565 + * first, store two-byte identifier of object plugin, then
10566 + */
10567 + save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10568 + (d16 *) start);
10569 + start += sizeof(d16);
10570 + /*
10571 + * call plugin to serialize object's identity
10572 + */
10573 + return inode_file_plugin(inode)->wire.write(inode, start);
10574 +}
10575 +
10576 +/* the number of 32-bit words in the handle is returned via @lenp. 255
10577 + * (NFSERROR) is returned if the file handle cannot be stored */
10578 +/**
10579 + * reiser4_encode_fh - encode_fh of export operations
10580 + * @dentry: object to serialize
10581 + * @fh: buffer to store the file handle in
10582 + * @lenp: in: buffer size in 32-bit words; out: number of words used
10583 + * @need_parent: whether to encode the parent object as well
10584 + *
10585 + */
10586 +static int
10587 +reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10588 + int need_parent)
10589 +{
10590 + struct inode *inode;
10591 + struct inode *parent;
10592 + char *addr;
10593 + int need;
10594 + int delta;
10595 + int result;
10596 + reiser4_context *ctx;
10597 +
10598 + /*
10599 + * knfsd asks us to serialize the object in @dentry, and, optionally,
10600 + * its parent (if need_parent != 0).
10601 + *
10602 + * encode_inode() and encode_inode_size() are used to build the
10603 + * representation of the object and its parent. All hard work is done
10604 + * by object plugins.
10605 + */
10606 + inode = dentry->d_inode;
10607 + parent = dentry->d_parent->d_inode;
10608 +
10609 + addr = (char *)fh;
10610 +
10611 + need = encode_inode_size(inode);
10612 + if (need < 0)
10613 + return NFSERROR;
10614 + if (need_parent) {
10615 + delta = encode_inode_size(parent);
10616 + if (delta < 0)
10617 + return NFSERROR;
10618 + need += delta;
10619 + }
10620 +
10621 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
10622 + if (IS_ERR(ctx))
10623 + return PTR_ERR(ctx);
10624 +
10625 + if (need <= sizeof(__u32) * (*lenp)) {
10626 + addr = encode_inode(inode, addr);
10627 + if (need_parent)
10628 + addr = encode_inode(parent, addr);
10629 +
10630 + /* store in *lenp the number of 32-bit words required for the file
10631 + * handle. */
10632 + *lenp = (need + sizeof(__u32) - 1) >> 2;
10633 + result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10634 + } else
10635 + /* not enough space in the file handle */
10636 + result = NFSERROR;
10637 + reiser4_exit_context(ctx);
10638 + return result;
10639 +}
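A worked example of the size check and word count above, with hypothetical byte counts:

/*
 * Example: suppose the object serializes to 2 + 16 = 18 bytes and the
 * parent is not requested. With *lenp == 5 (5 * 4 = 20 bytes available)
 * the handle fits and *lenp is set to (18 + 3) >> 2 = 5 32-bit words;
 * with *lenp == 4 (only 16 bytes available) NFSERROR is returned.
 */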
10640 +
10641 +/**
10642 + * reiser4_get_dentry_parent - get_parent of export operations
10643 + * @child: dentry whose parent is to be found
10644 + *
10645 + */
10646 +static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10647 +{
10648 + struct inode *dir;
10649 + dir_plugin *dplug;
10650 + struct dentry *result;
10651 + reiser4_context *ctx;
10652 +
10653 + assert("nikita-3527", child != NULL);
10654 +
10655 + dir = child->d_inode;
10656 + assert("nikita-3529", dir != NULL);
10657 +
10658 + ctx = reiser4_init_context(dir->i_sb);
10659 + if (IS_ERR(ctx))
10660 + return (void *)ctx;
10661 +
10662 + dplug = inode_dir_plugin(dir);
10663 + assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10664 +
10665 + if (unlikely(dplug == NULL)) {
10666 + reiser4_exit_context(ctx);
10667 + return ERR_PTR(RETERR(-ENOTDIR));
10668 + }
10669 + result = dplug->get_parent(dir);
10670 + reiser4_exit_context(ctx);
10671 + return result;
10672 +}
10673 +
10674 +/**
10675 + * reiser4_get_dentry - get_dentry of export operations
10676 + * @super: super block of the file system
10677 + * @data: on-wire object identity (reiser4_object_on_wire)
10678 + *
10679 + *
10680 + */
10681 +static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10682 +{
10683 + reiser4_object_on_wire *o;
10684 +
10685 + assert("nikita-3522", super != NULL);
10686 + assert("nikita-3523", data != NULL);
10687 + /*
10688 + * this is only supposed to be called by
10689 + *
10690 + * reiser4_decode_fh->find_exported_dentry
10691 + *
10692 + * so, reiser4_context should be here already.
10693 + */
10694 + assert("nikita-3526", is_in_reiser4_context());
10695 +
10696 + o = (reiser4_object_on_wire *)data;
10697 + assert("nikita-3524", o->plugin != NULL);
10698 + assert("nikita-3525", o->plugin->wire.get != NULL);
10699 +
10700 + return o->plugin->wire.get(super, o);
10701 +}
10702 +
10703 +struct export_operations reiser4_export_operations = {
10704 + .encode_fh = reiser4_encode_fh,
10705 + .fh_to_dentry = reiser4_fh_to_dentry,
10706 + .fh_to_parent = reiser4_fh_to_parent,
10707 + .get_parent = reiser4_get_dentry_parent,
10708 +};
10709 +
10710 +/*
10711 + * Local variables:
10712 + * c-indentation-style: "K&R"
10713 + * mode-name: "LC"
10714 + * c-basic-offset: 8
10715 + * tab-width: 8
10716 + * fill-column: 79
10717 + * End:
10718 + */
10719 diff -urN linux-2.6.33.orig/fs/reiser4/flush.c linux-2.6.33/fs/reiser4/flush.c
10720 --- linux-2.6.33.orig/fs/reiser4/flush.c 1970-01-01 01:00:00.000000000 +0100
10721 +++ linux-2.6.33/fs/reiser4/flush.c 2010-03-04 19:33:22.000000000 +0100
10722 @@ -0,0 +1,3703 @@
10723 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
10724 + reiser4/README */
10725 +
10726 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10727 +
10728 +#include "forward.h"
10729 +#include "debug.h"
10730 +#include "dformat.h"
10731 +#include "key.h"
10732 +#include "coord.h"
10733 +#include "plugin/item/item.h"
10734 +#include "plugin/plugin.h"
10735 +#include "plugin/object.h"
10736 +#include "txnmgr.h"
10737 +#include "jnode.h"
10738 +#include "znode.h"
10739 +#include "block_alloc.h"
10740 +#include "tree_walk.h"
10741 +#include "carry.h"
10742 +#include "tree.h"
10743 +#include "vfs_ops.h"
10744 +#include "inode.h"
10745 +#include "page_cache.h"
10746 +#include "wander.h"
10747 +#include "super.h"
10748 +#include "entd.h"
10749 +#include "reiser4.h"
10750 +#include "flush.h"
10751 +#include "writeout.h"
10752 +
10753 +#include <asm/atomic.h>
10754 +#include <linux/fs.h> /* for struct super_block */
10755 +#include <linux/mm.h> /* for struct page */
10756 +#include <linux/bio.h> /* for struct bio */
10757 +#include <linux/pagemap.h>
10758 +#include <linux/blkdev.h>
10759 +
10760 +/* IMPLEMENTATION NOTES */
10761 +
10762 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of
10763 + assigning a total order to the nodes of the tree in which the parent is
10764 + placed before its children, which are ordered (recursively) in left-to-right
10765 + order. When we speak of a "parent-first preceder", it describes the node that
10766 + "came before in forward parent-first order". When we speak of a "parent-first
10767 + follower", it describes the node that "comes next in parent-first order"
10768 + (alternatively the node that "came before in reverse parent-first order").
10769 +
10770 + The following pseudo-code prints the nodes of a tree in forward parent-first
10771 + order:
10772 +
10773 + void parent_first (node)
10774 + {
10775 + print_node (node);
10776 + if (node->level > leaf) {
10777 + for (i = 0; i < num_children; i += 1) {
10778 + parent_first (node->child[i]);
10779 + }
10780 + }
10781 + }
10782 +*/
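As a concrete illustration of the ordering this pseudo-code produces (the example tree is chosen here; it is not from the original comment):

/*
 * Example: for a root R with internal children I1 and I2, where I1 has
 * leaves A and B, and I2 has leaf C, the pseudo-code above prints:
 *
 *	R, I1, A, B, I2, C
 *
 * which is the forward parent-first order of that tree.
 */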
10783 +
10784 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block
10785 + allocation so that a left-to-right scan of the tree's data (i.e., the leaves
10786 + in left-to-right order) can be accomplished with sequential reads, which
10787 + results in reading nodes in their parent-first order. This is a
10788 + read-optimization aspect of the flush algorithm, and there is also a
10789 + write-optimization aspect, which is that we wish to make large sequential
10790 + writes to the disk by allocating or reallocating blocks so that they can be
10791 + written in sequence. Sometimes the read-optimization and write-optimization
10792 + goals conflict with each other, as we discuss in more detail below.
10793 +*/
10794 +
10795 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers.
10796 + Here are the relevant jnode->state bits and their relevance to flush:
10797 +
10798 + JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be
10799 + written it must be allocated first. In order to be considered allocated,
10800 + the jnode must have exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These
10801 + two bits are exclusive, and all dirtied jnodes eventually have one of these
10802 + bits set during each transaction.
10803 +
10804 + JNODE_CREATED: The node was freshly created in its transaction and has no
10805 + previous block address, so it is unconditionally assigned to be relocated,
10806 + although this is mainly for code-convenience. It is not being 'relocated'
10807 + from anything, but in almost every regard it is treated as part of the
10808 + relocate set. The JNODE_CREATED bit remains set even after JNODE_RELOC is
10809 + set, so the actual relocate can be distinguished from the
10810 + created-and-allocated set easily: relocate-set members (belonging to the
10811 + preserve-set) have (JNODE_RELOC) set and created-set members which have no
10812 + previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10813 +
10814 + JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm
10815 + made the decision to maintain the pre-existing location for this node and
10816 + it will be written to the wandered-log.
10817 +
10818 + JNODE_RELOC: The flush algorithm made the decision to relocate this block
10819 + (if it was not created, see note above). A block with JNODE_RELOC set is
10820 + eligible for early-flushing and may be submitted during flush_empty_queues.
10821 + When the JNODE_RELOC bit is set on a znode, the parent node's internal item
10822 + is modified and the znode is rehashed.
10823 +
10824 + JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm
10825 + scans the node and calls the plugin->f.squeeze() method for its items. By
10826 + this technique we update disk clusters of cryptcompress objects. Also, if
10827 + the leftmost point found by the flush scan has this flag set (races with
10828 + write(), a rare case), the flush algorithm decides to pass it to
10829 + squalloc() in spite of its flushprepped status: for squeezing, not for
10830 + repeated allocation.
10831 +
10832 + JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode
10833 + into its flush queue. This means the jnode is not on any clean or dirty
10834 + list; instead it is moved to the private list of one of the flush queue
10835 + objects (see flush_queue.h). This prevents multiple concurrent flushes from
10836 + attempting to start flushing from the same node.
10837 +
10838 + (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10839 + squeeze-and-allocate on a node while its children are actively being
10840 + squeezed and allocated. This flag was created to avoid submitting a write
10841 + request for a node while its children are still being allocated and
10842 + squeezed. Then the flush queue was re-implemented to allow an unlimited
10843 + number of nodes to be queued. Support for this flag was commented out because
10844 + we decided that there was no reason to submit queued nodes before
10845 + jnode_flush() finishes. However, current code calls fq_write() during a
10846 + slum traversal and may submit "busy nodes" to disk. Probably we can
10847 + re-enable the JNODE_FLUSH_BUSY bit support in future.
10848 +
10849 + With these state bits, we describe a test used frequently in the code below,
10850 + jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()).
10851 + The test for "flushprepped" returns true if any of the following are true:
10852 +
10853 + - The node is not dirty
10854 + - The node has JNODE_RELOC set
10855 + - The node has JNODE_OVRWR set
10856 +
10857 + If either the node is not dirty or it has already been processed by flush
10858 + (and assigned JNODE_OVRWR or JNODE_RELOC), then it is prepped. If
10859 + jnode_is_flushprepped() returns false, flush has work to do on that node.
10860 +*/
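A minimal sketch of that test as code; JF_ISSET is assumed to be the jnode flag accessor (the znode variant ZF_SET appears elsewhere in this patch), so treat the helper name as illustrative rather than definitive:

/* sketch: prepped == no (re)allocation decision pending for this node */
static inline int sketch_is_flushprepped(jnode *node)
{
	return !JF_ISSET(node, JNODE_DIRTY) ||
		JF_ISSET(node, JNODE_RELOC) ||
		JF_ISSET(node, JNODE_OVRWR);
}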
10861 +
10862 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10863 + flushprepped twice (unless an explicit call to flush_unprep is made as
10864 + described in detail below). For example a node is dirtied, allocated, and
10865 + then early-flushed to disk and set clean. Before the transaction commits, the
10866 + page is dirtied again and, due to memory pressure, the node is flushed again.
10867 + The flush algorithm will not relocate the node to a new disk location, it
10868 + will simply write it to the same, previously relocated position again.
10869 +*/
10870 +
10871 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm
10872 + where we start at a leaf node and allocate in parent-first order by iterating
10873 + to the right. At each step of the iteration, we check for the right neighbor.
10874 + Before advancing to the right neighbor, we check if the current position and
10875 + the right neighbor share the same parent. If they do not share the same
10876 + parent, the parent is allocated before the right neighbor.
10877 +
10878 + This process goes recursively up the tree and squeezes nodes level by level as
10879 + long as the right neighbor and the current position have different parents,
10880 + then it allocates the right-neighbors-with-different-parents on the way back
10881 + down. This process is described in more detail in
10882 + flush_squalloc_changed_ancestor and the recursive function
10883 + squalloc_one_changed_ancestor. But the purpose here is not so much to
10884 + discuss the specifics of the bottom-up approach as it is to contrast the
10885 + bottom-up and top-down approaches.
10886 +
10887 + The top-down algorithm was implemented earlier (April-May 2002). In the
10888 + top-down approach, we find a starting point by scanning left along each level
10889 + past dirty nodes, then going up and repeating the process until the left node
10890 + and the parent node are clean. We then perform a parent-first traversal from
10891 + the starting point, which makes allocating in parent-first order trivial.
10892 + After one subtree has been allocated in this manner, we move to the right,
10893 + try moving upward, then repeat the parent-first traversal.
10894 +
10895 + Both approaches have problems that need to be addressed. Both are
10896 + approximately the same amount of code, but the bottom-up approach has
10897 + advantages in the order it acquires locks which, at the very least, make it
10898 + the better approach. At first glance each one makes the other one look
10899 + simpler, so it is important to remember a few of the problems with each one.
10900 +
10901 + Main problem with the top-down approach: When you encounter a clean child
10902 + during the parent-first traversal, what do you do? You would like to avoid
10903 + searching through a large tree of nodes just to find a few dirty leaves at
10904 + the bottom, and there is not an obvious solution. One of the advantages of
10905 + the top-down approach is that during the parent-first traversal you check
10906 + every child of a parent to see if it is dirty. In this way, the top-down
10907 + approach easily handles the main problem of the bottom-up approach:
10908 + unallocated children.
10909 +
10910 + The unallocated children problem is that before writing a node to disk we
10911 + must make sure that all of its children are allocated. Otherwise, writing
10912 + the node means extra I/O because the node will have to be written again
10913 + when the child is finally allocated.
10914 +
10915 + WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs,
10916 + this should not cause any file system corruption, it only degrades I/O
10917 + performance because a node may be written when it is sure to be written at
10918 + least one more time in the same transaction when the remaining children are
10919 + allocated. What follows is a description of how we will solve the problem.
10920 +*/
10921 +
10922 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node,
10923 + then, proceeding in parent-first order, allocate some of its left children,
10924 + then encounter a clean child in the middle of the parent. We do not allocate
10925 + the clean child, but there may remain unallocated (dirty) children to the
10926 + right of the clean child. If we were to stop flushing at this moment and
10927 + write everything to disk, the parent might still contain unallocated
10928 + children.
10929 +
10930 + We could try to allocate all the descendants of every node that we allocate,
10931 + but this is not necessary. Doing so could result in allocating the entire
10932 + tree: if the root node is allocated then every unallocated node would have to
10933 + be allocated before flushing. Actually, we do not have to write a node just
10934 + because we allocate it. It is possible to allocate but not write a node
10935 + during flush, when it still has unallocated children. However, this approach
10936 + is probably not optimal for the following reason.
10937 +
10938 + The flush algorithm is designed to allocate nodes in parent-first order in an
10939 + attempt to optimize reads that occur in the same order. Thus we are
10940 + read-optimizing for a left-to-right scan through all the leaves in the
10941 + system, and we are hoping to write-optimize at the same time because those
10942 + nodes will be written together in batch. What happens, however, if we assign
10943 + a block number to a node in its read-optimized order but then avoid writing
10944 + it because it has unallocated children? In that situation, we lose out on the
10945 + write-optimization aspect because a node will have to be written again to
10946 + its location on the device, later, which likely means seeking back to that
10947 + location.
10948 +
10949 + So there are tradeoffs. We can choose either:
10950 +
10951 + A. Allocate all unallocated children to preserve both write-optimization and
10952 + read-optimization, but this is not always desirable because it may mean
10953 + having to allocate and flush very many nodes at once.
10954 +
10955 + B. Defer writing nodes with unallocated children, keep their read-optimized
10956 + locations, but sacrifice write-optimization because those nodes will be
10957 + written again.
10958 +
10959 + C. Defer writing nodes with unallocated children, but do not keep their
10960 + read-optimized locations. Instead, choose to write-optimize them later, when
10961 + they are written. To facilitate this, we "undo" the read-optimized allocation
10962 + that was given to the node so that later it can be write-optimized, thus
10963 + "unpreparing" the flush decision. This is a case where we disturb the
10964 + FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a call to
10965 + flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10966 + if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate
10967 + its block location, and set the JNODE_CREATED bit, effectively setting the
10968 + node back to an unallocated state.
10969 +
10970 + We will take the following approach in v4.0: for twig nodes we will always
10971 + finish allocating unallocated children (A). For nodes with (level > TWIG)
10972 + we will defer writing and choose write-optimization (C).
10973 +
10974 + To summarize, there are several parts to a solution that avoids the problem
10975 + with unallocated children:
10976 +
10977 + FIXME-ZAM: Still, no approach has been implemented to eliminate the
10978 + "UNALLOCATED CHILDREN" problem, because an experiment showed that we have
10979 + only 1-2 nodes with unallocated children per thousands of written nodes.
10980 + The experiment was simple, like copying/deleting the linux kernel sources.
10981 + However, the problem can arise in more complex tests. I think we have
10982 + jnode_io_hook to insert a check for unallocated children and see what kind
10983 + of problem we have.
10984 +
10985 + 1. When flush reaches a stopping point (e.g. a clean node) it should continue
10986 + calling squeeze-and-allocate on any remaining unallocated children.
10987 + FIXME: Difficulty to implement: should be simple -- amounts to adding a while
10988 + loop to jnode_flush, see comments in that function.
10989 +
10990 + 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes
10991 + may still have unallocated children. If the twig level has unallocated
10992 + children it is an assertion failure. If a higher-level node has unallocated
10993 + children, then it should be explicitly de-allocated by a call to
10994 + flush_unprep().
10995 + FIXME: Difficulty to implement: should be simple.
10996 +
10997 + 3. (CPU-Optimization) Checking whether a node has unallocated children may
10998 + consume more CPU cycles than we would like, and it is possible (but medium
10999 + complexity) to optimize this somewhat in the case where large sub-trees are
11000 + flushed. The following observation helps: if both the left- and
11001 + right-neighbor of a node are processed by the flush algorithm then the node
11002 + itself is guaranteed to have all of its children allocated. However, the cost
11003 + of this check may not be so expensive after all: it is not needed for leaves
11004 + and flush can guarantee this property for twigs. That leaves only (level >
11005 + TWIG) nodes that have to be checked, so this optimization only helps if at
11006 + least three (level > TWIG) nodes are flushed in one pass, and the savings
11007 + will be very small unless there are many more (level > TWIG) nodes. But if
11008 + there are many (level > TWIG) nodes then the number of blocks being written
11009 + will be very large, so the savings may be insignificant. That said, the idea
11010 + is to maintain both the left and right edges of nodes that are processed in
11011 + flush. When flush_empty_queue() is called, a relatively simple test will
11012 + tell whether the (level > TWIG) node is on the edge. If it is on the edge,
11013 + the slow check is necessary, but if it is in the interior then it can be
11014 + assumed to have all of its children allocated. FIXME: medium complexity to
11015 + implement, but simple to verify given that we must have a slow check anyway.
11016 +
11017 + 4. (Optional) This part is optional, not for v4.0--flush should work
11018 + independently of whether this option is used or not. Called RAPID_SCAN, the
11019 + idea is to amend the left-scan operation to take unallocated children into
11020 + account. Normally, the left-scan operation goes left as long as adjacent
11021 + nodes are dirty up until some large maximum value (FLUSH_SCAN_MAXNODES) at
11022 + which point it stops and begins flushing. But scan-left may stop at a
11023 + position where there are unallocated children to the left with the same
11024 + parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops
11025 + after FLUSH_RELOCATE_THRESHOLD, which is much smaller than
11026 + FLUSH_SCAN_MAXNODES, then proceeds with a rapid scan. The rapid scan skips
11027 + all the interior children of a node--if the leftmost child of a twig is
11028 + dirty, check its left neighbor (the rightmost child of the twig to the left).
11029 + If the left neighbor of the leftmost child is also dirty, then continue the
11030 + scan at the left twig and repeat. This option will cause flush to allocate
11031 + more twigs in a single pass, but it also has the potential to write many more
11032 + nodes than would otherwise be written without the RAPID_SCAN option.
11033 + RAPID_SCAN was partially implemented, code removed August 12, 2002 by JMACD.
11034 +*/
11035 +
11036 +/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that
11037 + the starting point for flush is a leaf node, but actually the flush code
11038 + cares very little about whether or not this is true. It is possible that all
11039 + the leaf nodes are flushed and dirty parent nodes still remain, in which case
11040 + jnode_flush() is called on a non-leaf argument. Flush doesn't care--it treats
11041 + the argument node as if it were a leaf, even when it is not. This is a simple
11042 + approach, and there may be a more optimal policy but until a problem with
11043 + this approach is discovered, simplest is probably best.
11044 +
11045 + NOTE: In this case, the ordering produced by flush is parent-first only if
11046 + you ignore the leaves. This is done as a matter of simplicity and there is
11047 + only one (shaky) justification. When an atom commits, it flushes all leaf
11048 + level nodes first, followed by twigs, and so on. With flushing done in this
11049 + order, if flush is eventually called on a non-leaf node it means that
11050 + (somehow) we reached a point where all leaves are clean and only internal
11051 + nodes need to be flushed. If that is the case, then it means there were no
11052 + leaves that were the parent-first preceder/follower of the parent. This is
11053 + expected to be a rare case, which is why we do nothing special about it.
11054 + However, memory pressure may pass an internal node to flush when there are
11055 + still dirty leaf nodes that need to be flushed, which could prove our
11056 + original assumptions "inoperative". If this needs to be fixed, then
11057 + scan_left/right should have special checks for the non-leaf levels. For
11058 + example, instead of passing from a node to the left neighbor, it should pass
11059 + from the node to the left neighbor's rightmost descendant (if dirty).
11060 +
11061 +*/
11062 +
11063 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB
11064 + chunks, dirtying everything and putting it into a transaction. We tell the
11065 + allocator to allocate the blocks as far as possible towards one end of the
11066 + logical device--the left (starting) end of the device if we are walking from
11067 + left to right, the right end of the device if we are walking from right to
11068 + left. We then make passes in alternating directions, and as we do this the
11069 + device becomes sorted such that tree order and block number order fully
11070 + correlate.
11071 +
11072 + Resizing is done by shifting everything either all the way to the left or all
11073 + the way to the right, and then reporting the last block.
11074 +*/
11075 +
11076 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places.
11077 + This describes the policy from the highest level:
11078 +
11079 + The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive
11080 + nodes on the leaf level during flush-scan (right, left), then we
11081 + unconditionally decide to relocate leaf nodes.
11082 +
11083 + Otherwise, there are two contexts in which we make a decision to relocate:
11084 +
11085 + 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
11086 + During the initial stages of flush, after scan-right completes, we want to
11087 + ask the question: should we relocate this leaf node and thus dirty the parent
11088 + node. Then if the node is a leftmost child its parent is its own parent-first
11089 + preceder, thus we repeat the question at the next level up, and so on. In
11090 + these cases we are moving in the reverse-parent first direction.
11091 +
11092 + There is another case which is considered the reverse direction, which comes
11093 + at the end of a twig in reverse_relocate_end_of_twig(). As we finish
11094 + processing a twig we may reach a point where there is a clean twig to the
11095 + right with a dirty leftmost child. In this case, we may wish to relocate the
11096 + child by testing if it should be relocated relative to its parent.
11097 +
11098 + 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done
11099 + in allocate_znode. What distinguishes the forward parent-first case from the
11100 + reverse-parent first case is that the preceder has already been allocated in
11101 + the forward case, whereas in the reverse case we don't know what the preceder
11102 + is until we finish "going in reverse". That simplifies the forward case
11103 + considerably, and there we actually use the block allocator to determine
11104 + whether, e.g., a block closer to the preceder is available.
11105 +*/
11106 +
11107 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is,
11108 + once we finish scan-left and find a starting point, if the parent's left
11109 + neighbor is dirty then squeeze the parent's left neighbor and the parent.
11110 + This may change the flush-starting-node's parent. Repeat until the child's
11111 + parent is stable. If the child is a leftmost child, repeat this left-edge
11112 + squeezing operation at the next level up. Note that we cannot allocate
11113 + extents during this or they will be out of parent-first order. There are
11114 + also some difficult coordinate maintenance issues. We can't do a tree search to
11115 + find coordinates again (because we hold locks), we have to determine them
11116 + from the two nodes being squeezed. Looks difficult, but has potential to
11117 + increase space utilization. */
11118 +
11119 +/* Flush-scan helper functions. */
11120 +static void scan_init(flush_scan * scan);
11121 +static void scan_done(flush_scan * scan);
11122 +
11123 +/* Flush-scan algorithm. */
11124 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
11125 + unsigned limit);
11126 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
11127 +static int scan_common(flush_scan * scan, flush_scan * other);
11128 +static int scan_formatted(flush_scan * scan);
11129 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
11130 +static int scan_by_coord(flush_scan * scan);
11131 +
11132 +/* Initial flush-point ancestor allocation. */
11133 +static int alloc_pos_and_ancestors(flush_pos_t *pos);
11134 +static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos);
11135 +static int set_preceder(const coord_t *coord_in, flush_pos_t *pos);
11136 +
11137 +/* Main flush algorithm.
11138 + Note on abbreviation: "squeeze and allocate" == "squalloc". */
11139 +static int squalloc(flush_pos_t *pos);
11140 +
11141 +/* Flush squeeze implementation. */
11142 +static int squeeze_right_non_twig(znode * left, znode * right);
11143 +static int shift_one_internal_unit(znode * left, znode * right);
11144 +
11145 +/* Flush reverse parent-first relocation routines. */
11146 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11147 + const reiser4_block_nr * nblk);
11148 +static int reverse_relocate_test(jnode * node, const coord_t *parent_coord,
11149 + flush_pos_t *pos);
11150 +static int reverse_relocate_check_dirty_parent(jnode * node,
11151 + const coord_t *parent_coord,
11152 + flush_pos_t *pos);
11153 +
11154 +/* Flush allocate write-queueing functions: */
11155 +static int allocate_znode(znode * node, const coord_t *parent_coord,
11156 + flush_pos_t *pos);
11157 +static int allocate_znode_update(znode * node, const coord_t *parent_coord,
11158 + flush_pos_t *pos);
11159 +static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
11160 +
11161 +/* Flush helper functions: */
11162 +static int jnode_lock_parent_coord(jnode * node,
11163 + coord_t *coord,
11164 + lock_handle * parent_lh,
11165 + load_count * parent_zh,
11166 + znode_lock_mode mode, int try);
11167 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
11168 + znode_lock_mode mode, int check_dirty, int expected);
11169 +static int znode_same_parents(znode * a, znode * b);
11170 +
11171 +static int znode_check_flushprepped(znode * node)
11172 +{
11173 + return jnode_check_flushprepped(ZJNODE(node));
11174 +}
11175 +
11176 +/* Flush position functions */
11177 +static void pos_init(flush_pos_t *pos);
11178 +static int pos_valid(flush_pos_t *pos);
11179 +static void pos_done(flush_pos_t *pos);
11180 +static int pos_stop(flush_pos_t *pos);
11181 +
11182 +/* check that @org is the first jnode of an extent unit, if the extent is unallocated,
11183 + * because all jnodes of an unallocated extent are dirty and belong to the same atom. */
11184 +#define checkchild(scan) \
11185 +assert("nikita-3435", \
11186 + ergo(scan->direction == LEFT_SIDE && \
11187 + (scan->parent_coord.node->level == TWIG_LEVEL) && \
11188 + jnode_is_unformatted(scan->node) && \
11189 + extent_is_unallocated(&scan->parent_coord), \
11190 + extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
11191 +
11192 +/* This flush_cnt variable is used to track the number of concurrent flush
11193 + operations, useful for debugging. It is initialized in txnmgr.c out of
11194 + laziness (because flush has no static initializer function...) */
11195 +ON_DEBUG(atomic_t flush_cnt;
11196 + )
11197 +
11198 +/* check fs backing device for write congestion */
11199 +static int check_write_congestion(void)
11200 +{
11201 + struct super_block *sb;
11202 + struct backing_dev_info *bdi;
11203 +
11204 + sb = reiser4_get_current_sb();
11205 + bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
11206 + return bdi_write_congested(bdi);
11207 +}
11208 +
11209 +/* conditionally write flush queue */
11210 +static int write_prepped_nodes(flush_pos_t *pos)
11211 +{
11212 + int ret;
11213 +
11214 + assert("zam-831", pos);
11215 + assert("zam-832", pos->fq);
11216 +
11217 + if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
11218 + return 0;
11219 +
11220 + if (check_write_congestion())
11221 + return 0;
11222 +
11223 + ret = reiser4_write_fq(pos->fq, pos->nr_written,
11224 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11225 + return ret;
11226 +}
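+
+/* Note: both early returns above deliberately report success; skipping the
+ writeout is not an error, the prepped nodes simply stay in the flush queue
+ for a later reiser4_write_fq() call. */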
11227 +
11228 +/* Properly release all flush_pos resources, then move the flush position to
11229 + the new locked node */
11230 +static void move_flush_pos(flush_pos_t *pos, lock_handle * new_lock,
11231 + load_count * new_load, const coord_t *new_coord)
11232 +{
11233 + assert("zam-857", new_lock->node == new_load->node);
11234 +
11235 + if (new_coord) {
11236 + assert("zam-858", new_coord->node == new_lock->node);
11237 + coord_dup(&pos->coord, new_coord);
11238 + } else {
11239 + coord_init_first_unit(&pos->coord, new_lock->node);
11240 + }
11241 +
11242 + if (pos->child) {
11243 + jput(pos->child);
11244 + pos->child = NULL;
11245 + }
11246 +
11247 + move_load_count(&pos->load, new_load);
11248 + done_lh(&pos->lock);
11249 + move_lh(&pos->lock, new_lock);
11250 +}
11251 +
11252 +/* delete an empty node whose link from the parent still exists. */
11253 +static int delete_empty_node(znode * node)
11254 +{
11255 + reiser4_key smallest_removed;
11256 +
11257 + assert("zam-1019", node != NULL);
11258 + assert("zam-1020", node_is_empty(node));
11259 + assert("zam-1023", znode_is_wlocked(node));
11260 +
11261 + return reiser4_delete_node(node, &smallest_removed, NULL, 1);
11262 +}
11263 +
11264 +/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
11265 +static int prepare_flush_pos(flush_pos_t *pos, jnode * org)
11266 +{
11267 + int ret;
11268 + load_count load;
11269 + lock_handle lock;
11270 +
11271 + init_lh(&lock);
11272 + init_load_count(&load);
11273 +
11274 + if (jnode_is_znode(org)) {
11275 + ret = longterm_lock_znode(&lock, JZNODE(org),
11276 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
11277 + if (ret)
11278 + return ret;
11279 +
11280 + ret = incr_load_count_znode(&load, JZNODE(org));
11281 + if (ret)
11282 + return ret;
11283 +
11284 + pos->state =
11285 + (jnode_get_level(org) ==
11286 + LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11287 + move_flush_pos(pos, &lock, &load, NULL);
11288 + } else {
11289 + coord_t parent_coord;
11290 + ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11291 + &load, ZNODE_WRITE_LOCK, 0);
11292 + if (ret)
11293 + goto done;
11294 + if (!item_is_extent(&parent_coord)) {
11295 + /* the file was converted to tail form, @org became HB
11296 + (heard banshee), and we found an internal item */
11297 + ret = -EAGAIN;
11298 + goto done;
11299 + }
11300 +
11301 + pos->state = POS_ON_EPOINT;
11302 + move_flush_pos(pos, &lock, &load, &parent_coord);
11303 + pos->child = jref(org);
11304 + if (extent_is_unallocated(&parent_coord)
11305 + && extent_unit_index(&parent_coord) != index_jnode(org)) {
11306 + /* @org is not the first child of its parent unit. This may
11307 + happen because the long term lock on its parent node was
11308 + released between scan_left and scan_right. For now,
11309 + work around this by having flush repeat */
11310 + ret = -EAGAIN;
11311 + }
11312 + }
11313 +
11314 +done:
11315 + done_load_count(&load);
11316 + done_lh(&lock);
11317 + return ret;
11318 +}
11319 +
11320 +/* TODO LIST (no particular order): */
11321 +/* I have labelled most of the legitimate FIXME comments in this file with
11322 + letters to indicate which issue they relate to. There are a few miscellaneous
11323 + FIXMEs with specific names mentioned instead that need to be
11324 + inspected/resolved. */
11325 +/* B. There is an issue described in reverse_relocate_test having to do with an
11326 + imprecise is_preceder? check on partially-dirty extents. The
11327 + code that sets preceder hints and computes the preceder is basically
11328 + untested. Careful testing is needed to verify that preceder calculations
11329 + are done correctly, since if they only affect placement and not correctness
11330 + we will not catch the problem during regular testing. */
11331 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of
11332 + these are considered expected but unlikely conditions. Flush currently
11333 + returns 0 (i.e., success but no progress, i.e., restart) whenever it receives
11334 + any of these in jnode_flush(). Many of the calls that may produce one of
11335 + these return values (i.e., longterm_lock_znode, reiser4_get_parent,
11336 + reiser4_get_neighbor, ...) check some of these values themselves and, for
11337 + instance, stop flushing instead of resulting in a restart. If any of these
11338 + results are true error conditions then flush will go into a busy-loop, as we
11339 + noticed during testing when a corrupt tree caused find_child_ptr to return
11340 + ENOENT. It needs careful thought and testing of corner conditions.
11341 +*/
11342 +/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a
11343 + created block is assigned a block number then early-flushed to disk. It is
11344 + dirtied again and flush is called again. Concurrently, that block is deleted,
11345 + and the de-allocation of its block number does not need to be deferred, since
11346 + it is not part of the preserve set (i.e., it didn't exist before the
11347 + transaction). I think there may be a race condition where flush writes the
11348 + dirty, created block after the non-deferred deallocated block number is
11349 + re-allocated, making it possible to write deleted data on top of non-deleted
11350 + data. It's just a theory, but it needs to be thought out. */
11351 +/* F. bio_alloc() failure is not handled gracefully. */
11352 +/* G. Unallocated children. */
11353 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered
11354 + blocks. */
11355 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11356 +
11357 +/* JNODE_FLUSH: MAIN ENTRY POINT */
11358 +/* This is the main entry point for flushing a jnode and its dirty neighborhood
11359 + (a dirty neighborhood is called a "slum"). jnode_flush() is called when
11360 + reiser4 has to write dirty blocks to disk, which happens when the Linux VM
11361 + decides to reduce the number of dirty pages or as part of a transaction commit.
11362 +
11363 + Our objective here is to prep and flush the slum the jnode belongs to. We
11364 + want to squish the slum together, and allocate the nodes in it as we squish
11365 + because allocation of children affects squishing of parents.
11366 +
11367 + The "argument" @node tells flush where to start. From there, flush finds the
11368 + left edge of the slum, and calls squalloc (in which nodes are squeezed and
11369 + allocated). To find a "better place" to start squalloc first we perform a
11370 + flush_scan.
11371 +
11372 + Flush-scanning may be performed in both left and right directions, but for
11373 + different purposes. When scanning to the left, we are searching for a node
11374 + that precedes a sequence of parent-first-ordered nodes which we will then
11375 + flush in parent-first order. During flush-scanning, we also take the
11376 + opportunity to count the number of consecutive leaf nodes. If this number is
11377 + past some threshold (FLUSH_RELOCATE_THRESHOLD), then we make a decision to
11378 + reallocate leaf nodes (thus favoring write-optimization).
11379 +
11380 + Since the flush argument node can be anywhere in a sequence of dirty leaves,
11381 + there may also be dirty nodes to the right of the argument. If the scan-left
11382 + operation does not count at least FLUSH_RELOCATE_THRESHOLD nodes then we
11383 + follow it with a right-scan operation to see whether there are, in fact,
11384 + enough nodes to meet the relocate threshold. Each right- and left-scan
11385 + operation uses a single flush_scan object.
11386 +
11387 + After left-scan and possibly right-scan, we prepare a flush_position object
11388 + with the starting flush point or parent coordinate, which was determined
11389 + using scan-left.
11390 +
11391 + Next we call the main flush routine, squalloc, which iterates along the leaf
11392 + level, squeezing and allocating nodes (and placing them into the flush
11393 + queue).
11394 +
11395 + After squalloc returns we take extra steps to ensure that all the children
11396 + of the final twig node are allocated--this involves repeating squalloc
11397 + until we finish at a twig with no unallocated children.
11398 +
11399 + Finally, we call flush_empty_queue to submit write-requests to disk. If we
11400 + encounter any above-twig nodes during flush_empty_queue that still have
11401 + unallocated children, we flush_unprep them.
11402 +
11403 + Flush treats several "failure" cases as non-failures, essentially causing
11404 + it to start over. E_DEADLOCK is one example.
11405 + FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should probably be handled
11406 + properly rather than restarting, but there are a bunch of cases to audit.
11407 +*/
11408 +
11409 +static int
11410 +jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11411 + flush_queue_t *fq, int flags)
11412 +{
11413 + long ret = 0;
11414 + flush_scan *right_scan;
11415 + flush_scan *left_scan;
11416 + flush_pos_t *flush_pos;
11417 + int todo;
11418 + struct super_block *sb;
11419 + reiser4_super_info_data *sbinfo;
11420 + jnode *leftmost_in_slum = NULL;
11421 +
11422 + assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11423 + assert("nikita-3022", reiser4_schedulable());
11424 +
11425 + assert("nikita-3185",
11426 + get_current_super_private()->delete_mutex_owner != current);
11427 +
11428 + /* allocate right_scan, left_scan and flush_pos */
11429 + right_scan =
11430 + kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11431 + reiser4_ctx_gfp_mask_get());
11432 + if (right_scan == NULL)
11433 + return RETERR(-ENOMEM);
11434 + left_scan = right_scan + 1;
11435 + flush_pos = (flush_pos_t *) (left_scan + 1);
11436 +
11437 + sb = reiser4_get_current_sb();
11438 + sbinfo = get_super_private(sb);
11439 +
11440 + /* Flush-concurrency debug code */
11441 +#if REISER4_DEBUG
11442 + atomic_inc(&flush_cnt);
11443 +#endif
11444 +
11445 + reiser4_enter_flush(sb);
11446 +
11447 + /* Initialize a flush position. */
11448 + pos_init(flush_pos);
11449 +
11450 + flush_pos->nr_written = nr_written;
11451 + flush_pos->fq = fq;
11452 + flush_pos->flags = flags;
11453 + flush_pos->nr_to_write = nr_to_write;
11454 +
11455 + scan_init(right_scan);
11456 + scan_init(left_scan);
11457 +
11458 + /* First scan left and remember the leftmost scan position. If the
11459 + leftmost position is unformatted we remember its parent_coord. We
11460 + scan until we have counted FLUSH_SCAN_MAXNODES nodes.
11461 +
11462 + If the starting @node is unformatted, then at the beginning of the left
11463 + scan its parent (a twig level node containing the extent item) will be
11464 + long term locked and the lock handle will be stored in
11465 + @right_scan->parent_lock. This lock is used to start the rightward
11466 + scan without redoing the tree traversal (necessary to find the parent)
11467 + and, hence, is kept during the leftward scan. As a result, we have to
11468 + use try-lock when taking long term locks during the leftward scan.
11469 + */
11470 + ret = scan_left(left_scan, right_scan,
11471 + node, sbinfo->flush.scan_maxnodes);
11472 + if (ret != 0)
11473 + goto failed;
11474 +
11475 + leftmost_in_slum = jref(left_scan->node);
11476 + scan_done(left_scan);
11477 +
11478 + /* Then possibly go right to decide if we will use a policy of
11479 + relocating leaves. This is only done if we did not scan past (and
11480 + count) enough nodes during the leftward scan. If we do scan right,
11481 + we only care to go far enough to establish that at least
11482 + FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The scan
11483 + limit is the difference between left_scan.count and the threshold. */
11484 +
11485 + todo = sbinfo->flush.relocate_threshold - left_scan->count;
11486 + /* scan right is inherently deadlock prone, because we are
11487 + * (potentially) holding a lock on the twig node at this moment.
11488 + * FIXME: this comment is incorrect: the lock is not held */
11489 + if (todo > 0) {
11490 + ret = scan_right(right_scan, node, (unsigned)todo);
11491 + if (ret != 0)
11492 + goto failed;
11493 + }
11494 +
11495 + /* Only the right-scan count is needed, release any rightward locks
11496 + right away. */
11497 + scan_done(right_scan);
11498 +
11499 + /* ... and the answer is: we should relocate leaf nodes if at least
11500 + FLUSH_RELOCATE_THRESHOLD nodes were found. */
11501 + flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11502 + (left_scan->count + right_scan->count >=
11503 + sbinfo->flush.relocate_threshold);
11504 +
11505 + /* Funny business here. We set the 'point' in the flush_position
11506 + prior to starting squalloc regardless of whether the first point is
11507 + formatted or unformatted. Without this there would be an invariant,
11508 + in the rest of the code, that if the flush_position is unformatted
11509 + then flush_position->point is NULL and
11510 + flush_position->parent_{lock,coord} is set, and if the flush_position
11511 + is formatted then flush_position->point is non-NULL and no parent
11512 + info is set.
11513 +
11514 + This seems lazy, but it makes the initial calls to
11515 + reverse_relocate_test (which asks "is pos->point the leftmost
11516 + child of its parent?") much easier because we know the first child
11517 + already. Nothing is broken by this, but the reasoning is subtle.
11518 + Holding an extra reference on a jnode during flush can cause us to
11519 + see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11520 + removed from sibling lists until they have zero reference count.
11521 + Flush would never observe a HEARD_BANSHEE node on the left-edge of
11522 + flush; nodes are only deleted to the right. So if nothing is broken,
11523 + why fix it?
11524 +
11525 + NOTE-NIKITA actually, flush can meet a HEARD_BANSHEE node at any
11526 + point and at any moment, because of concurrent file system
11527 + activity (for example, truncate). */
11528 +
11529 + /* Check jnode state after flush_scan completed. Having a lock on this
11530 + node or its parent (in case of unformatted) helps us in case of
11531 + concurrent flushing. */
11532 + if (jnode_check_flushprepped(leftmost_in_slum)
11533 + && !jnode_convertible(leftmost_in_slum)) {
11534 + ret = 0;
11535 + goto failed;
11536 + }
11537 +
11538 + /* Now setup flush_pos using scan_left's endpoint. */
11539 + ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11540 + if (ret)
11541 + goto failed;
11542 +
11543 + if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11544 + && node_is_empty(flush_pos->coord.node)) {
11545 + znode *empty = flush_pos->coord.node;
11546 +
11547 + assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11548 + ret = delete_empty_node(empty);
11549 + goto failed;
11550 + }
11551 +
11552 + if (jnode_check_flushprepped(leftmost_in_slum)
11553 + && !jnode_convertible(leftmost_in_slum)) {
11554 + ret = 0;
11555 + goto failed;
11556 + }
11557 +
11558 + /* Set pos->preceder and (re)allocate pos and its ancestors if it is
11559 + needed */
11560 + ret = alloc_pos_and_ancestors(flush_pos);
11561 + if (ret)
11562 + goto failed;
11563 +
11564 + /* Do the main rightward-bottom-up squeeze and allocate loop. */
11565 + ret = squalloc(flush_pos);
11566 + pos_stop(flush_pos);
11567 + if (ret)
11568 + goto failed;
11569 +
11570 + /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated
11571 + children. First, the pos_stop() and pos_valid() routines should be
11572 + modified so that pos_stop() sets a flush_position->stop flag to 1
11573 + without releasing the current position immediately--instead release
11574 + it in pos_done(). This is a better implementation than the current
11575 + one anyway.
11576 +
11577 + It is not clear whether all fields of the flush_position need to be
11578 + retained, but at the very least the parent_lock, parent_coord, and
11579 + parent_load should remain held because they hold the last twig
11580 + when pos_stop() is called.
11581 +
11582 + When we reach this point in the code, if the parent_coord is set to
11583 + after the last item then we know that flush reached the end of a twig
11584 + (and according to the new flush queueing design, we will return now).
11585 + If parent_coord is not past the last item, we should check if the
11586 + current twig has any unallocated children to the right (we are not
11587 + concerned with unallocated children to the left--in that case the
11588 + twig itself should not have been allocated). If the twig has
11589 + unallocated children to the right, set the parent_coord to that
11590 + position and then repeat the call to squalloc.
11591 +
11592 + Testing for unallocated children may be defined in two ways: if any
11593 + internal item has a fake block number, it is unallocated; if any
11594 + extent item is unallocated then all of its children are unallocated.
11595 + But there is a more aggressive approach: if there are any dirty
11596 + children of the twig to the right of the current position, we may
11597 + wish to relocate those nodes now. Checking for potential relocation
11598 + is more expensive as it requires knowing whether there are any dirty
11599 + children that are not unallocated. The extent_needs_allocation should
11600 + be used after setting the correct preceder.
11601 +
11602 + When we reach the end of a twig at this point in the code, if the
11603 + flush can continue (when the queue is ready) it will need some
11604 + information on the future starting point. That should be stored away
11605 + in the flush_handle using a seal, I believe. Holding a jref() on the
11606 + future starting point may break other code that deletes that node.
11607 + */
11608 +
11609 + /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is
11610 + called above the twig level. If the VM calls flush above the twig
11611 + level, do nothing and return (but figure out why this happens). The
11612 + txnmgr should be modified to only flush its leaf-level dirty list.
11613 + This will do all the necessary squeeze and allocate steps but leave
11614 + unallocated branches and possibly unallocated twigs (when the twig's
11615 + leftmost child is not dirty). After flushing the leaf level, the
11616 + remaining unallocated nodes should be given write-optimized
11617 + locations. (Possibly, the remaining unallocated twigs should be
11618 + allocated just before their leftmost child.)
11619 + */
11620 +
11621 + /* Any failure reaches this point. */
11622 +failed:
11623 +
11624 + switch (ret) {
11625 + case -E_REPEAT:
11626 + case -EINVAL:
11627 + case -E_DEADLOCK:
11628 + case -E_NO_NEIGHBOR:
11629 + case -ENOENT:
11630 + /* FIXME(C): Except for E_DEADLOCK, these should probably be
11631 + handled properly in each case. They already are handled in
11632 + many cases. */
11633 + /* Something bad happened, but difficult to avoid... Try again!
11634 + */
11635 + ret = 0;
11636 + }
11637 +
11638 + if (leftmost_in_slum)
11639 + jput(leftmost_in_slum);
11640 +
11641 + pos_done(flush_pos);
11642 + scan_done(left_scan);
11643 + scan_done(right_scan);
11644 + kfree(right_scan);
11645 +
11646 + ON_DEBUG(atomic_dec(&flush_cnt));
11647 +
11648 + reiser4_leave_flush(sb);
11649 +
11650 + return ret;
11651 +}
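+
+/* Illustrative sketch (an editor's aid, not called anywhere): ignoring the
+ JNODE_REPACK override, the leaf_relocate decision computed inside
+ jnode_flush() above reduces to a pure predicate on the two scan counts and
+ the tunable flush.relocate_threshold (default 64, see
+ Documentation/filesystems/reiser4.txt). The helper name is invented. */
+static inline int slum_meets_relocate_threshold(unsigned left_count,
+ unsigned right_count,
+ unsigned threshold)
+{
+ /* e.g. left_count = 40, right_count = 30, threshold = 64:
+ 40 + 30 = 70 >= 64, so leaves are relocated (write-optimization) */
+ return left_count + right_count >= threshold;
+}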
11652 +
11653 +/* The reiser4 flush subsystem can be put into "rapid flush mode", which means
11654 + * that the flusher should submit all prepped nodes immediately, without keeping
11655 + * them in flush queues for a long time. The reason for rapid flush mode is to free
11656 + * memory as fast as possible. */
11657 +
11658 +#if REISER4_USE_RAPID_FLUSH
11659 +
11660 +/**
11661 + * submit all prepped nodes if rapid flush mode is set,
11662 + * turn rapid flush mode off.
11663 + */
11664 +
11665 +static int rapid_flush(flush_pos_t *pos)
11666 +{
11667 + if (!wbq_available())
11668 + return 0;
11669 +
11670 + return write_prepped_nodes(pos);
11671 +}
11672 +
11673 +#else
11674 +
11675 +#define rapid_flush(pos) (0)
11676 +
11677 +#endif /* REISER4_USE_RAPID_FLUSH */
11678 +
11679 +static jnode *find_flush_start_jnode(jnode *start, txn_atom * atom,
11680 + flush_queue_t *fq, int *nr_queued,
11681 + int flags)
11682 +{
11683 + jnode * node;
11684 +
11685 + if (start != NULL) {
11686 + spin_lock_jnode(start);
11687 + if (!jnode_is_flushprepped(start)) {
11688 + assert("zam-1056", start->atom == atom);
11689 + node = start;
11690 + goto enter;
11691 + }
11692 + spin_unlock_jnode(start);
11693 + }
11694 + /*
11695 + * In this loop we process all nodes that are already prepped (RELOC or
11696 + * OVRWR) but have been dirtied again. The atom spin lock is not released
11697 + * until all dirty nodes are processed or a not-yet-prepped node is found
11698 + * in the atom's dirty lists.
11699 + */
11700 + while ((node = find_first_dirty_jnode(atom, flags))) {
11701 + spin_lock_jnode(node);
11702 +enter:
11703 + assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11704 + assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11705 +
11706 + if (JF_ISSET(node, JNODE_WRITEBACK)) {
11707 + /* move node to the end of atom's writeback list */
11708 + list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11709 +
11710 + /*
11711 + * the jnode is not necessarily on the dirty list: if it
11712 + * was dirtied while it was on the flush queue, it does
11713 + * not get moved to the dirty list
11714 + */
11715 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11716 + WB_LIST, 1));
11717 +
11718 + } else if (jnode_is_znode(node)
11719 + && znode_above_root(JZNODE(node))) {
11720 + /*
11721 + * A special case for znode-above-root. The above-root
11722 + * (fake) znode is captured and dirtied when the tree
11723 + * height changes or when the root node is relocated.
11724 + * This causes atoms to fuse so that changes at the root
11725 + * are serialized. However, this node is never flushed.
11726 + * This special case used to be in lock.c to prevent the
11727 + * above-root node from ever being captured, but now
11728 + * that it is captured we simply prevent it from
11729 + * flushing. The log-writer code relies on this to
11730 + * properly log superblock modifications of the tree
11731 + * height.
11732 + */
11733 + jnode_make_wander_nolock(node);
11734 + } else if (JF_ISSET(node, JNODE_RELOC)) {
11735 + queue_jnode(fq, node);
11736 + ++(*nr_queued);
11737 + } else
11738 + break;
11739 +
11740 + spin_unlock_jnode(node);
11741 + }
11742 + return node;
11743 +}
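+
+/* Note on the loop above: the enter: label lets an explicitly passed @start
+ node share the per-node classification (writeback, above-root, RELOC,
+ or not-prepped) with nodes taken from the atom's dirty lists, so both entry
+ paths run through the same chain of tests. */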
11744 +
11745 +/* Flush some nodes of the current atom, usually a slum; return -E_REPEAT if
11746 + * there are more nodes to flush, return 0 (keeping the current atom locked)
11747 + * if the atom's dirty lists are empty, and return other errors as they are. */
11748 +int
11749 +flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11750 + txn_atom ** atom, jnode *start)
11751 +{
11752 + reiser4_super_info_data *sinfo = get_current_super_private();
11753 + flush_queue_t *fq = NULL;
11754 + jnode *node;
11755 + int nr_queued;
11756 + int ret;
11757 +
11758 + assert("zam-889", atom != NULL && *atom != NULL);
11759 + assert_spin_locked(&((*atom)->alock));
11760 + assert("zam-892", get_current_context()->trans->atom == *atom);
11761 +
11762 + nr_to_write = LONG_MAX;
11763 + while (1) {
11764 + ret = reiser4_fq_by_atom(*atom, &fq);
11765 + if (ret != -E_REPEAT)
11766 + break;
11767 + *atom = get_current_atom_locked();
11768 + }
11769 + if (ret)
11770 + return ret;
11771 +
11772 + assert_spin_locked(&((*atom)->alock));
11773 +
11774 + /* parallel flushers limit */
11775 + if (sinfo->tmgr.atom_max_flushers != 0) {
11776 + while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11777 + /* A reiser4_atom_send_event() call is inside
11778 + reiser4_fq_put_nolock() which is called when flush is
11779 + finished and nr_flushers is decremented. */
11780 + reiser4_atom_wait_event(*atom);
11781 + *atom = get_current_atom_locked();
11782 + }
11783 + }
11784 +
11785 + /* count ourself as a flusher */
11786 + (*atom)->nr_flushers++;
11787 +
11788 + writeout_mode_enable();
11789 +
11790 + nr_queued = 0;
11791 + node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11792 +
11793 + if (node == NULL) {
11794 + if (nr_queued == 0) {
11795 + (*atom)->nr_flushers--;
11796 + reiser4_fq_put_nolock(fq);
11797 + reiser4_atom_send_event(*atom);
11798 + /* current atom remains locked */
11799 + writeout_mode_disable();
11800 + return 0;
11801 + }
11802 + spin_unlock_atom(*atom);
11803 + } else {
11804 + jref(node);
11805 + BUG_ON((*atom)->super != node->tree->super);
11806 + spin_unlock_atom(*atom);
11807 + spin_unlock_jnode(node);
11808 + BUG_ON(nr_to_write == 0);
11809 + ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11810 + jput(node);
11811 + }
11812 +
11813 + ret =
11814 + reiser4_write_fq(fq, nr_submitted,
11815 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11816 +
11817 + *atom = get_current_atom_locked();
11818 + (*atom)->nr_flushers--;
11819 + reiser4_fq_put_nolock(fq);
11820 + reiser4_atom_send_event(*atom);
11821 + spin_unlock_atom(*atom);
11822 +
11823 + writeout_mode_disable();
11824 +
11825 + if (ret == 0)
11826 + ret = -E_REPEAT;
11827 +
11828 + return ret;
11829 +}
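+
+/* Illustrative caller sketch (editor's aid, not part of the original code):
+ per the comment above flush_current_atom(), a caller loops while -E_REPEAT
+ is returned. The helper name is invented; re-locking the atom between
+ iterations via get_current_atom_locked() is an assumption based on the
+ locking asserts at the top of flush_current_atom(). */
+static inline int flush_atom_until_done(int flags, long *nr_submitted,
+ txn_atom **atom)
+{
+ int ret;
+
+ do {
+ ret = flush_current_atom(flags, LONG_MAX, nr_submitted,
+ atom, NULL);
+ if (ret == -E_REPEAT)
+ /* the atom was unlocked on this path; re-acquire */
+ *atom = get_current_atom_locked();
+ } while (ret == -E_REPEAT);
+ return ret;
+}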
11830 +
11831 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11832 +
11833 +/* This implements the is-it-close-enough-to-its-preceder? test for relocation
11834 + in the reverse parent-first relocate context. Here all we know is the
11835 + preceder and the block number. Since we are going in reverse, the preceder
11836 + may still be relocated as well, so we can't ask the block allocator "is there
11837 + a closer block available to relocate?" here. In the _forward_ parent-first
11838 + relocate context (not here) we actually call the block allocator to try and
11839 + find a closer location. */
11840 +static int
11841 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11842 + const reiser4_block_nr * nblk)
11843 +{
11844 + reiser4_block_nr dist;
11845 +
11846 + assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11847 + assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
11848 + assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
11849 +
11850 + /* Distance is the absolute value. */
11851 + dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11852 +
11853 + /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from
11854 + its preceder block, do not relocate. */
11855 + if (dist <= get_current_super_private()->flush.relocate_distance)
11856 + return 0;
11857 +
11858 + return 1;
11859 +}
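+
+/* Worked example (editor's aid; block numbers are invented and the default
+ flush.relocate_distance of 64 from Documentation/filesystems/reiser4.txt
+ is assumed): */
+static inline int reverse_relocate_distance_example(void)
+{
+ reiser4_block_nr pblk = 1000;
+ reiser4_block_nr near_blk = 1040; /* dist 40 <= 64 -> 0: leave in place */
+ reiser4_block_nr far_blk = 2000; /* dist 1000 > 64 -> 1: relocate */
+
+ return reverse_relocate_if_close_enough(&pblk, &near_blk) == 0 &&
+ reverse_relocate_if_close_enough(&pblk, &far_blk) == 1;
+}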
11860 +
11861 +/* This function is a predicate that tests for relocation. Always called in the
11862 + reverse-parent-first context, when we are asking whether the current node
11863 + should be relocated in order to expand the flush by dirtying the parent level
11864 + (and thus proceeding to flush that level). When traversing in the forward
11865 + parent-first direction (not here), relocation decisions are handled in two
11866 + places: allocate_znode() and extent_needs_allocation(). */
11867 +static int
11868 +reverse_relocate_test(jnode * node, const coord_t *parent_coord,
11869 + flush_pos_t *pos)
11870 +{
11871 + reiser4_block_nr pblk = 0;
11872 + reiser4_block_nr nblk = 0;
11873 +
11874 + assert("jmacd-8989", !jnode_is_root(node));
11875 +
11876 + /*
11877 + * This function is called only from the
11878 + * reverse_relocate_check_dirty_parent() and only if the parent
11879 + * node is clean. This implies that the parent has the real (i.e., not
11880 + * fake) block number, and so does the child, because otherwise the
11881 + * parent would be dirty.
11882 + */
11883 +
11884 + /* New nodes are treated as if they are being relocated. */
11885 + if (JF_ISSET(node, JNODE_CREATED) ||
11886 + (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL))
11887 + return 1;
11888 +
11889 + /* Find the preceder. FIXME(B): When the child is an unformatted,
11890 + previously existing node, the coord may be leftmost even though the
11891 + child is not the parent-first preceder of the parent. If the first
11892 + dirty node appears somewhere in the middle of the first extent unit,
11893 + this preceder calculation is wrong.
11894 + Needs more logic in here. */
11895 + if (coord_is_leftmost_unit(parent_coord)) {
11896 + pblk = *znode_get_block(parent_coord->node);
11897 + } else {
11898 + pblk = pos->preceder.blk;
11899 + }
11900 + check_preceder(pblk);
11901 +
11902 + /* If (pblk == 0) then the preceder isn't allocated or isn't known:
11903 + relocate. */
11904 + if (pblk == 0)
11905 + return 1;
11906 +
11907 + nblk = *jnode_get_block(node);
11908 +
11909 + if (reiser4_blocknr_is_fake(&nblk))
11910 + /* child is unallocated, mark parent dirty */
11911 + return 1;
11912 +
11913 + return reverse_relocate_if_close_enough(&pblk, &nblk);
11914 +}
11915 +
11916 +/* This function calls reverse_relocate_test to make a reverse-parent-first
11917 + relocation decision and then, if yes, it marks the parent dirty. */
11918 +static int
11919 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t *parent_coord,
11920 + flush_pos_t *pos)
11921 +{
11922 + int ret;
11923 +
11924 + if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11925 +
11926 + ret = reverse_relocate_test(node, parent_coord, pos);
11927 + if (ret < 0)
11928 + return ret;
11929 +
11930 + /* FIXME-ZAM
11931 + if parent is already relocated - we do not want to grab space,
11932 + right? */
11933 + if (ret == 1) {
11934 + int grabbed;
11935 +
11936 + grabbed = get_current_context()->grabbed_blocks;
11937 + if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11938 + 0)
11939 + reiser4_panic("umka-1250",
11940 + "No space left during flush.");
11941 +
11942 + assert("jmacd-18923",
11943 + znode_is_write_locked(parent_coord->node));
11944 + znode_make_dirty(parent_coord->node);
11945 + grabbed2free_mark(grabbed);
11946 + }
11947 + }
11948 +
11949 + return 0;
11950 +}
11951 +
11952 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE
11953 + FORWARD PARENT-FIRST LOOP BEGINS) */
11954 +
11955 +/* Get the leftmost child for given coord. */
11956 +static int get_leftmost_child_of_unit(const coord_t *coord, jnode ** child)
11957 +{
11958 + int ret;
11959 +
11960 + ret = item_utmost_child(coord, LEFT_SIDE, child);
11961 +
11962 + if (ret)
11963 + return ret;
11964 +
11965 + if (IS_ERR(*child))
11966 + return PTR_ERR(*child);
11967 +
11968 + return 0;
11969 +}
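+
+/* Note on the wrapper above: item_utmost_child() can report failure in two
+ ways, a nonzero return value or an ERR_PTR() stored in *child, so both are
+ folded into a single error return here. A zero return with *child == NULL
+ means the child is simply not in memory (see
+ leftmost_child_of_unit_check_flushprepped() below). */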
11970 +
11971 +/* This step occurs after the left- and right-scans are completed, before
11972 + starting the forward parent-first traversal. Here we attempt to allocate
11973 + ancestors of the starting flush point, which means continuing in the reverse
11974 + parent-first direction to the parent, grandparent, and so on (as long as the
11975 + child is a leftmost child). This routine calls a recursive process,
11976 + alloc_one_ancestor, which does the real work, except there is special-case
11977 + handling here for the first ancestor, which may be a twig. At each level
11978 + (here and alloc_one_ancestor), we check for relocation and then, if the child
11979 + is a leftmost child, repeat at the next level. On the way back down (the
11980 + recursion), we allocate the ancestors in parent-first order. */
11981 +static int alloc_pos_and_ancestors(flush_pos_t *pos)
11982 +{
11983 + int ret = 0;
11984 + lock_handle plock;
11985 + load_count pload;
11986 + coord_t pcoord;
11987 +
11988 + if (znode_check_flushprepped(pos->lock.node))
11989 + return 0;
11990 +
11991 + coord_init_invalid(&pcoord, NULL);
11992 + init_lh(&plock);
11993 + init_load_count(&pload);
11994 +
11995 + if (pos->state == POS_ON_EPOINT) {
11996 + /* a special case for pos on twig level, where we already have
11997 + a lock on parent node. */
11998 + /* The parent may not be dirty, in which case we should decide
11999 + whether to relocate the child now. If the decision is made to
12000 + relocate the child, the parent is marked dirty. */
12001 + ret =
12002 + reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
12003 + pos);
12004 + if (ret)
12005 + goto exit;
12006 +
12007 + /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
12008 + is leftmost) and the leaf/child, so recursion is not needed.
12009 + Levels above the twig will be allocated for
12010 + write-optimization before the transaction commits. */
12011 +
12012 + /* Do the recursive step, allocating zero or more of our
12013 + * ancestors. */
12014 + ret = alloc_one_ancestor(&pos->coord, pos);
12015 +
12016 + } else {
12017 + if (!znode_is_root(pos->lock.node)) {
12018 + /* all formatted nodes except tree root */
12019 + ret =
12020 + reiser4_get_parent(&plock, pos->lock.node,
12021 + ZNODE_WRITE_LOCK);
12022 + if (ret)
12023 + goto exit;
12024 +
12025 + ret = incr_load_count_znode(&pload, plock.node);
12026 + if (ret)
12027 + goto exit;
12028 +
12029 + ret =
12030 + find_child_ptr(plock.node, pos->lock.node, &pcoord);
12031 + if (ret)
12032 + goto exit;
12033 +
12034 + ret =
12035 + reverse_relocate_check_dirty_parent(ZJNODE
12036 + (pos->lock.
12037 + node), &pcoord,
12038 + pos);
12039 + if (ret)
12040 + goto exit;
12041 +
12042 + ret = alloc_one_ancestor(&pcoord, pos);
12043 + if (ret)
12044 + goto exit;
12045 + }
12046 +
12047 + ret = allocate_znode(pos->lock.node, &pcoord, pos);
12048 + }
12049 +exit:
12050 + done_load_count(&pload);
12051 + done_lh(&plock);
12052 + return ret;
12053 +}
12054 +
12055 +/* This is the recursive step described in alloc_pos_and_ancestors, above.
12056 + Ignoring the call to set_preceder, which is the next function described, this
12057 + checks if the child is a leftmost child and returns if it is not. If the
12058 + child is a leftmost child it checks for relocation, possibly dirtying the
12059 + parent. Then it performs the recursive step. */
12060 +static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos)
12061 +{
12062 + int ret = 0;
12063 + lock_handle alock;
12064 + load_count aload;
12065 + coord_t acoord;
12066 +
12067 + /* As we ascend at the left-edge of the region to flush, take this
12068 + opportunity at the twig level to find our parent-first preceder
12069 + unless we have already set it. */
12070 + if (pos->preceder.blk == 0) {
12071 + ret = set_preceder(coord, pos);
12072 + if (ret != 0)
12073 + return ret;
12074 + }
12075 +
12076 + /* If the ancestor is clean or already allocated, or if the child is not
12077 + a leftmost child, stop going up, even leaving coord->node not
12078 + flushprepped. */
12079 + if (znode_check_flushprepped(coord->node)
12080 + || !coord_is_leftmost_unit(coord))
12081 + return 0;
12082 +
12083 + init_lh(&alock);
12084 + init_load_count(&aload);
12085 + coord_init_invalid(&acoord, NULL);
12086 +
12087 + /* Only ascend to the next level if it is a leftmost child, but
12088 + write-lock the parent in case we will relocate the child. */
12089 + if (!znode_is_root(coord->node)) {
12090 +
12091 + ret =
12092 + jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
12093 + &alock, &aload, ZNODE_WRITE_LOCK,
12094 + 0);
12095 + if (ret != 0) {
12096 + /* FIXME(C): check EINVAL, E_DEADLOCK */
12097 + goto exit;
12098 + }
12099 +
12100 + ret =
12101 + reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
12102 + &acoord, pos);
12103 + if (ret != 0)
12104 + goto exit;
12105 +
12106 + /* Recursive call. */
12107 + if (!znode_check_flushprepped(acoord.node)) {
12108 + ret = alloc_one_ancestor(&acoord, pos);
12109 + if (ret)
12110 + goto exit;
12111 + }
12112 + }
12113 +
12114 + /* Note: we call allocate with the parent write-locked (except at the
12115 + root) in case we relocate the child, in which case it will modify the
12116 + parent during this call. */
12117 + ret = allocate_znode(coord->node, &acoord, pos);
12118 +
12119 +exit:
12120 + done_load_count(&aload);
12121 + done_lh(&alock);
12122 + return ret;
12123 +}
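+
+/* Shape of the recursion above (editor's illustration): for a flush point F
+ with dirty parent P and grandparent G, where F and P are leftmost children,
+ the calls ascend F -> P -> G and allocation happens on the way back down,
+ i.e. in parent-first order G, P, F. */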
12124 +
12125 +/* During the reverse parent-first alloc_pos_and_ancestors process described
12126 + above there is a call to this function at the twig level. During
12127 + alloc_pos_and_ancestors we may ask: should this node be relocated (in reverse
12128 + parent-first context)? We repeat this process as long as the child is the
12129 + leftmost child, eventually reaching an ancestor of the flush point that is
12130 + not a leftmost child. The preceder of that ancestors, which is not a leftmost
12131 + child, is actually on the leaf level. The preceder of that block is the
12132 + left-neighbor of the flush point. The preceder of that block is the rightmost
12133 + child of the twig on the left. So, when alloc_pos_and_ancestors passes upward
12134 + through the twig level, it stops momentarily to remember the block of the
12135 + rightmost child of the twig on the left and sets it to the flush_position's
12136 + preceder_hint.
12137 +
12138 + There is one other place where we may set the flush_position's preceder hint,
12139 + which is during scan-left.
12140 +*/
12141 +static int set_preceder(const coord_t *coord_in, flush_pos_t *pos)
12142 +{
12143 + int ret;
12144 + coord_t coord;
12145 + lock_handle left_lock;
12146 + load_count left_load;
12147 +
12148 + coord_dup(&coord, coord_in);
12149 +
12150 + init_lh(&left_lock);
12151 + init_load_count(&left_load);
12152 +
12153 + /* FIXME(B): Same FIXME as in "Find the preceder" in
12154 + reverse_relocate_test. coord_is_leftmost_unit is not the right test
12155 + if the unformatted child is in the middle of the first extent unit.*/
12156 + if (!coord_is_leftmost_unit(&coord)) {
12157 + coord_prev_unit(&coord);
12158 + } else {
12159 + ret =
12160 + reiser4_get_left_neighbor(&left_lock, coord.node,
12161 + ZNODE_READ_LOCK, GN_SAME_ATOM);
12162 + if (ret) {
12163 + /* If we fail for any reason it doesn't matter because
12164 + the preceder is only a hint. We are low-priority at
12165 + this point, so this must be the case. */
12166 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
12167 + ret == -ENOENT || ret == -EINVAL
12168 + || ret == -E_DEADLOCK)
12169 + ret = 0;
12170 + goto exit;
12171 + }
12172 +
12173 + ret = incr_load_count_znode(&left_load, left_lock.node);
12174 + if (ret)
12175 + goto exit;
12176 +
12177 + coord_init_last_unit(&coord, left_lock.node);
12178 + }
12179 +
12180 + ret =
12181 + item_utmost_child_real_block(&coord, RIGHT_SIDE,
12182 + &pos->preceder.blk);
12183 +exit:
12184 + check_preceder(pos->preceder.blk);
12185 + done_load_count(&left_load);
12186 + done_lh(&left_lock);
12187 + return ret;
12188 +}
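+
+/* Editor's illustration of the two cases handled in set_preceder() above
+ (unit layout invented):
+
+ left twig twig holding the flush point
+ [ ... | U_last ] [ U_0 | U_1 | ... ]
+
+ If the flush-point coord is not the leftmost unit, the preceder hint is the
+ rightmost child of the previous unit (the coord_prev_unit path). If it is
+ leftmost, the left neighbor twig is read-locked (same atom) and the
+ rightmost child of its last unit is used instead. */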
12189 +
12190 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
12191 +
12192 +/* This procedure implements the outer loop of the flush algorithm. To put this
12193 + in context, here is the general list of steps taken by the flush routine as a
12194 + whole:
12195 +
12196 + 1. Scan-left
12197 + 2. Scan-right (maybe)
12198 + 3. Allocate initial flush position and its ancestors
12199 + 4. <handle extents>
12200 + 5. <squeeze and allocate next position and its ancestors to-the-right,
12201 + then update position to-the-right>
12202 + 6. <repeat from #4 until flush is stopped>
12203 +
12204 + This procedure implements the loop in steps 4 through 6 in the above listing.
12205 +
12206 + Step 4: if the current flush position is an extent item (position on the twig
12207 + level), it allocates the extent (allocate_extent_item_in_place) then shifts
12208 + to the next coordinate. If the next coordinate's leftmost child needs
12209 + flushprep, we will continue. If the next coordinate is an internal item, we
12210 + descend back to the leaf level, otherwise we repeat step #4 (labeled
12211 + ALLOC_EXTENTS below). If the "next coordinate" brings us past the end of the
12212 + twig level, then we call reverse_relocate_end_of_twig to possibly dirty the
12213 + next (right) twig, prior to step #5 which moves to the right.
12214 +
12215 + Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up
12216 + the tree to allocate any ancestors of the next-right flush position that are
12217 + not also ancestors of the current position. Those ancestors (in top-down
12218 + order) are the next in parent-first order. We squeeze adjacent nodes on the
12219 + way up until the right node and current node share the same parent, then
12220 + allocate on the way back down. Finally, this step sets the flush position to
12221 + the next-right node. Then repeat steps 4 and 5.
12222 +*/
12223 +
12224 +/* SQUEEZE CODE */
12225 +
12226 +/* squalloc_right_twig helper function: cut a range of extent items from
12227 + node @to->node, from the beginning up to coord @to. */
12228 +static int squalloc_right_twig_cut(coord_t *to, reiser4_key * to_key,
12229 + znode * left)
12230 +{
12231 + coord_t from;
12232 + reiser4_key from_key;
12233 +
12234 + coord_init_first_unit(&from, to->node);
12235 + item_key_by_coord(&from, &from_key);
12236 +
12237 + return cut_node_content(&from, to, &from_key, to_key, NULL);
12238 +}
12239 +
12240 +/* Copy as much of the leading extents from @right to @left, allocating
12241 + unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
12242 + SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
12243 + internal item it calls shift_one_internal_unit and may then return
12244 + SUBTREE_MOVED. */
12245 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t *pos)
12246 +{
12247 + int ret = SUBTREE_MOVED;
12248 + coord_t coord; /* used to iterate over items */
12249 + reiser4_key stop_key;
12250 +
12251 + assert("jmacd-2008", !node_is_empty(right));
12252 + coord_init_first_unit(&coord, right);
12253 +
12254 + /* FIXME: can be optimized to cut once */
12255 + while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
12256 + ON_DEBUG(void *vp);
12257 +
12258 + assert("vs-1468", coord_is_leftmost_unit(&coord));
12259 + ON_DEBUG(vp = shift_check_prepare(left, coord.node));
12260 +
12261 + /* stop_key is used to find what was copied and what to cut */
12262 + stop_key = *reiser4_min_key();
12263 + ret = squalloc_extent(left, &coord, pos, &stop_key);
12264 + if (ret != SQUEEZE_CONTINUE) {
12265 + ON_DEBUG(kfree(vp));
12266 + break;
12267 + }
12268 + assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
12269 +
12270 + /* Helper function to do the cutting. */
12271 + set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
12272 + check_me("vs-1466",
12273 + squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
12274 +
12275 + ON_DEBUG(shift_check(vp, left, coord.node));
12276 + }
12277 +
12278 + if (node_is_empty(coord.node))
12279 + ret = SQUEEZE_SOURCE_EMPTY;
12280 +
12281 + if (ret == SQUEEZE_TARGET_FULL)
12282 + goto out;
12283 +
12284 + if (node_is_empty(right)) {
12285 + /* The whole right node was copied into @left. */
12286 + assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
12287 + goto out;
12288 + }
12289 +
12290 + coord_init_first_unit(&coord, right);
12291 +
12292 + if (!item_is_internal(&coord)) {
12293 + /* we do not want to squeeze anything else to the left neighbor
12294 + because the "slum" is over */
12295 + ret = SQUEEZE_TARGET_FULL;
12296 + goto out;
12297 + }
12298 + assert("jmacd-433", item_is_internal(&coord));
12299 +
12300 + /* Shift an internal unit. The child must be allocated before shifting
12301 + any more extents, so we stop here. */
12302 + ret = shift_one_internal_unit(left, right);
12303 +
12304 +out:
12305 + assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
12306 + || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
12307 +
12308 + if (ret == SQUEEZE_TARGET_FULL) {
12309 + /* We submit prepped nodes here and expect that this @left twig
12310 + * will not be modified again during this jnode_flush() call. */
12311 + int ret1;
12312 +
12313 + /* NOTE: it seems that I/O is done under long term locks. */
12314 + ret1 = write_prepped_nodes(pos);
12315 + if (ret1 < 0)
12316 + return ret1;
12317 + }
12318 +
12319 + return ret;
12320 +}
12321 +
12322 +#if REISER4_DEBUG
12323 +static void item_convert_invariant(flush_pos_t *pos)
12324 +{
12325 + assert("edward-1225", coord_is_existing_item(&pos->coord));
12326 + if (chaining_data_present(pos)) {
12327 + item_plugin *iplug = item_convert_plug(pos);
12328 +
12329 + assert("edward-1000",
12330 + iplug == item_plugin_by_coord(&pos->coord));
12331 + assert("edward-1001", iplug->f.convert != NULL);
12332 + } else
12333 + assert("edward-1226", pos->child == NULL);
12334 +}
12335 +#else
12336 +
12337 +#define item_convert_invariant(pos) noop
12338 +
12339 +#endif
12340 +
12341 +/* Scan node items starting from the first one and apply to each
12342 + item its flush ->convert() method (if any). This method may
12343 + resize or kill the item, so the tree will be changed.
12344 +*/
12345 +static int convert_node(flush_pos_t *pos, znode * node)
12346 +{
12347 + int ret = 0;
12348 + item_plugin *iplug;
12349 +
12350 + assert("edward-304", pos != NULL);
12351 + assert("edward-305", pos->child == NULL);
12352 + assert("edward-475", znode_convertible(node));
12353 + assert("edward-669", znode_is_wlocked(node));
12354 + assert("edward-1210", !node_is_empty(node));
12355 +
12356 + if (znode_get_level(node) != LEAF_LEVEL)
12357 + /* unsupported */
12358 + goto exit;
12359 +
12360 + coord_init_first_unit(&pos->coord, node);
12361 +
12362 + while (1) {
12363 + ret = 0;
12364 + coord_set_to_left(&pos->coord);
12365 + item_convert_invariant(pos);
12366 +
12367 + iplug = item_plugin_by_coord(&pos->coord);
12368 + assert("edward-844", iplug != NULL);
12369 +
12370 + if (iplug->f.convert) {
12371 + ret = iplug->f.convert(pos);
12372 + if (ret)
12373 + goto exit;
12374 + }
12375 + assert("edward-307", pos->child == NULL);
12376 +
12377 + if (coord_next_item(&pos->coord)) {
12378 + /* node is over */
12379 +
12380 + if (!chaining_data_present(pos))
12381 + /* finished this node */
12382 + break;
12383 + if (should_chain_next_node(pos)) {
12384 + /* go to next node */
12385 + move_chaining_data(pos, 0/* to next node */);
12386 + break;
12387 + }
12388 + /* repeat this node */
12389 + move_chaining_data(pos, 1/* this node */);
12390 + continue;
12391 + }
12392 + /* Node is not over.
12393 + Check if there is attached convert data.
12394 + If so, roll back one item position and repeat
12395 + on this node
12396 + */
12397 + if (chaining_data_present(pos)) {
12398 +
12399 + if (iplug != item_plugin_by_coord(&pos->coord))
12400 + set_item_convert_count(pos, 0);
12401 +
12402 + ret = coord_prev_item(&pos->coord);
12403 + assert("edward-1003", !ret);
12404 +
12405 + move_chaining_data(pos, 1/* this node */);
12406 + }
12407 + }
12408 + JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12409 + znode_make_dirty(node);
12410 +exit:
12411 + assert("edward-1004", !ret);
12412 + return ret;
12413 +}
12414 +
12415 +/* Squeeze and allocate the right neighbor. This is called after @left and
12416 + its current children have been squeezed and allocated already. This
12417 + procedure's job is to squeeze and allocate items from @right to @left.
12418 +
12419 + If at the leaf level, use the shift_everything_left memcpy-optimized
12420 + version of shifting (squeeze_right_non_twig).
12421 +
12422 + If at the twig level, extents are allocated as they are shifted from @right
12423 + to @left (squalloc_right_twig).
12424 +
12425 + At any other level, shift one internal item and return to the caller
12426 + (squalloc_parent_first) so that the shifted-subtree can be processed in
12427 + parent-first order.
12428 +
12429 + When a unit of an internal item is moved, squeezing stops and SUBTREE_MOVED is
12430 + returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12431 + returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12432 + is returned.
12433 +*/
12434 +
12435 +static int squeeze_right_neighbor(flush_pos_t *pos, znode * left,
12436 + znode * right)
12437 +{
12438 + int ret;
12439 +
12440 + /* FIXME: it is possible to see an empty node that has not heard the
12441 + * banshee in the tree, owing to an error (for example, ENOSPC) in write */
12442 + /* assert("jmacd-9321", !node_is_empty(left)); */
12443 + assert("jmacd-9322", !node_is_empty(right));
12444 + assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12445 +
12446 + switch (znode_get_level(left)) {
12447 + case TWIG_LEVEL:
12448 + /* Shift with extent allocating until either an internal item
12449 + is encountered or everything is shifted or no free space
12450 + left in @left */
12451 + ret = squeeze_right_twig(left, right, pos);
12452 + break;
12453 +
12454 + default:
12455 + /* All other levels can use shift_everything until we implement
12456 + per-item flush plugins. */
12457 + ret = squeeze_right_non_twig(left, right);
12458 + break;
12459 + }
12460 +
12461 + assert("jmacd-2011", (ret < 0 ||
12462 + ret == SQUEEZE_SOURCE_EMPTY
12463 + || ret == SQUEEZE_TARGET_FULL
12464 + || ret == SUBTREE_MOVED));
12465 + return ret;
12466 +}
12467 +
12468 +static int squeeze_right_twig_and_advance_coord(flush_pos_t *pos,
12469 + znode * right)
12470 +{
12471 + int ret;
12472 +
12473 + ret = squeeze_right_twig(pos->lock.node, right, pos);
12474 + if (ret < 0)
12475 + return ret;
12476 + if (ret > 0) {
12477 + coord_init_after_last_item(&pos->coord, pos->lock.node);
12478 + return ret;
12479 + }
12480 +
12481 + coord_init_last_unit(&pos->coord, pos->lock.node);
12482 + return 0;
12483 +}
12484 +
12485 +/* forward declaration */
12486 +static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12487 +
12488 +/* do a fast check for "same parents" condition before calling
12489 + * squalloc_upper_levels() */
12490 +static inline int check_parents_and_squalloc_upper_levels(flush_pos_t *pos,
12491 + znode * left,
12492 + znode * right)
12493 +{
12494 + if (znode_same_parents(left, right))
12495 + return 0;
12496 +
12497 + return squalloc_upper_levels(pos, left, right);
12498 +}
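+
+/* Note on the fast path above: when @left and @right are siblings under the
+ same parent, znode_same_parents() avoids taking and loading two parent
+ locks in squalloc_upper_levels() only to rediscover the common-parent case
+ there. */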
12499 +
12500 +/* Check whether the parent of the given @right node needs to be processed
12501 + ((re)allocated) prior to processing of the child. If @left and @right do not
12502 + share the same parent, then the parent of @right comes after @left but before
12503 + @right in parent-first order, so we have to (re)allocate it before @right
12504 + gets (re)allocated. */
12505 +static int squalloc_upper_levels(flush_pos_t *pos, znode * left, znode * right)
12506 +{
12507 + int ret;
12508 +
12509 + lock_handle left_parent_lock;
12510 + lock_handle right_parent_lock;
12511 +
12512 + load_count left_parent_load;
12513 + load_count right_parent_load;
12514 +
12515 + init_lh(&left_parent_lock);
12516 + init_lh(&right_parent_lock);
12517 +
12518 + init_load_count(&left_parent_load);
12519 + init_load_count(&right_parent_load);
12520 +
12521 + ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12522 + if (ret)
12523 + goto out;
12524 +
12525 + ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12526 + if (ret)
12527 + goto out;
12528 +
12529 + /* Check for same parents */
12530 + if (left_parent_lock.node == right_parent_lock.node)
12531 + goto out;
12532 +
12533 + if (znode_check_flushprepped(right_parent_lock.node)) {
12534 + /* Keep parent-first order. In that order, the right parent node
12535 + stands before the @right node. If it is already allocated,
12536 + we set the preceder (next block search start point) to its
12537 + block number, @right node should be allocated after it.
12538 +
12539 + However, preceder is set only if the right parent is on twig
12540 + level. The explanation is the following: new branch nodes are
12541 + allocated over already allocated children while the tree
12542 + grows, so it is difficult to keep the tree ordered; we assume
12543 + that only leaves and twigs are correctly allocated. So, only
12544 + twigs are used as a preceder for allocating the rest of
12545 + the slum. */
12546 + if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12547 + pos->preceder.blk =
12548 + *znode_get_block(right_parent_lock.node);
12549 + check_preceder(pos->preceder.blk);
12550 + }
12551 + goto out;
12552 + }
12553 +
12554 + ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12555 + if (ret)
12556 + goto out;
12557 +
12558 + ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12559 + if (ret)
12560 + goto out;
12561 +
12562 + ret =
12563 + squeeze_right_neighbor(pos, left_parent_lock.node,
12564 + right_parent_lock.node);
12565 + /* We stop on error. We also stop if some items/units were shifted
12566 + * (ret == 0) and thus @right changed its parent; it means we would not
12567 + * process the right_parent node prior to processing @right. Positive
12568 + * return values say that shifting did not happen because of the "empty
12569 + * source" or "target full" conditions. */
12570 + if (ret <= 0)
12571 + goto out;
12572 +
12573 + /* parent(@left) and parent(@right) may themselves have different
12574 + * parents; we do a recursive call to check for that. */
12575 + ret =
12576 + check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12577 + right_parent_lock.node);
12578 + if (ret)
12579 + goto out;
12580 +
12581 + /* allocate znode when going down */
12582 + ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12583 +
12584 +out:
12585 + done_load_count(&left_parent_load);
12586 + done_load_count(&right_parent_load);
12587 +
12588 + done_lh(&left_parent_lock);
12589 + done_lh(&right_parent_lock);
12590 +
12591 + return ret;
12592 +}
12593 +
12594 +/* Check the leftmost child's "flushprepped" status; also returns true if the
12595 + * child node was not found in cache. */
12596 +static int leftmost_child_of_unit_check_flushprepped(const coord_t *coord)
12597 +{
12598 + int ret;
12599 + int prepped;
12600 +
12601 + jnode *child;
12602 +
12603 + ret = get_leftmost_child_of_unit(coord, &child);
12604 +
12605 + if (ret)
12606 + return ret;
12607 +
12608 + if (child) {
12609 + prepped = jnode_check_flushprepped(child);
12610 + jput(child);
12611 + } else {
12612 + /* We treat a non-existent child as a node to which slum
12613 + processing should not continue. A node that is not cached
12614 + is clean, so it is flushprepped. */
12615 + prepped = 1;
12616 + }
12617 +
12618 + return prepped;
12619 +}
12620 +
12621 +/* (re)allocate a znode, automatically getting the parent node */
12622 +static int lock_parent_and_allocate_znode(znode * node, flush_pos_t *pos)
12623 +{
12624 + int ret;
12625 + lock_handle parent_lock;
12626 + load_count parent_load;
12627 + coord_t pcoord;
12628 +
12629 + assert("zam-851", znode_is_write_locked(node));
12630 +
12631 + init_lh(&parent_lock);
12632 + init_load_count(&parent_load);
12633 +
12634 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12635 + if (ret)
12636 + goto out;
12637 +
12638 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
12639 + if (ret)
12640 + goto out;
12641 +
12642 + ret = find_child_ptr(parent_lock.node, node, &pcoord);
12643 + if (ret)
12644 + goto out;
12645 +
12646 + ret = allocate_znode(node, &pcoord, pos);
12647 +
12648 +out:
12649 + done_load_count(&parent_load);
12650 + done_lh(&parent_lock);
12651 + return ret;
12652 +}
12653 +
12654 +/* Process formatted nodes until an unformatted node or the rightmost node in
12655 + * the slum is reached. */
12656 +static int handle_pos_on_formatted(flush_pos_t *pos)
12657 +{
12658 + int ret;
12659 + lock_handle right_lock;
12660 + load_count right_load;
12661 +
12662 + init_lh(&right_lock);
12663 + init_load_count(&right_load);
12664 +
12665 + if (should_convert_node(pos, pos->lock.node)) {
12666 + ret = convert_node(pos, pos->lock.node);
12667 + if (ret)
12668 + return ret;
12669 + }
12670 +
12671 + while (1) {
12672 + int expected;
12673 + expected = should_convert_next_node(pos);
12674 + ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12675 + ZNODE_WRITE_LOCK, !expected, expected);
12676 + if (ret) {
12677 + if (expected)
12678 + warning("edward-1495",
12679 + "Expected neighbor not found (ret = %d). Fsck?",
12680 + ret);
12681 + break;
12682 + }
12683 +
12684 +		/* we don't prep (allocate) nodes for flushing twice. This can
12685 +		 * be suboptimal, or it can be optimal. For now we choose to
12686 +		 * live with the risk of suboptimality because it would be
12687 +		 * quite complex to code it to be smarter. */
12688 + if (znode_check_flushprepped(right_lock.node)
12689 + && !znode_convertible(right_lock.node)) {
12690 + assert("edward-1005", !should_convert_next_node(pos));
12691 + pos_stop(pos);
12692 + break;
12693 + }
12694 +
12695 + ret = incr_load_count_znode(&right_load, right_lock.node);
12696 + if (ret)
12697 + break;
12698 + if (should_convert_node(pos, right_lock.node)) {
12699 + ret = convert_node(pos, right_lock.node);
12700 + if (ret)
12701 + break;
12702 + if (node_is_empty(right_lock.node)) {
12703 + /* node became empty after converting, repeat */
12704 + done_load_count(&right_load);
12705 + done_lh(&right_lock);
12706 + continue;
12707 + }
12708 + }
12709 +
12710 + /* squeeze _before_ going upward. */
12711 + ret =
12712 + squeeze_right_neighbor(pos, pos->lock.node,
12713 + right_lock.node);
12714 + if (ret < 0)
12715 + break;
12716 +
12717 + if (znode_check_flushprepped(right_lock.node)) {
12718 + if (should_convert_next_node(pos)) {
12719 +				/* in spite of the flushprepped status of the
12720 +				   node, its right slum neighbor must be converted */
12721 + assert("edward-953", convert_data(pos));
12722 + assert("edward-954", item_convert_data(pos));
12723 +
12724 + if (node_is_empty(right_lock.node)) {
12725 + done_load_count(&right_load);
12726 + done_lh(&right_lock);
12727 + } else
12728 + move_flush_pos(pos, &right_lock,
12729 + &right_load, NULL);
12730 + continue;
12731 + }
12732 + pos_stop(pos);
12733 + break;
12734 + }
12735 +
12736 + if (node_is_empty(right_lock.node)) {
12737 + /* repeat if right node was squeezed completely */
12738 + done_load_count(&right_load);
12739 + done_lh(&right_lock);
12740 + continue;
12741 + }
12742 +
12743 +		/* parent(right_lock.node) has to be processed before
12744 +		 * right_lock.node due to the "parent-first" allocation order. */
12745 + ret =
12746 + check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12747 + right_lock.node);
12748 + if (ret)
12749 + break;
12750 + /* (re)allocate _after_ going upward */
12751 + ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12752 + if (ret)
12753 + break;
12754 + if (should_terminate_squalloc(pos)) {
12755 + set_item_convert_count(pos, 0);
12756 + break;
12757 + }
12758 +
12759 + /* advance the flush position to the right neighbor */
12760 + move_flush_pos(pos, &right_lock, &right_load, NULL);
12761 +
12762 + ret = rapid_flush(pos);
12763 + if (ret)
12764 + break;
12765 + }
12766 + check_convert_info(pos);
12767 + done_load_count(&right_load);
12768 + done_lh(&right_lock);
12769 +
12770 +	/* This function indicates via pos whether to stop, go to the twig
12771 +	 * level, or continue on the current level. */
12772 + return ret;
12773 +
12774 +}
12775 +
12776 +/* Process nodes on the leaf level until an unformatted node or the rightmost
12777 + * node in the slum is reached. */
12778 +static int handle_pos_on_leaf(flush_pos_t *pos)
12779 +{
12780 + int ret;
12781 +
12782 + assert("zam-845", pos->state == POS_ON_LEAF);
12783 +
12784 + ret = handle_pos_on_formatted(pos);
12785 +
12786 + if (ret == -E_NO_NEIGHBOR) {
12787 + /* cannot get right neighbor, go process extents. */
12788 + pos->state = POS_TO_TWIG;
12789 + return 0;
12790 + }
12791 +
12792 + return ret;
12793 +}
12794 +
12795 +/* Process slum on level > 1 */
12796 +static int handle_pos_on_internal(flush_pos_t *pos)
12797 +{
12798 + assert("zam-850", pos->state == POS_ON_INTERNAL);
12799 + return handle_pos_on_formatted(pos);
12800 +}
12801 +
12802 +/* check whether squalloc should stop before processing given extent */
12803 +static int squalloc_extent_should_stop(flush_pos_t *pos)
12804 +{
12805 + assert("zam-869", item_is_extent(&pos->coord));
12806 +
12807 +	/* pos->child is the jnode that handle_pos_on_extent() should start
12808 +	 * with, instead of the first child of the first extent unit. */
12809 + if (pos->child) {
12810 + int prepped;
12811 +
12812 + assert("vs-1383", jnode_is_unformatted(pos->child));
12813 + prepped = jnode_check_flushprepped(pos->child);
12814 + pos->pos_in_unit =
12815 + jnode_get_index(pos->child) -
12816 + extent_unit_index(&pos->coord);
12817 + assert("vs-1470",
12818 + pos->pos_in_unit < extent_unit_width(&pos->coord));
12819 + assert("nikita-3434",
12820 + ergo(extent_is_unallocated(&pos->coord),
12821 + pos->pos_in_unit == 0));
12822 + jput(pos->child);
12823 + pos->child = NULL;
12824 +
12825 + return prepped;
12826 + }
12827 +
12828 + pos->pos_in_unit = 0;
12829 + if (extent_is_unallocated(&pos->coord))
12830 + return 0;
12831 +
12832 + return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12833 +}
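+
+/* A hedged, self-contained sketch of the pos_in_unit arithmetic used in
+   squalloc_extent_should_stop() above; the types and numbers are made-up
+   stand-ins, not the reiser4 API. An extent unit covers file pages
+   [unit_index, unit_index + width), so a child's position inside the unit
+   is its page index minus the unit's first page index. */
+#include <assert.h>
+
+static unsigned long sketch_pos_in_unit(unsigned long unit_index,
+					unsigned long width,
+					unsigned long child_index)
+{
+	unsigned long pos_in_unit = child_index - unit_index;
+
+	assert(pos_in_unit < width);	/* same invariant as check "vs-1470" */
+	return pos_in_unit;
+}
+/* e.g. sketch_pos_in_unit(1000, 16, 1005) == 5 */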
12834 +
12835 +/* Handle the case when the regular reiser4 tree (znodes connected to their
12836 + * neighbors by sibling pointers) is interrupted on the leaf level by one or
12837 + * more unformatted nodes. By holding a lock on the twig level and using
12838 + * extent code routines to process unformatted nodes, we swim around an
12839 + * irregular part of the reiser4 tree. */
12840 +static int handle_pos_on_twig(flush_pos_t *pos)
12841 +{
12842 + int ret;
12843 +
12844 + assert("zam-844", pos->state == POS_ON_EPOINT);
12845 + assert("zam-843", item_is_extent(&pos->coord));
12846 +
12847 +	/* We decide whether to continue slum processing with the current
12848 +	   extent unit: if the leftmost child of the current extent unit is
12849 +	   flushprepped (i.e. clean or already processed by flush), we stop
12850 +	   squalloc(). There is a fast path for unallocated extents, which we
12851 +	   assume contain only nodes that are not flushprepped. */
12852 +	/* FIXME: Here we implement a simple check: we only look at the
12853 +	   leftmost child. */
12854 + ret = squalloc_extent_should_stop(pos);
12855 + if (ret != 0) {
12856 + pos_stop(pos);
12857 + return ret;
12858 + }
12859 +
12860 + while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12861 + && item_is_extent(&pos->coord)) {
12862 + ret = reiser4_alloc_extent(pos);
12863 + if (ret)
12864 + break;
12865 + coord_next_unit(&pos->coord);
12866 + }
12867 +
12868 + if (coord_is_after_rightmost(&pos->coord)) {
12869 + pos->state = POS_END_OF_TWIG;
12870 + return 0;
12871 + }
12872 + if (item_is_internal(&pos->coord)) {
12873 + pos->state = POS_TO_LEAF;
12874 + return 0;
12875 + }
12876 +
12877 + assert("zam-860", item_is_extent(&pos->coord));
12878 +
12879 + /* "slum" is over */
12880 + pos->state = POS_INVALID;
12881 + return 0;
12882 +}
12883 +
12884 +/* When we are about to return the flush position from twig to leaf level, we
12885 + * can either process the right twig node or move the position to the leaf.
12886 + * This processes the right twig if possible and jumps to leaf level if not. */
12887 +static int handle_pos_end_of_twig(flush_pos_t *pos)
12888 +{
12889 + int ret;
12890 + lock_handle right_lock;
12891 + load_count right_load;
12892 + coord_t at_right;
12893 + jnode *child = NULL;
12894 +
12895 + assert("zam-848", pos->state == POS_END_OF_TWIG);
12896 + assert("zam-849", coord_is_after_rightmost(&pos->coord));
12897 +
12898 + init_lh(&right_lock);
12899 + init_load_count(&right_load);
12900 +
12901 +	/* We get a lock on the right twig node even if it is not dirty,
12902 +	 * because the slum continues or discontinues on the leaf level, not
12903 +	 * on the next twig. This lock is needed to get its leftmost child. */
12904 + ret =
12905 + reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12906 + ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12907 + if (ret)
12908 + goto out;
12909 +
12910 + ret = incr_load_count_znode(&right_load, right_lock.node);
12911 + if (ret)
12912 + goto out;
12913 +
12914 +	/* the right twig might not be dirty */
12915 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12916 +		/* If the right twig node is dirty we always attempt to
12917 +		 * squeeze its content to the left... */
12918 +became_dirty:
12919 + ret =
12920 + squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12921 + if (ret <= 0) {
12922 + /* pos->coord is on internal item, go to leaf level, or
12923 + * we have an error which will be caught in squalloc()
12924 + */
12925 + pos->state = POS_TO_LEAF;
12926 + goto out;
12927 + }
12928 +
12929 +		/* If the right twig was squeezed completely we have to
12930 +		 * re-lock it. Now this is done through the top-level squalloc
12931 +		 * routine. */
12932 + if (node_is_empty(right_lock.node))
12933 + goto out;
12934 +
12935 + /* ... and prep it if it is not yet prepped */
12936 + if (!znode_check_flushprepped(right_lock.node)) {
12937 + /* As usual, process parent before ... */
12938 +			ret = check_parents_and_squalloc_upper_levels(
12939 +					pos,
12940 +					pos->lock.node,
12941 +					right_lock.node);
12944 + if (ret)
12945 + goto out;
12946 +
12947 + /* ... processing the child */
12948 + ret =
12949 + lock_parent_and_allocate_znode(right_lock.node,
12950 + pos);
12951 + if (ret)
12952 + goto out;
12953 + }
12954 + } else {
12955 + coord_init_first_unit(&at_right, right_lock.node);
12956 +
12957 +		/* check the first child of the next twig: should we continue there? */
12958 + ret = get_leftmost_child_of_unit(&at_right, &child);
12959 + if (ret || child == NULL || jnode_check_flushprepped(child)) {
12960 + pos_stop(pos);
12961 + goto out;
12962 + }
12963 +
12964 + /* check clean twig for possible relocation */
12965 + if (!znode_check_flushprepped(right_lock.node)) {
12966 + ret =
12967 + reverse_relocate_check_dirty_parent(child,
12968 + &at_right, pos);
12969 + if (ret)
12970 + goto out;
12971 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12972 + goto became_dirty;
12973 + }
12974 + }
12975 +
12976 + assert("zam-875", znode_check_flushprepped(right_lock.node));
12977 +
12978 +	/* Update the preceder with the block number of the just processed
12979 +	 * right twig node. The code above could miss the preceder update
12980 +	 * because allocate_znode() might not have been called for this node. */
12981 + pos->preceder.blk = *znode_get_block(right_lock.node);
12982 + check_preceder(pos->preceder.blk);
12983 +
12984 + coord_init_first_unit(&at_right, right_lock.node);
12985 + assert("zam-868", coord_is_existing_unit(&at_right));
12986 +
12987 + pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12988 + move_flush_pos(pos, &right_lock, &right_load, &at_right);
12989 +
12990 +out:
12991 + done_load_count(&right_load);
12992 + done_lh(&right_lock);
12993 +
12994 + if (child)
12995 + jput(child);
12996 +
12997 + return ret;
12998 +}
12999 +
13000 +/* Move pos->lock to the leaf node pointed to by pos->coord, and check
13001 + * whether we should continue there. */
13002 +static int handle_pos_to_leaf(flush_pos_t *pos)
13003 +{
13004 + int ret;
13005 + lock_handle child_lock;
13006 + load_count child_load;
13007 + jnode *child;
13008 +
13009 + assert("zam-846", pos->state == POS_TO_LEAF);
13010 + assert("zam-847", item_is_internal(&pos->coord));
13011 +
13012 + init_lh(&child_lock);
13013 + init_load_count(&child_load);
13014 +
13015 + ret = get_leftmost_child_of_unit(&pos->coord, &child);
13016 + if (ret)
13017 + return ret;
13018 + if (child == NULL) {
13019 + pos_stop(pos);
13020 + return 0;
13021 + }
13022 +
13023 + if (jnode_check_flushprepped(child)) {
13024 + pos->state = POS_INVALID;
13025 + goto out;
13026 + }
13027 +
13028 + ret =
13029 + longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
13030 + ZNODE_LOCK_LOPRI);
13031 + if (ret)
13032 + goto out;
13033 +
13034 + ret = incr_load_count_znode(&child_load, JZNODE(child));
13035 + if (ret)
13036 + goto out;
13037 +
13038 + ret = allocate_znode(JZNODE(child), &pos->coord, pos);
13039 + if (ret)
13040 + goto out;
13041 +
13042 + /* move flush position to leaf level */
13043 + pos->state = POS_ON_LEAF;
13044 + move_flush_pos(pos, &child_lock, &child_load, NULL);
13045 +
13046 + if (node_is_empty(JZNODE(child))) {
13047 + ret = delete_empty_node(JZNODE(child));
13048 + pos->state = POS_INVALID;
13049 + }
13050 +out:
13051 + done_load_count(&child_load);
13052 + done_lh(&child_lock);
13053 + jput(child);
13054 +
13055 + return ret;
13056 +}
13057 +
13058 +/* Move pos from leaf to twig, and move the lock from leaf to twig; that is,
13059 +   move pos->lock to the upper (twig) level */
13060 +static int handle_pos_to_twig(flush_pos_t *pos)
13061 +{
13062 + int ret;
13063 +
13064 + lock_handle parent_lock;
13065 + load_count parent_load;
13066 + coord_t pcoord;
13067 +
13068 + assert("zam-852", pos->state == POS_TO_TWIG);
13069 +
13070 + init_lh(&parent_lock);
13071 + init_load_count(&parent_load);
13072 +
13073 + ret =
13074 + reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
13075 + if (ret)
13076 + goto out;
13077 +
13078 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
13079 + if (ret)
13080 + goto out;
13081 +
13082 + ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
13083 + if (ret)
13084 + goto out;
13085 +
13086 + assert("zam-870", item_is_internal(&pcoord));
13087 + coord_next_item(&pcoord);
13088 +
13089 + if (coord_is_after_rightmost(&pcoord))
13090 + pos->state = POS_END_OF_TWIG;
13091 + else if (item_is_extent(&pcoord))
13092 + pos->state = POS_ON_EPOINT;
13093 + else {
13094 +		/* Here we understand that getting -E_NO_NEIGHBOR in
13095 +		 * handle_pos_on_leaf() was just because we reached the edge
13096 +		 * of the slum */
13097 + pos_stop(pos);
13098 + goto out;
13099 + }
13100 +
13101 + move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
13102 +
13103 +out:
13104 + done_load_count(&parent_load);
13105 + done_lh(&parent_lock);
13106 +
13107 + return ret;
13108 +}
13109 +
13110 +typedef int (*pos_state_handle_t) (flush_pos_t *);
13111 +static pos_state_handle_t flush_pos_handlers[] = {
13112 + /* process formatted nodes on leaf level, keep lock on a leaf node */
13113 + [POS_ON_LEAF] = handle_pos_on_leaf,
13114 + /* process unformatted nodes, keep lock on twig node, pos->coord points
13115 + * to extent currently being processed */
13116 + [POS_ON_EPOINT] = handle_pos_on_twig,
13117 + /* move a lock from leaf node to its parent for further processing of
13118 + unformatted nodes */
13119 + [POS_TO_TWIG] = handle_pos_to_twig,
13120 + /* move a lock from twig to leaf level when a processing of unformatted
13121 + * nodes finishes, pos->coord points to the leaf node we jump to */
13122 + [POS_TO_LEAF] = handle_pos_to_leaf,
13123 +	/* after processing the last extent in the twig node, attempt to shift
13124 +	 * items from the twig's right neighbor and process them while shifting */
13125 + [POS_END_OF_TWIG] = handle_pos_end_of_twig,
13126 + /* process formatted nodes on internal level, keep lock on an internal
13127 + node */
13128 + [POS_ON_INTERNAL] = handle_pos_on_internal
13129 +};
13130 +
13131 +/* Advance flush position horizontally, prepare for flushing ((re)allocate,
13132 + * squeeze, encrypt) nodes and their ancestors in "parent-first" order */
13133 +static int squalloc(flush_pos_t *pos)
13134 +{
13135 + int ret = 0;
13136 +
13137 + /* maybe needs to be made a case statement with handle_pos_on_leaf as
13138 + * first case, for greater CPU efficiency? Measure and see.... -Hans */
13139 + while (pos_valid(pos)) {
13140 + ret = flush_pos_handlers[pos->state] (pos);
13141 + if (ret < 0)
13142 + break;
13143 +
13144 + ret = rapid_flush(pos);
13145 + if (ret)
13146 + break;
13147 + }
13148 +
13149 + /* any positive value or -E_NO_NEIGHBOR are legal return codes for
13150 + handle_pos* routines, -E_NO_NEIGHBOR means that slum edge was
13151 + reached */
13152 + if (ret > 0 || ret == -E_NO_NEIGHBOR)
13153 + ret = 0;
13154 +
13155 + return ret;
13156 +}
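+
+/* Below is a minimal userspace sketch of the table-driven state machine that
+   squalloc() implements above; the states and handlers are hypothetical
+   stand-ins for flush_pos_handlers[], kept only to show the dispatch shape. */
+#include <stdio.h>
+
+enum sketch_state { SK_ON_LEAF, SK_TO_TWIG, SK_INVALID, SK_NR_STATES };
+
+struct sketch_pos { enum sketch_state state; int steps; };
+
+typedef int (*sketch_handler)(struct sketch_pos *);
+
+static int sk_on_leaf(struct sketch_pos *p)
+{
+	/* pretend we processed one leaf, then climb to the twig level */
+	p->steps++;
+	p->state = p->steps < 3 ? SK_ON_LEAF : SK_TO_TWIG;
+	return 0;
+}
+
+static int sk_to_twig(struct sketch_pos *p)
+{
+	p->state = SK_INVALID;	/* nothing more to do in this sketch */
+	return 0;
+}
+
+static sketch_handler sketch_handlers[SK_NR_STATES] = {
+	[SK_ON_LEAF] = sk_on_leaf,
+	[SK_TO_TWIG] = sk_to_twig,
+};
+
+static int sketch_squalloc(struct sketch_pos *p)
+{
+	int ret = 0;
+
+	/* same loop shape as squalloc(): dispatch on the current state until
+	   the position goes invalid or a handler reports an error */
+	while (p->state != SK_INVALID &&
+	       (ret = sketch_handlers[p->state](p)) == 0)
+		;
+	return ret < 0 ? ret : 0;
+}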
13157 +
13158 +static void update_ldkey(znode * node)
13159 +{
13160 + reiser4_key ldkey;
13161 +
13162 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
13163 + if (node_is_empty(node))
13164 + return;
13165 +
13166 + znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
13167 +}
13168 +
13169 +/* this is to be called after the node's shift method has shifted data from
13170 +   @right to @left. It sets the left delimiting keys of @left and @right to
13171 +   the keys of the first items of @left and @right respectively, and sets the
13172 +   right delimiting key of @left to the first key of @right */
13173 +static void update_znode_dkeys(znode * left, znode * right)
13174 +{
13175 + assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
13176 + assert("vs-1629", (znode_is_write_locked(left) &&
13177 + znode_is_write_locked(right)));
13178 +
13179 +	/* we need to update the left delimiting key of @left if it was
13180 +	   empty before the shift */
13181 + update_ldkey(left);
13182 + update_ldkey(right);
13183 + if (node_is_empty(right))
13184 + znode_set_rd_key(left, znode_get_rd_key(right));
13185 + else
13186 + znode_set_rd_key(left, znode_get_ld_key(right));
13187 +}
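+
+/* A self-contained sketch of the delimiting-key rule implemented by
+   update_znode_dkeys() above, with plain integers standing in for reiser4
+   keys (an illustrative assumption): after a shift, each non-empty node's
+   left key is the key of its first item, and the left node's right key is
+   the right node's left key -- or the right node's old right key if the
+   right node was emptied. */
+#include <assert.h>
+
+struct sketch_node { int first_key; int ld, rd; int empty; };
+
+static void sketch_update_dkeys(struct sketch_node *l, struct sketch_node *r)
+{
+	if (!l->empty)
+		l->ld = l->first_key;	/* update_ldkey() equivalent */
+	if (!r->empty)
+		r->ld = r->first_key;
+	l->rd = r->empty ? r->rd : r->ld;
+}
+
+static void sketch_dkeys_demo(void)
+{
+	/* items with keys [20..25) migrated left, so @r now starts at 25 */
+	struct sketch_node l = { .first_key = 10, .ld = 10, .rd = 20 };
+	struct sketch_node r = { .first_key = 25, .ld = 20, .rd = 40 };
+
+	sketch_update_dkeys(&l, &r);
+	assert(l.rd == 25 && r.ld == 25);
+}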
13188 +
13189 +/* try to shift everything from @right to @left. If everything was shifted,
13190 +   @right is removed from the tree. The result is the number of bytes shifted. */
13191 +static int
13192 +shift_everything_left(znode * right, znode * left, carry_level * todo)
13193 +{
13194 + coord_t from;
13195 + node_plugin *nplug;
13196 + carry_plugin_info info;
13197 +
13198 + coord_init_after_last_item(&from, right);
13199 +
13200 + nplug = node_plugin_by_node(right);
13201 + info.doing = NULL;
13202 + info.todo = todo;
13203 + return nplug->shift(&from, left, SHIFT_LEFT,
13204 + 1 /* delete @right if it becomes empty */ ,
13205 + 1
13206 + /* move coord @from to node @left if everything will
13207 + be shifted */
13208 + ,
13209 + &info);
13210 +}
13211 +
13212 +/* Shift as much as possible from @right to @left using the memcpy-optimized
13213 + shift_everything_left. @left and @right are formatted neighboring nodes on
13214 + leaf level. */
13215 +static int squeeze_right_non_twig(znode * left, znode * right)
13216 +{
13217 + int ret;
13218 + carry_pool *pool;
13219 + carry_level *todo;
13220 +
13221 + assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
13222 +
13223 + if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
13224 + !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
13225 + return SQUEEZE_TARGET_FULL;
13226 +
13227 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
13228 + if (IS_ERR(pool))
13229 + return PTR_ERR(pool);
13230 + todo = (carry_level *) (pool + 1);
13231 + init_carry_level(todo, pool);
13232 +
13233 + ret = shift_everything_left(right, left, todo);
13234 + if (ret > 0) {
13235 + /* something was shifted */
13236 + reiser4_tree *tree;
13237 + __u64 grabbed;
13238 +
13239 + znode_make_dirty(left);
13240 + znode_make_dirty(right);
13241 +
13242 +		/* update the delimiting keys of the nodes which participated
13243 +		   in the shift. FIXME: it would be better to have this in the
13244 +		   node's shift operation. But it cannot be done there. Nobody
13245 +		   remembers why, though */
13246 + tree = znode_get_tree(left);
13247 + write_lock_dk(tree);
13248 + update_znode_dkeys(left, right);
13249 + write_unlock_dk(tree);
13250 +
13251 + /* Carry is called to update delimiting key and, maybe, to
13252 + remove empty node. */
13253 + grabbed = get_current_context()->grabbed_blocks;
13254 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13255 + assert("nikita-3003", ret == 0); /* reserved space is
13256 + exhausted. Ask Hans. */
13257 + ret = reiser4_carry(todo, NULL/* previous level */);
13258 + grabbed2free_mark(grabbed);
13259 + } else {
13260 +		/* Shifting is impossible; return the appropriate result code */
13261 + ret =
13262 + node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
13263 + SQUEEZE_TARGET_FULL;
13264 + }
13265 +
13266 + done_carry_pool(pool);
13267 +
13268 + return ret;
13269 +}
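+
+/* The tri-state result convention used by squeeze_right_non_twig() above, as
+   a small sketch; the constants and byte counts are stand-ins, only the shape
+   of the convention is taken from the code: a positive shift result means
+   something moved, otherwise the caller learns whether the source was empty
+   or the target was full. */
+enum { SK_SQ_SOURCE_EMPTY = 1, SK_SQ_TARGET_FULL = 2 };
+
+static int sketch_squeeze(int src_bytes, int dst_free_bytes)
+{
+	int moved = src_bytes < dst_free_bytes ? src_bytes : dst_free_bytes;
+
+	if (moved > 0)
+		return moved;		/* something was shifted */
+	return src_bytes == 0 ? SK_SQ_SOURCE_EMPTY : SK_SQ_TARGET_FULL;
+}
+/* sketch_squeeze(100, 4096) > 0; sketch_squeeze(0, 4096) == SK_SQ_SOURCE_EMPTY;
+   sketch_squeeze(100, 0) == SK_SQ_TARGET_FULL */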
13270 +
13271 +#if REISER4_DEBUG
13272 +static int sibling_link_is_ok(const znode *left, const znode *right)
13273 +{
13274 + int result;
13275 +
13276 + read_lock_tree(znode_get_tree(left));
13277 + result = (left->right == right && left == right->left);
13278 + read_unlock_tree(znode_get_tree(left));
13279 + return result;
13280 +}
13281 +#endif
13282 +
13283 +/* Shift the first unit of the first item if it is an internal one. Return
13284 +   SQUEEZE_TARGET_FULL if it fails to shift an item; otherwise return
13285 +   SUBTREE_MOVED. */
13286 +static int shift_one_internal_unit(znode * left, znode * right)
13287 +{
13288 + int ret;
13289 + carry_pool *pool;
13290 + carry_level *todo;
13291 + coord_t *coord;
13292 + carry_plugin_info *info;
13293 + int size, moved;
13294 +
13295 + assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
13296 + assert("nikita-2435", znode_is_write_locked(left));
13297 + assert("nikita-2436", znode_is_write_locked(right));
13298 + assert("nikita-2434", sibling_link_is_ok(left, right));
13299 +
13300 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
13301 + sizeof(*coord) + sizeof(*info)
13302 +#if REISER4_DEBUG
13303 + + sizeof(*coord) + 2 * sizeof(reiser4_key)
13304 +#endif
13305 + );
13306 + if (IS_ERR(pool))
13307 + return PTR_ERR(pool);
13308 + todo = (carry_level *) (pool + 1);
13309 + init_carry_level(todo, pool);
13310 +
13311 + coord = (coord_t *) (todo + 3);
13312 + coord_init_first_unit(coord, right);
13313 + info = (carry_plugin_info *) (coord + 1);
13314 +
13315 +#if REISER4_DEBUG
13316 + if (!node_is_empty(left)) {
13317 + coord_t *last;
13318 + reiser4_key *right_key;
13319 + reiser4_key *left_key;
13320 +
13321 + last = (coord_t *) (info + 1);
13322 + right_key = (reiser4_key *) (last + 1);
13323 + left_key = right_key + 1;
13324 + coord_init_last_unit(last, left);
13325 +
13326 + assert("nikita-2463",
13327 + keyle(item_key_by_coord(last, left_key),
13328 + item_key_by_coord(coord, right_key)));
13329 + }
13330 +#endif
13331 +
13332 + assert("jmacd-2007", item_is_internal(coord));
13333 +
13334 + size = item_length_by_coord(coord);
13335 + info->todo = todo;
13336 + info->doing = NULL;
13337 +
13338 + ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13339 + 1
13340 + /* delete @right if it becomes
13341 + empty */
13342 + ,
13343 + 0
13344 + /* do not move coord @coord to
13345 + node @left */
13346 + ,
13347 + info);
13348 +
13349 + /* If shift returns positive, then we shifted the item. */
13350 + assert("vs-423", ret <= 0 || size == ret);
13351 + moved = (ret > 0);
13352 +
13353 + if (moved) {
13354 + /* something was moved */
13355 + reiser4_tree *tree;
13356 + int grabbed;
13357 +
13358 + znode_make_dirty(left);
13359 + znode_make_dirty(right);
13360 + tree = znode_get_tree(left);
13361 + write_lock_dk(tree);
13362 + update_znode_dkeys(left, right);
13363 + write_unlock_dk(tree);
13364 +
13365 + /* reserve space for delimiting keys after shifting */
13366 + grabbed = get_current_context()->grabbed_blocks;
13367 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13368 + assert("nikita-3003", ret == 0); /* reserved space is
13369 + exhausted. Ask Hans. */
13370 +
13371 + ret = reiser4_carry(todo, NULL/* previous level */);
13372 + grabbed2free_mark(grabbed);
13373 + }
13374 +
13375 + done_carry_pool(pool);
13376 +
13377 + if (ret != 0) {
13378 + /* Shift or carry operation failed. */
13379 + assert("jmacd-7325", ret < 0);
13380 + return ret;
13381 + }
13382 +
13383 + return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13384 +}
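+
+/* shift_one_internal_unit() above carves several helper objects out of one
+   allocation by pointer arithmetic on the preceding object. A minimal
+   userspace sketch of that layout trick, with dummy stand-in types (the real
+   code relies on the same implicit alignment assumptions): */
+#include <stdlib.h>
+
+struct sk_pool { long dummy; };
+struct sk_level { long dummy; };
+struct sk_coord { long dummy; };
+
+static int sketch_pool_layout(void)
+{
+	struct sk_pool *pool;
+	struct sk_level *todo;
+	struct sk_coord *coord;
+
+	pool = malloc(sizeof(*pool) + 3 * sizeof(*todo) + sizeof(*coord));
+	if (!pool)
+		return -1;
+	todo = (struct sk_level *)(pool + 1);	/* 3 levels follow the pool */
+	coord = (struct sk_coord *)(todo + 3);	/* the coord follows them */
+	(void)coord;
+	free(pool);
+	return 0;
+}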
13385 +
13386 +/* Make the final relocate/wander decision during forward parent-first squalloc
13387 + for a znode. For unformatted nodes this is done in
13388 + plugin/item/extent.c:extent_needs_allocation(). */
13389 +static int
13390 +allocate_znode_loaded(znode * node,
13391 + const coord_t *parent_coord, flush_pos_t *pos)
13392 +{
13393 + int ret;
13394 + reiser4_super_info_data *sbinfo = get_current_super_private();
13395 +	/* FIXME(D): We have the node write-locked and should have checked
13396 +	   for !allocated() somewhere before reaching this point, but there
13397 +	   can be a race, so this assertion is bogus. */
13398 + assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13399 + assert("jmacd-7988", znode_is_write_locked(node));
13400 + assert("jmacd-7989", coord_is_invalid(parent_coord)
13401 + || znode_is_write_locked(parent_coord->node));
13402 +
13403 + if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13404 + znode_is_root(node) ||
13405 + /* We have enough nodes to relocate no matter what. */
13406 + (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13407 +		/* No need to decide for new nodes; they are treated the same
13408 +		   as relocate. If the root node is dirty, relocate. */
13409 + if (pos->preceder.blk == 0) {
13410 +			/* the preceder is unknown and we have decided to
13411 +			   relocate the node -- using the default value for the
13412 +			   search start is better than searching from block #0. */
13413 + get_blocknr_hint_default(&pos->preceder.blk);
13414 + check_preceder(pos->preceder.blk);
13415 + }
13416 +
13417 + goto best_reloc;
13418 +
13419 + } else if (pos->preceder.blk == 0) {
13420 + /* If we don't know the preceder, leave it where it is. */
13421 + jnode_make_wander(ZJNODE(node));
13422 + } else {
13423 + /* Make a decision based on block distance. */
13424 + reiser4_block_nr dist;
13425 + reiser4_block_nr nblk = *znode_get_block(node);
13426 +
13427 + assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13428 + assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13429 + assert("jmacd-6174", pos->preceder.blk != 0);
13430 +
13431 + if (pos->preceder.blk == nblk - 1) {
13432 + /* Ideal. */
13433 + jnode_make_wander(ZJNODE(node));
13434 + } else {
13435 +
13436 +			dist = (nblk < pos->preceder.blk) ?
13437 +			       (pos->preceder.blk - nblk) :
13438 +			       (nblk - pos->preceder.blk);
13439 +
13440 +			/* See if we can find a closer block
13441 +			   (forward direction only). */
13442 +			pos->preceder.max_dist =
13443 +			    min((reiser4_block_nr)sbinfo->flush.relocate_distance,
13444 +				dist);
13447 + pos->preceder.level = znode_get_level(node);
13448 +
13449 + ret = allocate_znode_update(node, parent_coord, pos);
13450 +
13451 + pos->preceder.max_dist = 0;
13452 +
13453 + if (ret && (ret != -ENOSPC))
13454 + return ret;
13455 +
13456 + if (ret == 0) {
13457 + /* Got a better allocation. */
13458 + znode_make_reloc(node, pos->fq);
13459 + } else if (dist < sbinfo->flush.relocate_distance) {
13460 + /* The present allocation is good enough. */
13461 + jnode_make_wander(ZJNODE(node));
13462 + } else {
13463 + /* Otherwise, try to relocate to the best
13464 + position. */
13465 +best_reloc:
13466 + ret =
13467 + allocate_znode_update(node, parent_coord,
13468 + pos);
13469 + if (ret != 0)
13470 + return ret;
13471 +
13472 + /* set JNODE_RELOC bit _after_ node gets
13473 + allocated */
13474 + znode_make_reloc(node, pos->fq);
13475 + }
13476 + }
13477 + }
13478 +
13479 + /* This is the new preceder. */
13480 + pos->preceder.blk = *znode_get_block(node);
13481 + check_preceder(pos->preceder.blk);
13482 + pos->alloc_cnt += 1;
13483 +
13484 + assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13485 +
13486 + return 0;
13487 +}
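+
+/* A self-contained sketch of the relocate-vs-wander distance heuristic that
+   allocate_znode_loaded() applies above. It is simplified on purpose: the
+   real code first tries to allocate a closer block and only falls back to
+   wandering if that fails; the block numbers and the 64-block default are
+   illustrative (cf. the flush.relocate_distance mount option). */
+typedef unsigned long long sk_blocknr;
+
+static const char *sketch_reloc_decision(sk_blocknr preceder, sk_blocknr nblk,
+					 sk_blocknr relocate_distance)
+{
+	sk_blocknr dist;
+
+	if (preceder == nblk - 1)
+		return "wander: already adjacent to the preceder";
+	dist = nblk < preceder ? preceder - nblk : nblk - preceder;
+	if (dist < relocate_distance)
+		return "wander: current location is close enough";
+	return "relocate: try to allocate nearer the preceder";
+}
+/* e.g. sketch_reloc_decision(1000, 1001, 64) -> adjacent;
+	sketch_reloc_decision(1000, 1030, 64) -> close enough;
+	sketch_reloc_decision(1000, 5000, 64) -> relocate */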
13488 +
13489 +static int
13490 +allocate_znode(znode * node, const coord_t *parent_coord, flush_pos_t *pos)
13491 +{
13492 + /*
13493 + * perform znode allocation with znode pinned in memory to avoid races
13494 + * with asynchronous emergency flush (which plays with
13495 + * JNODE_FLUSH_RESERVED bit).
13496 + */
13497 + return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13498 +}
13499 +
13500 +/* A subroutine of allocate_znode, this is called first to see if there is a
13501 +   close position to relocate to. It may return ENOSPC if there is no close
13502 +   position, in which case the node is not relocated. This takes care of
13503 +   updating the parent node with the relocated block address. */
13504 +static int
13505 +allocate_znode_update(znode * node, const coord_t *parent_coord,
13506 + flush_pos_t *pos)
13507 +{
13508 + int ret;
13509 + reiser4_block_nr blk;
13510 + lock_handle uber_lock;
13511 + int flush_reserved_used = 0;
13512 + int grabbed;
13513 + reiser4_context *ctx;
13514 + reiser4_super_info_data *sbinfo;
13515 +
13516 + init_lh(&uber_lock);
13517 +
13518 + ctx = get_current_context();
13519 + sbinfo = get_super_private(ctx->super);
13520 +
13521 + grabbed = ctx->grabbed_blocks;
13522 +
13523 + /* discard e-flush allocation */
13524 + ret = zload(node);
13525 + if (ret)
13526 + return ret;
13527 +
13528 + if (ZF_ISSET(node, JNODE_CREATED)) {
13529 + assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13530 + pos->preceder.block_stage = BLOCK_UNALLOCATED;
13531 + } else {
13532 + pos->preceder.block_stage = BLOCK_GRABBED;
13533 +
13534 +		/* The disk space for relocating the @node is already reserved
13535 +		 * in the "flush reserved" counter if @node is a leaf;
13536 +		 * otherwise we grab space using BA_RESERVED (which means grab
13537 +		 * space from the whole disk, not from only 95% of it). */
13538 + if (znode_get_level(node) == LEAF_LEVEL) {
13539 + /*
13540 + * earlier (during do_jnode_make_dirty()) we decided
13541 + * that @node can possibly go into overwrite set and
13542 + * reserved block for its wandering location.
13543 + */
13544 + txn_atom *atom = get_current_atom_locked();
13545 + assert("nikita-3449",
13546 + ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13547 + flush_reserved2grabbed(atom, (__u64) 1);
13548 + spin_unlock_atom(atom);
13549 + /*
13550 + * we are trying to move node into relocate
13551 + * set. Allocation of relocated position "uses"
13552 + * reserved block.
13553 + */
13554 + ZF_CLR(node, JNODE_FLUSH_RESERVED);
13555 + flush_reserved_used = 1;
13556 + } else {
13557 + ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13558 + if (ret != 0)
13559 + goto exit;
13560 + }
13561 + }
13562 +
13563 +	/* We may not use the 5% of reserved disk space here, and then flush
13564 +	   will not pack tightly. */
13565 + ret = reiser4_alloc_block(&pos->preceder, &blk,
13566 + BA_FORMATTED | BA_PERMANENT);
13567 + if (ret)
13568 + goto exit;
13569 +
13570 + if (!ZF_ISSET(node, JNODE_CREATED) &&
13571 + (ret =
13572 + reiser4_dealloc_block(znode_get_block(node), 0,
13573 + BA_DEFER | BA_FORMATTED)))
13574 + goto exit;
13575 +
13576 + if (likely(!znode_is_root(node))) {
13577 + item_plugin *iplug;
13578 +
13579 + iplug = item_plugin_by_coord(parent_coord);
13580 + assert("nikita-2954", iplug->f.update != NULL);
13581 + iplug->f.update(parent_coord, &blk);
13582 +
13583 + znode_make_dirty(parent_coord->node);
13584 +
13585 + } else {
13586 + reiser4_tree *tree = znode_get_tree(node);
13587 + znode *uber;
13588 +
13589 + /* We take a longterm lock on the fake node in order to change
13590 + the root block number. This may cause atom fusion. */
13591 + ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13592 + &uber_lock);
13593 +		/* The fake node cannot be deleted, we must have priority
13594 +		   here, and the failure may not be confused with ENOSPC. */
13595 + assert("jmacd-74412",
13596 + ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13597 +
13598 + if (ret)
13599 + goto exit;
13600 +
13601 + uber = uber_lock.node;
13602 +
13603 + write_lock_tree(tree);
13604 + tree->root_block = blk;
13605 + write_unlock_tree(tree);
13606 +
13607 + znode_make_dirty(uber);
13608 + }
13609 +
13610 + ret = znode_rehash(node, &blk);
13611 +exit:
13612 + if (ret) {
13613 +		/* Give the flush reserved block back if something fails,
13614 +		 * because callers assume that on error the block wasn't
13615 +		 * relocated and its flush reserved block wasn't used. */
13616 + if (flush_reserved_used) {
13617 + /*
13618 + * ok, we failed to move node into relocate
13619 + * set. Restore status quo.
13620 + */
13621 + grabbed2flush_reserved((__u64) 1);
13622 + ZF_SET(node, JNODE_FLUSH_RESERVED);
13623 + }
13624 + }
13625 + zrelse(node);
13626 + done_lh(&uber_lock);
13627 + grabbed2free_mark(grabbed);
13628 + return ret;
13629 +}
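+
+/* The relocation protocol of allocate_znode_update() above, reduced to a
+   sketch with stand-in functions (the real calls are reiser4_alloc_block(),
+   reiser4_dealloc_block() with BA_DEFER, the parent item's ->f.update() and
+   znode_rehash()); only the ordering is taken from the code: */
+#include <stdio.h>
+
+typedef unsigned long long sk_blk;
+
+static int sketch_relocate(sk_blk *parent_ptr, sk_blk *node_blk, sk_blk near)
+{
+	sk_blk new_blk = near + 1;	/* pretend the next block is free */
+
+	/* 1. the old location is only defer-freed: it must stay readable
+	      until the atom commits */
+	printf("defer free of block %llu\n", *node_blk);
+	/* 2. point the parent's child pointer at the new location */
+	*parent_ptr = new_blk;
+	/* 3. re-hash the node under its new block number */
+	*node_blk = new_blk;
+	return 0;
+}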
13630 +
13631 +/* JNODE INTERFACE */
13632 +
13633 +/* Lock a node (if formatted) and then get its parent locked, set the child's
13634 + coordinate in the parent. If the child is the root node, the above_root
13635 + znode is returned but the coord is not set. This function may cause atom
13636 + fusion, but it is only used for read locks (at this point) and therefore
13637 + fusion only occurs when the parent is already dirty. */
13638 +/* Hans adds this note: remember to ask how expensive this operation is vs.
13639 + storing parent pointer in jnodes. */
13640 +static int
13641 +jnode_lock_parent_coord(jnode * node,
13642 + coord_t *coord,
13643 + lock_handle * parent_lh,
13644 + load_count * parent_zh,
13645 + znode_lock_mode parent_mode, int try)
13646 +{
13647 + int ret;
13648 +
13649 + assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13650 + assert("edward-54", jnode_is_unformatted(node)
13651 + || znode_is_any_locked(JZNODE(node)));
13652 +
13653 + if (!jnode_is_znode(node)) {
13654 + reiser4_key key;
13655 + tree_level stop_level = TWIG_LEVEL;
13656 + lookup_bias bias = FIND_EXACT;
13657 +
13658 + assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13659 +
13660 +		/* The case when the node is not a znode but can have a parent
13661 +		   coord (an unformatted node, a node representing a cluster
13662 +		   page, etc.). Generate a key for the appropriate entry and
13663 +		   search in the tree using coord_by_key, which handles
13664 +		   locking for us. */
13665 +
13666 +		/*
13667 +		 * nothing is locked at this moment, so nothing prevents a
13668 +		 * concurrent truncate from removing the jnode from its inode.
13669 +		 * To prevent this, spin-lock the jnode. The jnode can still be
13670 +		 * truncated right after the call to jnode_build_key(), but
13671 +		 * this is ok, because coord_by_key() will then just fail to
13672 +		 * find the appropriate extent.
13673 +		 */
13674 + spin_lock_jnode(node);
13675 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13676 + jnode_build_key(node, &key);
13677 + ret = 0;
13678 + } else
13679 + ret = RETERR(-ENOENT);
13680 + spin_unlock_jnode(node);
13681 +
13682 + if (ret != 0)
13683 + return ret;
13684 +
13685 + if (jnode_is_cluster_page(node))
13686 + stop_level = LEAF_LEVEL;
13687 +
13688 + assert("jmacd-1812", coord != NULL);
13689 +
13690 + ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13691 + parent_mode, bias, stop_level, stop_level,
13692 + CBK_UNIQUE, NULL/*ra_info */);
13693 + switch (ret) {
13694 + case CBK_COORD_NOTFOUND:
13695 + assert("edward-1038",
13696 + ergo(jnode_is_cluster_page(node),
13697 + JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13698 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13699 + warning("nikita-3177", "Parent not found");
13700 + return ret;
13701 + case CBK_COORD_FOUND:
13702 + if (coord->between != AT_UNIT) {
13703 + /* FIXME: comment needed */
13704 + done_lh(parent_lh);
13705 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13706 + warning("nikita-3178",
13707 + "Found but not happy: %i",
13708 + coord->between);
13709 + }
13710 + return RETERR(-ENOENT);
13711 + }
13712 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13713 + if (ret != 0)
13714 + return ret;
13715 + /* if (jnode_is_cluster_page(node)) {
13716 + races with write() are possible
13717 + check_child_cluster (parent_lh->node);
13718 + }
13719 + */
13720 + break;
13721 + default:
13722 + return ret;
13723 + }
13724 +
13725 + } else {
13726 + int flags;
13727 + znode *z;
13728 +
13729 + z = JZNODE(node);
13730 + /* Formatted node case: */
13731 + assert("jmacd-2061", !znode_is_root(z));
13732 +
13733 + flags = GN_ALLOW_NOT_CONNECTED;
13734 + if (try)
13735 + flags |= GN_TRY_LOCK;
13736 +
13737 + ret =
13738 + reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13739 + if (ret != 0)
13740 + /* -E_REPEAT is ok here, it is handled by the caller. */
13741 + return ret;
13742 +
13743 + /* Make the child's position "hint" up-to-date. (Unless above
13744 + root, which caller must check.) */
13745 + if (coord != NULL) {
13746 +
13747 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13748 + if (ret != 0) {
13749 + warning("jmacd-976812386",
13750 + "incr_load_count_znode failed: %d",
13751 + ret);
13752 + return ret;
13753 + }
13754 +
13755 + ret = find_child_ptr(parent_lh->node, z, coord);
13756 + if (ret != 0) {
13757 + warning("jmacd-976812",
13758 + "find_child_ptr failed: %d", ret);
13759 + return ret;
13760 + }
13761 + }
13762 + }
13763 +
13764 + return 0;
13765 +}
13766 +
13767 +/* Get the (locked) next neighbor of a znode which is dirty and a member of
13768 +   the same atom. If there is no next neighbor, or the neighbor is not in
13769 +   memory, or there is a neighbor but it is not dirty or not in the same atom,
13770 +   -E_NO_NEIGHBOR is returned. In some cases the slum may include nodes which
13771 +   are not dirty; if so, @check_dirty should be 0 */
13772 +static int neighbor_in_slum(znode * node, /* starting point */
13773 + lock_handle * lock, /* lock on starting point */
13774 + sideof side, /* left or right direction we
13775 + seek the next node in */
13776 + znode_lock_mode mode, /* kind of lock we want */
13777 + int check_dirty, /* true if the neighbor should
13778 + be dirty */
13779 +			    int use_upper_levels /* get neighbor by going
13780 +						    through upper levels */)
13781 +{
13782 + int ret;
13783 + int flags;
13784 +
13785 + assert("jmacd-6334", znode_is_connected(node));
13786 +
13787 + flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
13788 + if (use_upper_levels)
13789 + flags |= GN_CAN_USE_UPPER_LEVELS;
13790 +
13791 + ret = reiser4_get_neighbor(lock, node, mode, flags);
13792 + if (ret) {
13793 + /* May return -ENOENT or -E_NO_NEIGHBOR. */
13794 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13795 + if (ret == -ENOENT)
13796 + ret = RETERR(-E_NO_NEIGHBOR);
13797 + return ret;
13798 + }
13799 + if (!check_dirty)
13800 + return 0;
13801 + /* Check dirty bit of locked znode, no races here */
13802 + if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13803 + return 0;
13804 +
13805 + done_lh(lock);
13806 + return RETERR(-E_NO_NEIGHBOR);
13807 +}
13808 +
13809 +/* Return true if two znodes have the same parent. This is called with both
13810 + nodes write-locked (for squeezing) so no tree lock is needed. */
13811 +static int znode_same_parents(znode * a, znode * b)
13812 +{
13813 + int result;
13814 +
13815 + assert("jmacd-7011", znode_is_write_locked(a));
13816 + assert("jmacd-7012", znode_is_write_locked(b));
13817 +
13818 + /* We lock the whole tree for this check.... I really don't like whole
13819 + * tree locks... -Hans */
13820 + read_lock_tree(znode_get_tree(a));
13821 + result = (znode_parent(a) == znode_parent(b));
13822 + read_unlock_tree(znode_get_tree(a));
13823 + return result;
13824 +}
13825 +
13826 +/* FLUSH SCAN */
13827 +
13828 +/* Initialize the flush_scan data structure. */
13829 +static void scan_init(flush_scan * scan)
13830 +{
13831 + memset(scan, 0, sizeof(*scan));
13832 + init_lh(&scan->node_lock);
13833 + init_lh(&scan->parent_lock);
13834 + init_load_count(&scan->parent_load);
13835 + init_load_count(&scan->node_load);
13836 + coord_init_invalid(&scan->parent_coord, NULL);
13837 +}
13838 +
13839 +/* Release any resources held by the flush scan, e.g. release locks,
13840 + free memory, etc. */
13841 +static void scan_done(flush_scan * scan)
13842 +{
13843 + done_load_count(&scan->node_load);
13844 + if (scan->node != NULL) {
13845 + jput(scan->node);
13846 + scan->node = NULL;
13847 + }
13848 + done_load_count(&scan->parent_load);
13849 + done_lh(&scan->parent_lock);
13850 + done_lh(&scan->node_lock);
13851 +}
13852 +
13853 +/* Returns true if flush scanning is finished. */
13854 +int reiser4_scan_finished(flush_scan * scan)
13855 +{
13856 + return scan->stop || (scan->direction == RIGHT_SIDE &&
13857 + scan->count >= scan->max_count);
13858 +}
13859 +
13860 +/* Return true if the scan should continue to @tonode, i.e. if the node
13861 +   meets the same_slum_check condition. If not, deref @tonode and stop
13862 +   the scan. */
13863 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
13864 +{
13865 + int go = same_slum_check(scan->node, tonode, 1, 0);
13866 +
13867 + if (!go) {
13868 + scan->stop = 1;
13869 + jput(tonode);
13870 + }
13871 +
13872 + return go;
13873 +}
13874 +
13875 +/* Set the current scan->node, refcount it, increment count by the @add_count
13876 + (number to count, e.g., skipped unallocated nodes), deref previous current,
13877 + and copy the current parent coordinate. */
13878 +int
13879 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13880 + const coord_t *parent)
13881 +{
13882 + /* Release the old references, take the new reference. */
13883 + done_load_count(&scan->node_load);
13884 +
13885 + if (scan->node != NULL)
13886 + jput(scan->node);
13887 + scan->node = node;
13888 + scan->count += add_count;
13889 +
13890 + /* This next stmt is somewhat inefficient. The reiser4_scan_extent()
13891 + code could delay this update step until it finishes and update the
13892 + parent_coord only once. It did that before, but there was a bug and
13893 + this was the easiest way to make it correct. */
13894 + if (parent != NULL)
13895 + coord_dup(&scan->parent_coord, parent);
13896 +
13897 + /* Failure may happen at the incr_load_count call, but the caller can
13898 + assume the reference is safely taken. */
13899 + return incr_load_count_jnode(&scan->node_load, node);
13900 +}
13901 +
13902 +/* Return true if scanning in the leftward direction. */
13903 +int reiser4_scanning_left(flush_scan * scan)
13904 +{
13905 + return scan->direction == LEFT_SIDE;
13906 +}
13907 +
13908 +/* Performs leftward scanning starting from either kind of node. Counts the
13909 + starting node. The right-scan object is passed in for the left-scan in order
13910 + to copy the parent of an unformatted starting position. This way we avoid
13911 + searching for the unformatted node's parent when scanning in each direction.
13912 +   If we search for the parent once, it is set in both scan objects. The limit
13913 + parameter tells flush-scan when to stop.
13914 +
13915 +   Rapid scanning is used only during scan_left, where we are interested in
13916 +   finding the 'leftpoint' where we begin flushing. We are interested in
13917 +   stopping at the left child of a twig that does not have a dirty left
13918 +   neighbor. THIS IS A SPECIAL CASE. The problem is finding a way to flush
13919 +   only those nodes without unallocated children, and it is difficult to solve
13920 +   in the bottom-up flushing algorithm we are currently using. The problem can
13921 +   be solved by scanning left at every level as we go upward, but this would
13922 +   basically bring us back to using a top-down allocation strategy, which we
13923 +   already tried (see BK history from May 2002) and which has a different set
13924 +   of problems. The top-down strategy makes avoiding unallocated children
13925 +   easier, but makes it difficult to properly flush dirty children with clean
13926 +   parents that would otherwise stop the top-down flush, only later to dirty
13927 +   the parent once the children are flushed. So we solve the problem in the
13928 +   bottom-up algorithm with a special case for twigs and leaves only.
13929 +
13930 + The first step in solving the problem is this rapid leftward scan. After we
13931 + determine that there are at least enough nodes counted to qualify for
13932 + FLUSH_RELOCATE_THRESHOLD we are no longer interested in the exact count, we
13933 + are only interested in finding the best place to start the flush.
13934 +
13935 + We could choose one of two possibilities:
13936 +
13937 + 1. Stop at the leftmost child (of a twig) that does not have a dirty left
13938 + neighbor. This requires checking one leaf per rapid-scan twig
13939 +
13940 + 2. Stop at the leftmost child (of a twig) where there are no dirty children
13941 + of the twig to the left. This requires checking possibly all of the in-memory
13942 + children of each twig during the rapid scan.
13943 +
13944 + For now we implement the first policy.
13945 +*/
13946 +static int
13947 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13948 +{
13949 + int ret = 0;
13950 +
13951 + scan->max_count = limit;
13952 + scan->direction = LEFT_SIDE;
13953 +
13954 + ret = scan_set_current(scan, jref(node), 1, NULL);
13955 + if (ret != 0)
13956 + return ret;
13957 +
13958 + ret = scan_common(scan, right);
13959 + if (ret != 0)
13960 + return ret;
13961 +
13962 + /* Before rapid scanning, we need a lock on scan->node so that we can
13963 + get its parent, only if formatted. */
13964 + if (jnode_is_znode(scan->node)) {
13965 + ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13966 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13967 + }
13968 +
13969 + /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD)
13970 + */
13971 + return ret;
13972 +}
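+
+/* A userspace sketch of the leftward chaining that scan_left() ultimately
+   performs through scan_formatted(): follow left-sibling links, counting
+   dirty nodes of the same atom, and stop at the slum edge or at the limit.
+   The node type and fields are hypothetical stand-ins. */
+struct sk_scan_node { struct sk_scan_node *left; int dirty; int atom; };
+
+static unsigned sketch_scan_left(struct sk_scan_node *n, int atom,
+				 unsigned limit)
+{
+	unsigned count = 1;	/* the starting node is counted */
+
+	while (count < limit && n->left != NULL &&
+	       n->left->dirty && n->left->atom == atom) {
+		n = n->left;
+		count++;
+	}
+	return count;
+}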
13973 +
13974 +/* Performs rightward scanning... Does not count the starting node. The limit
13975 + parameter is described in scan_left. If the starting node is unformatted then
13976 + the parent_coord was already set during scan_left. The rapid_after parameter
13977 + is not used during right-scanning.
13978 +
13979 + scan_right is only called if the scan_left operation does not count at least
13980 + FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter
13981 + is set to the difference between scan-left's count and
13982 + FLUSH_RELOCATE_THRESHOLD, meaning scan-right counts as high as
13983 + FLUSH_RELOCATE_THRESHOLD and then stops. */
13984 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13985 +{
13986 + int ret;
13987 +
13988 + scan->max_count = limit;
13989 + scan->direction = RIGHT_SIDE;
13990 +
13991 + ret = scan_set_current(scan, jref(node), 0, NULL);
13992 + if (ret != 0)
13993 + return ret;
13994 +
13995 + return scan_common(scan, NULL);
13996 +}
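+
+/* The scan budget split described above, as a one-line sketch: the right
+   scan only runs if the left scan came up short of the threshold, and then
+   only for the remainder (the threshold value is a made-up stand-in for
+   FLUSH_RELOCATE_THRESHOLD). */
+static unsigned sketch_right_scan_limit(unsigned counted_left,
+					unsigned threshold)
+{
+	return counted_left >= threshold ? 0 : threshold - counted_left;
+}
+/* e.g. sketch_right_scan_limit(20, 32) == 12: scan right for at most 12 */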
13997 +
13998 +/* Common code to perform left or right scanning. */
13999 +static int scan_common(flush_scan * scan, flush_scan * other)
14000 +{
14001 + int ret;
14002 +
14003 + assert("nikita-2376", scan->node != NULL);
14004 + assert("edward-54", jnode_is_unformatted(scan->node)
14005 + || jnode_is_znode(scan->node));
14006 +
14007 + /* Special case for starting at an unformatted node. Optimization: we
14008 + only want to search for the parent (which requires a tree traversal)
14009 + once. Obviously, we shouldn't have to call it once for the left scan
14010 + and once for the right scan. For this reason, if we search for the
14011 + parent during scan-left we then duplicate the coord/lock/load into
14012 + the scan-right object. */
14013 + if (jnode_is_unformatted(scan->node)) {
14014 + ret = scan_unformatted(scan, other);
14015 + if (ret != 0)
14016 + return ret;
14017 + }
14018 + /* This loop expects to start at a formatted position and performs
14019 + chaining of formatted regions */
14020 + while (!reiser4_scan_finished(scan)) {
14021 +
14022 + ret = scan_formatted(scan);
14023 + if (ret != 0)
14024 + return ret;
14025 + }
14026 +
14027 + return 0;
14028 +}
14029 +
14030 +static int scan_unformatted(flush_scan * scan, flush_scan * other)
14031 +{
14032 + int ret = 0;
14033 + int try = 0;
14034 +
14035 + if (!coord_is_invalid(&scan->parent_coord))
14036 + goto scan;
14037 +
14038 + /* set parent coord from */
14039 + if (!jnode_is_unformatted(scan->node)) {
14040 + /* formatted position */
14041 +
14042 + lock_handle lock;
14043 + assert("edward-301", jnode_is_znode(scan->node));
14044 + init_lh(&lock);
14045 +
14046 + /*
14047 +		 * when flush starts from an unformatted node, the first
14048 +		 * thing it does is a tree traversal to find the formatted
14049 +		 * parent of the starting node. This parent is then kept
14050 +		 * locked across the scans to the left and to the right. This
14051 +		 * means that during the scan to the left we cannot take a
14052 +		 * left-ward lock, because this is deadlock prone. So, if we
14053 +		 * are scanning to the left and there is already a lock held
14054 +		 * by this thread, jnode_lock_parent_coord() should use
14055 +		 * try-lock.
14056 + try = reiser4_scanning_left(scan)
14057 + && !lock_stack_isclean(get_current_lock_stack());
14058 +		/* Need the node locked to get the parent lock. We have to
14059 +		   take a write lock since there is at least one call path
14060 +		   where this znode is already write-locked by us. */
14061 + ret =
14062 + longterm_lock_znode(&lock, JZNODE(scan->node),
14063 + ZNODE_WRITE_LOCK,
14064 + reiser4_scanning_left(scan) ?
14065 + ZNODE_LOCK_LOPRI :
14066 + ZNODE_LOCK_HIPRI);
14067 + if (ret != 0)
14068 + /* EINVAL or E_DEADLOCK here mean... try again! At this
14069 + point we've scanned too far and can't back out, just
14070 + start over. */
14071 + return ret;
14072 +
14073 + ret = jnode_lock_parent_coord(scan->node,
14074 + &scan->parent_coord,
14075 + &scan->parent_lock,
14076 + &scan->parent_load,
14077 + ZNODE_WRITE_LOCK, try);
14078 +
14079 + /* FIXME(C): check EINVAL, E_DEADLOCK */
14080 + done_lh(&lock);
14081 + if (ret == -E_REPEAT) {
14082 + scan->stop = 1;
14083 + return 0;
14084 + }
14085 + if (ret)
14086 + return ret;
14087 +
14088 + } else {
14089 + /* unformatted position */
14090 +
14091 + ret =
14092 + jnode_lock_parent_coord(scan->node, &scan->parent_coord,
14093 + &scan->parent_lock,
14094 + &scan->parent_load,
14095 + ZNODE_WRITE_LOCK, try);
14096 +
14097 + if (IS_CBKERR(ret))
14098 + return ret;
14099 +
14100 + if (ret == CBK_COORD_NOTFOUND)
14101 + /* FIXME(C): check EINVAL, E_DEADLOCK */
14102 + return ret;
14103 +
14104 + /* parent was found */
14105 + assert("jmacd-8661", other != NULL);
14106 + /* Duplicate the reference into the other flush_scan. */
14107 + coord_dup(&other->parent_coord, &scan->parent_coord);
14108 + copy_lh(&other->parent_lock, &scan->parent_lock);
14109 + copy_load_count(&other->parent_load, &scan->parent_load);
14110 + }
14111 +scan:
14112 + return scan_by_coord(scan);
14113 +}
14114 +
14115 +/* Performs left- or rightward scanning starting from a formatted node. Follow
14116 + left pointers under tree lock as long as:
14117 +
14118 + - node->left/right is non-NULL
14119 + - node->left/right is connected, dirty
14120 + - node->left/right belongs to the same atom
14121 + - scan has not reached maximum count
14122 +*/
14123 +static int scan_formatted(flush_scan * scan)
14124 +{
14125 + int ret;
14126 + znode *neighbor = NULL;
14127 +
14128 + assert("jmacd-1401", !reiser4_scan_finished(scan));
14129 +
14130 + do {
14131 + znode *node = JZNODE(scan->node);
14132 +
14133 + /* Node should be connected, but if not stop the scan. */
14134 + if (!znode_is_connected(node)) {
14135 + scan->stop = 1;
14136 + break;
14137 + }
14138 +
14139 +		/* Lock the tree; check for and reference the next sibling. */
14140 + read_lock_tree(znode_get_tree(node));
14141 +
14142 + /* It may be that a node is inserted or removed between a node
14143 + and its left sibling while the tree lock is released, but the
14144 + flush-scan count does not need to be precise. Thus, we
14145 + release the tree lock as soon as we get the neighboring node.
14146 + */
14147 + neighbor =
14148 + reiser4_scanning_left(scan) ? node->left : node->right;
14149 + if (neighbor != NULL)
14150 + zref(neighbor);
14151 +
14152 + read_unlock_tree(znode_get_tree(node));
14153 +
14154 +		/* If neighbor is NULL at the leaf level, we need to check for
14155 +		   an unformatted sibling using the parent; break in any case. */
14156 + if (neighbor == NULL)
14157 + break;
14158 +
14159 + /* Check the condition for going left, break if it is not met.
14160 + This also releases (jputs) the neighbor if false. */
14161 + if (!reiser4_scan_goto(scan, ZJNODE(neighbor)))
14162 + break;
14163 +
14164 + /* Advance the flush_scan state to the left, repeat. */
14165 + ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
14166 + if (ret != 0)
14167 + return ret;
14168 +
14169 + } while (!reiser4_scan_finished(scan));
14170 +
14171 +	/* If neighbor is NULL then we reached the end of a formatted region,
14172 +	   or else the sibling is out of memory; now check for an extent to
14173 +	   the left (as long as we are at LEAF_LEVEL). */
14174 + if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
14175 + || reiser4_scan_finished(scan)) {
14176 + scan->stop = 1;
14177 + return 0;
14178 + }
14179 + /* Otherwise, calls scan_by_coord for the right(left)most item of the
14180 + left(right) neighbor on the parent level, then possibly continue. */
14181 +
14182 + coord_init_invalid(&scan->parent_coord, NULL);
14183 + return scan_unformatted(scan, NULL);
14184 +}
14185 +
14186 +/* NOTE-EDWARD:
14187 + This scans adjacent items of the same type and calls scan flush plugin for
14188 + each one. Performs left(right)ward scanning starting from a (possibly)
14189 +   unformatted node. If we start from an unformatted node, then we continue
14190 +   only if the next neighbor is also unformatted. When called from
14191 +   scan_formatted, we skip the first iteration (to make sure that the
14192 +   right(left)most item of the left(right) neighbor on the parent level is of
14193 +   the same type, and to set the appropriate coord). */
14194 +static int scan_by_coord(flush_scan * scan)
14195 +{
14196 + int ret = 0;
14197 + int scan_this_coord;
14198 + lock_handle next_lock;
14199 + load_count next_load;
14200 + coord_t next_coord;
14201 + jnode *child;
14202 + item_plugin *iplug;
14203 +
14204 + init_lh(&next_lock);
14205 + init_load_count(&next_load);
14206 + scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
14207 +
14208 + /* set initial item id */
14209 + iplug = item_plugin_by_coord(&scan->parent_coord);
14210 +
14211 + for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
14212 + if (scan_this_coord) {
14213 +			/* Here we expect the unit to be scannable. It might
14214 +			 * not be, due to a race with extent->tail conversion. */
14215 + if (iplug->f.scan == NULL) {
14216 + scan->stop = 1;
14217 + ret = -E_REPEAT;
14218 + /* skip the check at the end. */
14219 + goto race;
14220 + }
14221 +
14222 + ret = iplug->f.scan(scan);
14223 + if (ret != 0)
14224 + goto exit;
14225 +
14226 + if (reiser4_scan_finished(scan)) {
14227 + checkchild(scan);
14228 + break;
14229 + }
14230 + } else {
14231 + /* the same race against truncate as above is possible
14232 + * here, it seems */
14233 +
14234 + /* NOTE-JMACD: In this case, apply the same end-of-node
14235 + logic but don't scan the first coordinate. */
14236 + assert("jmacd-1231",
14237 + item_is_internal(&scan->parent_coord));
14238 + }
14239 +
14240 + if (iplug->f.utmost_child == NULL
14241 + || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
14242 +			/* stop this coord and continue on the parent level */
14243 +			ret = scan_set_current(scan,
14244 +				       ZJNODE(zref(scan->parent_coord.node)),
14245 +				       1, NULL);
14248 + if (ret != 0)
14249 + goto exit;
14250 + break;
14251 + }
14252 +
14253 + /* Either way, the invariant is that scan->parent_coord is set
14254 + to the parent of scan->node. Now get the next unit. */
14255 + coord_dup(&next_coord, &scan->parent_coord);
14256 + coord_sideof_unit(&next_coord, scan->direction);
14257 +
14258 + /* If off-the-end of the twig, try the next twig. */
14259 + if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
14260 + /* We take the write lock because we may start flushing
14261 + * from this coordinate. */
14262 + ret = neighbor_in_slum(next_coord.node,
14263 + &next_lock,
14264 + scan->direction,
14265 + ZNODE_WRITE_LOCK,
14266 + 1 /* check dirty */,
14267 +					       0 /* don't go through upper
14268 +						    levels */);
14269 + if (ret == -E_NO_NEIGHBOR) {
14270 + scan->stop = 1;
14271 + ret = 0;
14272 + break;
14273 + }
14274 +
14275 + if (ret != 0)
14276 + goto exit;
14277 +
14278 + ret = incr_load_count_znode(&next_load, next_lock.node);
14279 + if (ret != 0)
14280 + goto exit;
14281 +
14282 + coord_init_sideof_unit(&next_coord, next_lock.node,
14283 + sideof_reverse(scan->direction));
14284 + }
14285 +
14286 + iplug = item_plugin_by_coord(&next_coord);
14287 +
14288 + /* Get the next child. */
14289 + ret =
14290 + iplug->f.utmost_child(&next_coord,
14291 + sideof_reverse(scan->direction),
14292 + &child);
14293 + if (ret != 0)
14294 + goto exit;
14295 + /* If the next child is not in memory, or item_utmost_child
14296 + failed (most probably due to a race with unlink), stop
14297 + here. */
14298 + if (child == NULL || IS_ERR(child)) {
14299 + scan->stop = 1;
14300 + checkchild(scan);
14301 + break;
14302 + }
14303 +
14304 + assert("nikita-2374", jnode_is_unformatted(child)
14305 + || jnode_is_znode(child));
14306 +
14307 + /* See if it is dirty and part of the same atom. */
14308 + if (!reiser4_scan_goto(scan, child)) {
14309 + checkchild(scan);
14310 + break;
14311 + }
14312 +
14313 + /* If so, make this child current. */
14314 + ret = scan_set_current(scan, child, 1, &next_coord);
14315 + if (ret != 0)
14316 + goto exit;
14317 +
14318 + /* Now continue. If the child is formatted we release the
14319 + parent lock and break out to proceed on the formatted level. */
14320 + if (jnode_is_znode(child))
14321 + break;
14322 +
14323 + /* Otherwise, repeat the above loop with next_coord. */
14324 + if (next_load.node != NULL) {
14325 + done_lh(&scan->parent_lock);
14326 + move_lh(&scan->parent_lock, &next_lock);
14327 + move_load_count(&scan->parent_load, &next_load);
14328 + }
14329 + }
14330 +
14331 + assert("jmacd-6233",
14332 + reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
14333 +exit:
14334 + checkchild(scan);
14335 +race: /* skip the above check */
14336 + if (jnode_is_znode(scan->node)) {
14337 + done_lh(&scan->parent_lock);
14338 + done_load_count(&scan->parent_load);
14339 + }
14340 +
14341 + done_load_count(&next_load);
14342 + done_lh(&next_lock);
14343 + return ret;
14344 +}
14345 +
14346 +/* FLUSH POS HELPERS */
14347 +
14348 +/* Initialize the fields of a flush_position. */
14349 +static void pos_init(flush_pos_t *pos)
14350 +{
14351 + memset(pos, 0, sizeof *pos);
14352 +
14353 + pos->state = POS_INVALID;
14354 + coord_init_invalid(&pos->coord, NULL);
14355 + init_lh(&pos->lock);
14356 + init_load_count(&pos->load);
14357 +
14358 + reiser4_blocknr_hint_init(&pos->preceder);
14359 +}
14360 +
14361 +/* The flush loop inside squalloc periodically checks pos_valid to determine
14362 + when "enough flushing" has been performed. This will return true until one
14363 + of the following conditions is met:
14364 +
14365 + 1. the number of flush-queued nodes has reached the kernel-supplied
14366 + "int *nr_to_flush" parameter, meaning we have flushed as many blocks as the
14367 + kernel requested. When flushing to commit, this parameter is NULL.
14368 +
14369 + 2. pos_stop() is called because squalloc discovers that the "next" node in
14370 + the flush order is either non-existent, not dirty, or not in the same atom.
14371 +*/
14372 +
14373 +static int pos_valid(flush_pos_t *pos)
14374 +{
14375 + return pos->state != POS_INVALID;
14376 +}
14377 +
14378 +/* Release any resources of a flush_position. Called when jnode_flush
14379 + finishes. */
14380 +static void pos_done(flush_pos_t *pos)
14381 +{
14382 + pos_stop(pos);
14383 + reiser4_blocknr_hint_done(&pos->preceder);
14384 + if (convert_data(pos))
14385 + free_convert_data(pos);
14386 +}
14387 +
14388 +/* Reset the point and parent. Called during flush subroutines to terminate the
14389 + squalloc loop. */
14390 +static int pos_stop(flush_pos_t *pos)
14391 +{
14392 + pos->state = POS_INVALID;
14393 + done_lh(&pos->lock);
14394 + done_load_count(&pos->load);
14395 + coord_init_invalid(&pos->coord, NULL);
14396 +
14397 + if (pos->child) {
14398 + jput(pos->child);
14399 + pos->child = NULL;
14400 + }
14401 +
14402 + return 0;
14403 +}
14404 +
14405 +/* Return the flush_position's block allocator hint. */
14406 +reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos)
14407 +{
14408 + return &pos->preceder;
14409 +}
14410 +
14411 +flush_queue_t *reiser4_pos_fq(flush_pos_t *pos)
14412 +{
14413 + return pos->fq;
14414 +}
14415 +
14416 +/* Make Linus happy.
14417 + Local variables:
14418 + c-indentation-style: "K&R"
14419 + mode-name: "LC"
14420 + c-basic-offset: 8
14421 + tab-width: 8
14422 + fill-column: 90
14423 + LocalWords: preceder
14424 + End:
14425 +*/
14426 diff -urN linux-2.6.33.orig/fs/reiser4/flush.h linux-2.6.33/fs/reiser4/flush.h
14427 --- linux-2.6.33.orig/fs/reiser4/flush.h 1970-01-01 01:00:00.000000000 +0100
14428 +++ linux-2.6.33/fs/reiser4/flush.h 2010-03-04 19:33:22.000000000 +0100
14429 @@ -0,0 +1,300 @@
14430 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14431 +
14432 +/* DECLARATIONS: */
14433 +
14434 +#if !defined(__REISER4_FLUSH_H__)
14435 +#define __REISER4_FLUSH_H__
14436 +
14437 +#include "plugin/cluster.h"
14438 +
14439 +/* The flush_scan data structure maintains the state of an in-progress
14440 + flush-scan on a single level of the tree. A flush-scan is used for counting
14441 + the number of adjacent nodes to flush, which is used to determine whether we
14442 + should relocate, and it is also used to find a starting point for flush. A
14443 + flush-scan object can scan in both right and left directions via the
14444 + scan_left() and scan_right() interfaces. The right- and left-variations are
14445 + similar but perform different functions. When scanning left we (optionally
14446 + perform rapid scanning and then) longterm-lock the endpoint node. When
14447 + scanning right we are simply counting the number of adjacent, dirty nodes. */
14448 +struct flush_scan {
14449 +
14450 + /* The current number of nodes scanned on this level. */
14451 + unsigned count;
14452 +
14453 + /* There may be a maximum number of nodes for a scan on any single
14454 + level. When going leftward, max_count is determined by
14455 + FLUSH_SCAN_MAXNODES (see reiser4.h) */
14456 + unsigned max_count;
14457 +
14458 + /* Direction: Set to one of the sideof enumeration:
14459 + { LEFT_SIDE, RIGHT_SIDE }. */
14460 + sideof direction;
14461 +
14462 + /* Initially @stop is set to false, then set to true once some condition
14463 + stops the search (e.g., we found a clean node before reaching
14464 + max_count or we found a node belonging to another atom). */
14465 + int stop;
14466 +
14467 + /* The current scan position. If @node is non-NULL then its reference
14468 + count has been incremented to reflect this reference. */
14469 + jnode *node;
14470 +
14471 + /* A handle for zload/zrelse of current scan position node. */
14472 + load_count node_load;
14473 +
14474 + /* During left-scan, if the final position (a.k.a. endpoint node) is
14475 + formatted the node is locked using this lock handle. The endpoint
14476 + needs to be locked for transfer to the flush_position object after
14477 + scanning finishes. */
14478 + lock_handle node_lock;
14479 +
14480 + /* When the position is unformatted, its parent, coordinate, and parent
14481 + zload/zrelse handle. */
14482 + lock_handle parent_lock;
14483 + coord_t parent_coord;
14484 + load_count parent_load;
14485 +
14486 + /* The block allocator preceder hint. Sometimes flush_scan determines
14487 + what the preceder is and if so it sets it here, after which it is
14488 + copied into the flush_position. Otherwise, the preceder is computed
14489 + later. */
14490 + reiser4_block_nr preceder_blk;
14491 +};
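/*
 * Editor's sketch (not part of the original patch): how a flush_scan is
 * typically driven. scan_init(), scan_done() and scan_left() are assumed
 * names here -- only reiser4_scan_finished() and scan_set_current() are
 * declared in this header; the real entry points live in flush.c.
 */
static int count_dirty_left_sketch(jnode *start, unsigned *nr_dirty)
{
	flush_scan scan;
	int ret;

	scan_init(&scan);               /* assumed initializer */
	ret = scan_left(&scan, start);  /* longterm-locks the left endpoint */
	if (ret == 0)
		*nr_dirty = scan.count; /* adjacent dirty nodes counted */
	scan_done(&scan);               /* assumed finalizer */
	return ret;
}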
14492 +
14493 +struct convert_item_info {
14494 + dc_item_stat d_cur; /* disk cluster state of the current item */
14495 + dc_item_stat d_next; /* disk cluster state of the next slum item */
14496 + int cluster_shift; /* disk cluster shift */
14497 + flow_t flow; /* disk cluster data */
14498 +};
14499 +
14500 +struct convert_info {
14501 + int count; /* for squalloc terminating */
14502 + item_plugin *iplug; /* current item plugin */
14503 + struct convert_item_info *itm; /* current item info */
14504 + struct cluster_handle clust; /* transform cluster */
14505 +};
14506 +
14507 +typedef enum flush_position_state {
14508 + POS_INVALID, /* Invalid or stopped pos, do not continue slum
14509 + * processing */
14510 + POS_ON_LEAF, /* pos points to already prepped, locked
14511 + * formatted node at leaf level */
14512 + POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field
14513 + * is used to traverse unformatted nodes */
14514 + POS_TO_LEAF, /* pos is being moved to leaf level */
14515 + POS_TO_TWIG, /* pos is being moved to twig level */
14516 + POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is
14517 + * after rightmost unit of the current twig */
14518 + POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal
14519 + * node */
14520 +} flushpos_state_t;
14521 +
14522 +/* An encapsulation of the current flush point and all the parameters that are
14523 + passed through the entire squeeze-and-allocate stage of the flush routine.
14524 + A single flush_position object is constructed after left- and right-scanning
14525 + finishes. */
14526 +struct flush_position {
14527 + flushpos_state_t state;
14528 +
14529 + coord_t coord; /* coord to traverse unformatted nodes */
14530 + lock_handle lock; /* current lock we hold */
14531 + load_count load; /* load status for current locked formatted node
14532 + */
14533 + jnode *child; /* for passing a reference to unformatted child
14534 + * across pos state changes */
14535 +
14536 + reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14537 + int leaf_relocate; /* True if enough leaf-level nodes were
14538 + * found to suggest a relocate policy. */
14539 + int alloc_cnt; /* The number of nodes allocated during squeeze
14540 + and allocate. */
14541 + int prep_or_free_cnt; /* The number of nodes prepared for write
14542 + (allocate) or squeezed and freed. */
14543 + flush_queue_t *fq;
14544 + long *nr_written; /* number of nodes submitted to disk */
14545 + int flags; /* a copy of jnode_flush flags argument */
14546 +
14547 + znode *prev_twig; /* previous parent pointer value, used to catch
14548 + * processing of new twig node */
14549 + struct convert_info *sq; /* convert info */
14550 +
14551 + unsigned long pos_in_unit; /* for extents only. Position
14552 + within an extent unit of first
14553 + jnode of slum */
14554 + long nr_to_write; /* number of unformatted nodes to handle on
14555 + flush */
14556 +};
14557 +
14558 +static inline int item_convert_count(flush_pos_t *pos)
14559 +{
14560 + return pos->sq->count;
14561 +}
14562 +static inline void inc_item_convert_count(flush_pos_t *pos)
14563 +{
14564 + pos->sq->count++;
14565 +}
14566 +static inline void set_item_convert_count(flush_pos_t *pos, int count)
14567 +{
14568 + pos->sq->count = count;
14569 +}
14570 +static inline item_plugin *item_convert_plug(flush_pos_t *pos)
14571 +{
14572 + return pos->sq->iplug;
14573 +}
14574 +
14575 +static inline struct convert_info *convert_data(flush_pos_t *pos)
14576 +{
14577 + return pos->sq;
14578 +}
14579 +
14580 +static inline struct convert_item_info *item_convert_data(flush_pos_t *pos)
14581 +{
14582 + assert("edward-955", convert_data(pos));
14583 + return pos->sq->itm;
14584 +}
14585 +
14586 +static inline struct tfm_cluster *tfm_cluster_sq(flush_pos_t *pos)
14587 +{
14588 + return &pos->sq->clust.tc;
14589 +}
14590 +
14591 +static inline struct tfm_stream *tfm_stream_sq(flush_pos_t *pos,
14592 + tfm_stream_id id)
14593 +{
14594 + assert("edward-854", pos->sq != NULL);
14595 + return get_tfm_stream(tfm_cluster_sq(pos), id);
14596 +}
14597 +
14598 +static inline int chaining_data_present(flush_pos_t *pos)
14599 +{
14600 + return convert_data(pos) && item_convert_data(pos);
14601 +}
14602 +
14603 +/* Returns true if the next node contains the next item of the disk cluster,
14604 + so the item convert data should be moved to the right slum neighbor.
14605 +*/
14606 +static inline int should_chain_next_node(flush_pos_t *pos)
14607 +{
14608 + int result = 0;
14609 +
14610 + assert("edward-1007", chaining_data_present(pos));
14611 +
14612 + switch (item_convert_data(pos)->d_next) {
14613 + case DC_CHAINED_ITEM:
14614 + result = 1;
14615 + break;
14616 + case DC_AFTER_CLUSTER:
14617 + break;
14618 + default:
14619 + impossible("edward-1009", "bad state of next slum item");
14620 + }
14621 + return result;
14622 +}
14623 +
14624 +/* update item state in a disk cluster to assign conversion mode */
14625 +static inline void
14626 +move_chaining_data(flush_pos_t *pos, int this_node/* where is next item */)
14627 +{
14628 +
14629 + assert("edward-1010", chaining_data_present(pos));
14630 +
14631 + if (this_node == 0) {
14632 + /* next item is on the right neighbor */
14633 + assert("edward-1011",
14634 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14635 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14636 + assert("edward-1012",
14637 + item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14638 +
14639 + item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14640 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14641 + } else {
14642 + /* next item is on the same node */
14643 + assert("edward-1013",
14644 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14645 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14646 + assert("edward-1227",
14647 + item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14648 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
14649 +
14650 + item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14651 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14652 + }
14653 +}
14654 +
14655 +static inline int should_convert_node(flush_pos_t *pos, znode * node)
14656 +{
14657 + return znode_convertible(node);
14658 +}
14659 +
14660 +/* true if there is attached convert item info */
14661 +static inline int should_convert_next_node(flush_pos_t *pos)
14662 +{
14663 + return convert_data(pos) && item_convert_data(pos);
14664 +}
14665 +
14666 +#define SQUALLOC_THRESHOLD 256
14667 +
14668 +static inline int should_terminate_squalloc(flush_pos_t *pos)
14669 +{
14670 + return convert_data(pos) &&
14671 + !item_convert_data(pos) &&
14672 + item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14673 +}
14674 +
14675 +#if REISER4_DEBUG
14676 +#define check_convert_info(pos) \
14677 +do { \
14678 + if (unlikely(should_convert_next_node(pos))) { \
14679 + warning("edward-1006", "unprocessed chained data"); \
14680 + printk("d_cur = %d, d_next = %d, flow.len = %llu\n", \
14681 + item_convert_data(pos)->d_cur, \
14682 + item_convert_data(pos)->d_next, \
14683 + item_convert_data(pos)->flow.length); \
14684 + } \
14685 +} while (0)
14686 +#else
14687 +#define check_convert_info(pos)
14688 +#endif /* REISER4_DEBUG */
14689 +
14690 +void free_convert_data(flush_pos_t *pos);
14691 +/* used in extent.c */
14692 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14693 + const coord_t *parent);
14694 +int reiser4_scan_finished(flush_scan * scan);
14695 +int reiser4_scanning_left(flush_scan * scan);
14696 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14697 +txn_atom *atom_locked_by_fq(flush_queue_t *fq);
14698 +int reiser4_alloc_extent(flush_pos_t *flush_pos);
14699 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14700 + reiser4_key *stop_key);
14701 +extern int reiser4_init_fqs(void);
14702 +extern void reiser4_done_fqs(void);
14703 +
14704 +#if REISER4_DEBUG
14705 +
14706 +extern void reiser4_check_fq(const txn_atom *atom);
14707 +extern atomic_t flush_cnt;
14708 +
14709 +#define check_preceder(blk) \
14710 +assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14711 +extern void check_pos(flush_pos_t *pos);
14712 +#else
14713 +#define check_preceder(b) noop
14714 +#define check_pos(pos) noop
14715 +#endif
14716 +
14717 +/* __REISER4_FLUSH_H__ */
14718 +#endif
14719 +
14720 +/* Make Linus happy.
14721 + Local variables:
14722 + c-indentation-style: "K&R"
14723 + mode-name: "LC"
14724 + c-basic-offset: 8
14725 + tab-width: 8
14726 + fill-column: 90
14727 + LocalWords: preceder
14728 + End:
14729 +*/
14730 diff -urN linux-2.6.33.orig/fs/reiser4/flush_queue.c linux-2.6.33/fs/reiser4/flush_queue.c
14731 --- linux-2.6.33.orig/fs/reiser4/flush_queue.c 1970-01-01 01:00:00.000000000 +0100
14732 +++ linux-2.6.33/fs/reiser4/flush_queue.c 2010-03-04 19:33:22.000000000 +0100
14733 @@ -0,0 +1,678 @@
14734 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
14735 + reiser4/README */
14736 +
14737 +#include "debug.h"
14738 +#include "super.h"
14739 +#include "txnmgr.h"
14740 +#include "jnode.h"
14741 +#include "znode.h"
14742 +#include "page_cache.h"
14743 +#include "wander.h"
14744 +#include "vfs_ops.h"
14745 +#include "writeout.h"
14746 +#include "flush.h"
14747 +
14748 +#include <linux/bio.h>
14749 +#include <linux/mm.h>
14750 +#include <linux/pagemap.h>
14751 +#include <linux/blkdev.h>
14752 +#include <linux/writeback.h>
14753 +
14754 +/* A flush queue object is an accumulator for keeping jnodes prepared
14755 + by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14756 + kept on the flush queue until memory pressure or atom commit asks
14757 + flush queues to write some or all of their jnodes. */
14758 +
14759 +/*
14760 + LOCKING:
14761 +
14762 + The fq->guard spin lock protects the fq->atom pointer and nothing else.
14763 + The fq->prepped list is protected by the atom spin lock and uses the
14764 + following locking:
14765 +
14766 + two ways to protect fq->prepped list for read-only list traversal:
14767 +
14768 + 1. the atom is spin-locked.
14769 + 2. fq is IN_USE, atom->nr_running_queues increased.
14770 +
14771 + and one for list modification:
14772 +
14773 + 1. atom is spin-locked and one condition is true: fq is IN_USE or
14774 + atom->nr_running_queues == 0.
14775 +
14776 + The deadlock-safe order for flush queues and atoms is: first lock atom, then
14777 + lock flush queue, then lock jnode.
14778 +*/
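/*
 * Editor's illustration of the deadlock-safe ordering stated above: first
 * the atom, then the flush queue, then the jnode. A minimal sketch using
 * the locking helpers that appear elsewhere in this file.
 */
static inline void lock_in_safe_order_sketch(txn_atom *atom,
					     flush_queue_t *fq, jnode *node)
{
	spin_lock_atom(atom);		/* 1: atom first */
	spin_lock(&fq->guard);		/* 2: then the flush queue */
	spin_lock_jnode(node);		/* 3: then the jnode */

	/* ... safely modify the fq->prepped list here ... */

	spin_unlock_jnode(node);
	spin_unlock(&fq->guard);
	spin_unlock_atom(atom);
}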
14779 +
14780 +#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14781 +#define fq_ready(fq) (!fq_in_use(fq))
14782 +
14783 +#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14784 +#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14785 +
14786 +/* get lock on atom from locked flush queue object */
14787 +static txn_atom *atom_locked_by_fq_nolock(flush_queue_t *fq)
14788 +{
14789 + /* This code is similar to jnode_get_atom(), look at it for the
14790 + * explanation. */
14791 + txn_atom *atom;
14792 +
14793 + assert_spin_locked(&(fq->guard));
14794 +
14795 + while (1) {
14796 + atom = fq->atom;
14797 + if (atom == NULL)
14798 + break;
14799 +
14800 + if (spin_trylock_atom(atom))
14801 + break;
14802 +
14803 + atomic_inc(&atom->refcount);
14804 + spin_unlock(&(fq->guard));
14805 + spin_lock_atom(atom);
14806 + spin_lock(&(fq->guard));
14807 +
14808 + if (fq->atom == atom) {
14809 + atomic_dec(&atom->refcount);
14810 + break;
14811 + }
14812 +
14813 + spin_unlock(&(fq->guard));
14814 + atom_dec_and_unlock(atom);
14815 + spin_lock(&(fq->guard));
14816 + }
14817 +
14818 + return atom;
14819 +}
14820 +
14821 +txn_atom *atom_locked_by_fq(flush_queue_t *fq)
14822 +{
14823 + txn_atom *atom;
14824 +
14825 + spin_lock(&(fq->guard));
14826 + atom = atom_locked_by_fq_nolock(fq);
14827 + spin_unlock(&(fq->guard));
14828 + return atom;
14829 +}
14830 +
14831 +static void init_fq(flush_queue_t *fq)
14832 +{
14833 + memset(fq, 0, sizeof *fq);
14834 +
14835 + atomic_set(&fq->nr_submitted, 0);
14836 +
14837 + INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14838 +
14839 + init_waitqueue_head(&fq->wait);
14840 + spin_lock_init(&fq->guard);
14841 +}
14842 +
14843 +/* slab for flush queues */
14844 +static struct kmem_cache *fq_slab;
14845 +
14846 +/**
14847 + * reiser4_init_fqs - create flush queue cache
14848 + *
14849 + * Initializes slab cache of flush queues. It is part of reiser4 module
14850 + * initialization.
14851 + */
14852 +int reiser4_init_fqs(void)
14853 +{
14854 + fq_slab = kmem_cache_create("fq",
14855 + sizeof(flush_queue_t),
14856 + 0, SLAB_HWCACHE_ALIGN, NULL);
14857 + if (fq_slab == NULL)
14858 + return RETERR(-ENOMEM);
14859 + return 0;
14860 +}
14861 +
14862 +/**
14863 + * reiser4_done_fqs - delete flush queue cache
14864 + *
14865 + * This is called on reiser4 module unloading or system shutdown.
14866 + */
14867 +void reiser4_done_fqs(void)
14868 +{
14869 + destroy_reiser4_cache(&fq_slab);
14870 +}
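/*
 * Editor's sketch of how the init/done pair above is meant to be used from
 * reiser4 module setup; the real call sites live in the module init code,
 * and the function names below are illustrative only.
 */
static int __init flush_queue_setup_sketch(void)
{
	int ret = reiser4_init_fqs();	/* create the "fq" slab cache */
	if (ret != 0)
		return ret;		/* RETERR(-ENOMEM) on failure */
	/* ... remaining module initialization ... */
	return 0;
}

static void __exit flush_queue_teardown_sketch(void)
{
	reiser4_done_fqs();		/* destroy the "fq" slab cache */
}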
14871 +
14872 +/* create new flush queue object */
14873 +static flush_queue_t *create_fq(gfp_t gfp)
14874 +{
14875 + flush_queue_t *fq;
14876 +
14877 + fq = kmem_cache_alloc(fq_slab, gfp);
14878 + if (fq)
14879 + init_fq(fq);
14880 +
14881 + return fq;
14882 +}
14883 +
14884 +/* adjust atom's and flush queue's counters of queued nodes */
14885 +static void count_enqueued_node(flush_queue_t *fq)
14886 +{
14887 + ON_DEBUG(fq->atom->num_queued++);
14888 +}
14889 +
14890 +static void count_dequeued_node(flush_queue_t *fq)
14891 +{
14892 + assert("zam-993", fq->atom->num_queued > 0);
14893 + ON_DEBUG(fq->atom->num_queued--);
14894 +}
14895 +
14896 +/* attach flush queue object to the atom */
14897 +static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14898 +{
14899 + assert_spin_locked(&(atom->alock));
14900 + list_add(&fq->alink, &atom->flush_queues);
14901 + fq->atom = atom;
14902 + ON_DEBUG(atom->nr_flush_queues++);
14903 +}
14904 +
14905 +static void detach_fq(flush_queue_t *fq)
14906 +{
14907 + assert_spin_locked(&(fq->atom->alock));
14908 +
14909 + spin_lock(&(fq->guard));
14910 + list_del_init(&fq->alink);
14911 + assert("vs-1456", fq->atom->nr_flush_queues > 0);
14912 + ON_DEBUG(fq->atom->nr_flush_queues--);
14913 + fq->atom = NULL;
14914 + spin_unlock(&(fq->guard));
14915 +}
14916 +
14917 +/* destroy flush queue object */
14918 +static void done_fq(flush_queue_t *fq)
14919 +{
14920 + assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14921 + assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14922 +
14923 + kmem_cache_free(fq_slab, fq);
14924 +}
14925 +
14926 +/* mark a jnode queued and account it in the atom's queued-nodes counter */
14927 +static void mark_jnode_queued(flush_queue_t *fq, jnode * node)
14928 +{
14929 + JF_SET(node, JNODE_FLUSH_QUEUED);
14930 + count_enqueued_node(fq);
14931 +}
14932 +
14933 +/* Putting jnode into the flush queue. Both atom and jnode should be
14934 + spin-locked. */
14935 +void queue_jnode(flush_queue_t *fq, jnode * node)
14936 +{
14937 + assert_spin_locked(&(node->guard));
14938 + assert("zam-713", node->atom != NULL);
14939 + assert_spin_locked(&(node->atom->alock));
14940 + assert("zam-716", fq->atom != NULL);
14941 + assert("zam-717", fq->atom == node->atom);
14942 + assert("zam-907", fq_in_use(fq));
14943 +
14944 + assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14945 + assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14946 + assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14947 + assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14948 +
14949 + mark_jnode_queued(fq, node);
14950 + list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14951 +
14952 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14953 + FQ_LIST, 1));
14954 +}
14955 +
14956 +/* repeatable process for waiting on i/o completion for a flush queue object */
14957 +static int wait_io(flush_queue_t *fq, int *nr_io_errors)
14958 +{
14959 + assert("zam-738", fq->atom != NULL);
14960 + assert_spin_locked(&(fq->atom->alock));
14961 + assert("zam-736", fq_in_use(fq));
14962 + assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14963 +
14964 + if (atomic_read(&fq->nr_submitted) != 0) {
14965 + struct super_block *super;
14966 +
14967 + spin_unlock_atom(fq->atom);
14968 +
14969 + assert("nikita-3013", reiser4_schedulable());
14970 +
14971 + super = reiser4_get_current_sb();
14972 +
14973 + /* FIXME: this is instead of blk_run_queues() */
14974 + blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
14975 +
14976 + if (!(super->s_flags & MS_RDONLY))
14977 + wait_event(fq->wait,
14978 + atomic_read(&fq->nr_submitted) == 0);
14979 +
14980 + /* Ask the caller to re-acquire the locks and call this
14981 + function again. Note: this technique is commonly used in
14982 + the txnmgr code. */
14983 + return -E_REPEAT;
14984 + }
14985 +
14986 + *nr_io_errors += atomic_read(&fq->nr_errors);
14987 + return 0;
14988 +}
14989 +
14990 +/* wait on I/O completion, re-submit dirty nodes to write */
14991 +static int finish_fq(flush_queue_t *fq, int *nr_io_errors)
14992 +{
14993 + int ret;
14994 + txn_atom *atom = fq->atom;
14995 +
14996 + assert("zam-801", atom != NULL);
14997 + assert_spin_locked(&(atom->alock));
14998 + assert("zam-762", fq_in_use(fq));
14999 +
15000 + ret = wait_io(fq, nr_io_errors);
15001 + if (ret)
15002 + return ret;
15003 +
15004 + detach_fq(fq);
15005 + done_fq(fq);
15006 +
15007 + reiser4_atom_send_event(atom);
15008 +
15009 + return 0;
15010 +}
15011 +
15012 +/* wait for all i/o for the given atom to be completed; actually do one
15013 + iteration of that and return -E_REPEAT if more iterations are needed */
15014 +static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
15015 +{
15016 + flush_queue_t *fq;
15017 +
15018 + assert_spin_locked(&(atom->alock));
15019 +
15020 + if (list_empty_careful(&atom->flush_queues))
15021 + return 0;
15022 +
15023 + list_for_each_entry(fq, &atom->flush_queues, alink) {
15024 + if (fq_ready(fq)) {
15025 + int ret;
15026 +
15027 + mark_fq_in_use(fq);
15028 + assert("vs-1247", fq->owner == NULL);
15029 + ON_DEBUG(fq->owner = current);
15030 + ret = finish_fq(fq, nr_io_errors);
15031 +
15032 + if (*nr_io_errors)
15033 + reiser4_handle_error();
15034 +
15035 + if (ret) {
15036 + reiser4_fq_put(fq);
15037 + return ret;
15038 + }
15039 +
15040 + spin_unlock_atom(atom);
15041 +
15042 + return -E_REPEAT;
15043 + }
15044 + }
15045 +
15046 + /* All flush queues are in use; atom remains locked */
15047 + return -EBUSY;
15048 +}
15049 +
15050 +/* wait for all i/o for the current atom */
15051 +int current_atom_finish_all_fq(void)
15052 +{
15053 + txn_atom *atom;
15054 + int nr_io_errors = 0;
15055 + int ret = 0;
15056 +
15057 + do {
15058 + while (1) {
15059 + atom = get_current_atom_locked();
15060 + ret = finish_all_fq(atom, &nr_io_errors);
15061 + if (ret != -EBUSY)
15062 + break;
15063 + reiser4_atom_wait_event(atom);
15064 + }
15065 + } while (ret == -E_REPEAT);
15066 +
15067 + /* we do not need the atom locked after this function finishes; SUCCESS
15068 + and -EBUSY are the two return codes for which the atom remains locked
15069 + after finish_all_fq */
15070 + if (!ret)
15071 + spin_unlock_atom(atom);
15072 +
15073 + assert_spin_not_locked(&(atom->alock));
15074 +
15075 + if (ret)
15076 + return ret;
15077 +
15078 + if (nr_io_errors)
15079 + return RETERR(-EIO);
15080 +
15081 + return 0;
15082 +}
15083 +
15084 +/* change the node->atom field for all jnodes on the given list */
15085 +static void
15086 +scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
15087 +{
15088 + jnode *cur;
15089 +
15090 + list_for_each_entry(cur, list, capture_link) {
15091 + spin_lock_jnode(cur);
15092 + cur->atom = atom;
15093 + spin_unlock_jnode(cur);
15094 + }
15095 +}
15096 +
15097 +/* support for atom fusion operation */
15098 +void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
15099 +{
15100 + flush_queue_t *fq;
15101 +
15102 + assert_spin_locked(&(to->alock));
15103 + assert_spin_locked(&(from->alock));
15104 +
15105 + list_for_each_entry(fq, &from->flush_queues, alink) {
15106 + scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
15107 + spin_lock(&(fq->guard));
15108 + fq->atom = to;
15109 + spin_unlock(&(fq->guard));
15110 + }
15111 +
15112 + list_splice_init(&from->flush_queues, to->flush_queues.prev);
15113 +
15114 +#if REISER4_DEBUG
15115 + to->num_queued += from->num_queued;
15116 + to->nr_flush_queues += from->nr_flush_queues;
15117 + from->nr_flush_queues = 0;
15118 +#endif
15119 +}
15120 +
15121 +#if REISER4_DEBUG
15122 +int atom_fq_parts_are_clean(txn_atom * atom)
15123 +{
15124 + assert("zam-915", atom != NULL);
15125 + return list_empty_careful(&atom->flush_queues);
15126 +}
15127 +#endif
15128 +/* Bio i/o completion routine for reiser4 write operations. */
15129 +static void
15130 +end_io_handler(struct bio *bio, int err)
15131 +{
15132 + int i;
15133 + int nr_errors = 0;
15134 + flush_queue_t *fq;
15135 +
15136 + assert("zam-958", bio->bi_rw & WRITE);
15137 +
15138 + if (err == -EOPNOTSUPP)
15139 + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
15140 +
15141 + /* we expect that bio->bi_private is set to NULL or to an fq object
15142 + * which is used for synchronization and error counting. */
15143 + fq = bio->bi_private;
15144 + /* Check all elements of io_vec for correct write completion. */
15145 + for (i = 0; i < bio->bi_vcnt; i += 1) {
15146 + struct page *pg = bio->bi_io_vec[i].bv_page;
15147 +
15148 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
15149 + SetPageError(pg);
15150 + nr_errors++;
15151 + }
15152 +
15153 + {
15154 + /* jnode WRITEBACK ("write is in progress bit") is
15155 + * atomically cleared here. */
15156 + jnode *node;
15157 +
15158 + assert("zam-736", pg != NULL);
15159 + assert("zam-736", PagePrivate(pg));
15160 + node = jprivate(pg);
15161 +
15162 + JF_CLR(node, JNODE_WRITEBACK);
15163 + }
15164 +
15165 + end_page_writeback(pg);
15166 + page_cache_release(pg);
15167 + }
15168 +
15169 + if (fq) {
15170 + /* count i/o error in fq object */
15171 + atomic_add(nr_errors, &fq->nr_errors);
15172 +
15173 + /* If all write requests registered in this "fq" are done we
15174 + * wake up the waiter. */
15175 + if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
15176 + wake_up(&fq->wait);
15177 + }
15178 +
15179 + bio_put(bio);
15180 +}
15181 +
15182 +/* Count the I/O requests which will be submitted by @bio in the given flush
15183 + queue @fq */
15184 +void add_fq_to_bio(flush_queue_t *fq, struct bio *bio)
15185 +{
15186 + bio->bi_private = fq;
15187 + bio->bi_end_io = end_io_handler;
15188 +
15189 + if (fq)
15190 + atomic_add(bio->bi_vcnt, &fq->nr_submitted);
15191 +}
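/*
 * Editor's sketch of the write-out flow that end_io_handler() above
 * expects: every page in the bio is accounted in fq->nr_submitted by
 * add_fq_to_bio(), and completion decrements the counter by bi_vcnt,
 * waking fq->wait when it reaches zero. Bio construction is elided.
 */
static void submit_fq_bio_sketch(flush_queue_t *fq, struct bio *bio)
{
	/* wire the bio to the flush queue before submission */
	add_fq_to_bio(fq, bio);	/* sets bi_private/bi_end_io, bumps counter */
	submit_bio(WRITE, bio);	/* end_io_handler() runs on completion */
}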
15192 +
15193 +/* Move all queued nodes out from @fq->prepped list. */
15194 +static void release_prepped_list(flush_queue_t *fq)
15195 +{
15196 + txn_atom *atom;
15197 +
15198 + assert("zam-904", fq_in_use(fq));
15199 + atom = atom_locked_by_fq(fq);
15200 +
15201 + while (!list_empty(ATOM_FQ_LIST(fq))) {
15202 + jnode *cur;
15203 +
15204 + cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
15205 + list_del_init(&cur->capture_link);
15206 +
15207 + count_dequeued_node(fq);
15208 + spin_lock_jnode(cur);
15209 + assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
15210 + assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
15211 + assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
15212 + JF_CLR(cur, JNODE_FLUSH_QUEUED);
15213 +
15214 + if (JF_ISSET(cur, JNODE_DIRTY)) {
15215 + list_add_tail(&cur->capture_link,
15216 + ATOM_DIRTY_LIST(atom,
15217 + jnode_get_level(cur)));
15218 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15219 + DIRTY_LIST, 1));
15220 + } else {
15221 + list_add_tail(&cur->capture_link,
15222 + ATOM_CLEAN_LIST(atom));
15223 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15224 + CLEAN_LIST, 1));
15225 + }
15226 +
15227 + spin_unlock_jnode(cur);
15228 + }
15229 +
15230 + if (--atom->nr_running_queues == 0)
15231 + reiser4_atom_send_event(atom);
15232 +
15233 + spin_unlock_atom(atom);
15234 +}
15235 +
15236 +/* Submit write requests for nodes on the already filled flush queue @fq.
15237 +
15238 + @fq: flush queue object which contains jnodes we can (and will) write.
15239 + @return: number of submitted blocks (>=0) on success, otherwise -- an error
15240 + code (<0). */
15241 +int reiser4_write_fq(flush_queue_t *fq, long *nr_submitted, int flags)
15242 +{
15243 + int ret;
15244 + txn_atom *atom;
15245 +
15246 + while (1) {
15247 + atom = atom_locked_by_fq(fq);
15248 + assert("zam-924", atom);
15249 + /* do not write fq in parallel. */
15250 + if (atom->nr_running_queues == 0
15251 + || !(flags & WRITEOUT_SINGLE_STREAM))
15252 + break;
15253 + reiser4_atom_wait_event(atom);
15254 + }
15255 +
15256 + atom->nr_running_queues++;
15257 + spin_unlock_atom(atom);
15258 +
15259 + ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
15260 + release_prepped_list(fq);
15261 +
15262 + return ret;
15263 +}
15264 +
15265 +/* Get a flush queue object for exclusive use by one thread. May require
15266 + several iterations, which is indicated by the -E_REPEAT return code.
15267 +
15268 + This function does not contain code for obtaining an atom lock because an
15269 + atom lock is obtained in different ways in different parts of reiser4;
15270 + usually it is the current atom, but we also need to be able to get an fq
15271 + for the atom of a given jnode. */
15272 +static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
15273 +{
15274 + flush_queue_t *fq;
15275 +
15276 + assert_spin_locked(&(atom->alock));
15277 +
15278 + fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
15279 + while (&atom->flush_queues != &fq->alink) {
15280 + spin_lock(&(fq->guard));
15281 +
15282 + if (fq_ready(fq)) {
15283 + mark_fq_in_use(fq);
15284 + assert("vs-1246", fq->owner == NULL);
15285 + ON_DEBUG(fq->owner = current);
15286 + spin_unlock(&(fq->guard));
15287 +
15288 + if (*new_fq)
15289 + done_fq(*new_fq);
15290 +
15291 + *new_fq = fq;
15292 +
15293 + return 0;
15294 + }
15295 +
15296 + spin_unlock(&(fq->guard));
15297 +
15298 + fq = list_entry(fq->alink.next, flush_queue_t, alink);
15299 + }
15300 +
15301 + /* Use previously allocated fq object */
15302 + if (*new_fq) {
15303 + mark_fq_in_use(*new_fq);
15304 + assert("vs-1248", (*new_fq)->owner == NULL);
15305 + ON_DEBUG((*new_fq)->owner = current);
15306 + attach_fq(atom, *new_fq);
15307 +
15308 + return 0;
15309 + }
15310 +
15311 + spin_unlock_atom(atom);
15312 +
15313 + *new_fq = create_fq(gfp);
15314 +
15315 + if (*new_fq == NULL)
15316 + return RETERR(-ENOMEM);
15317 +
15318 + return RETERR(-E_REPEAT);
15319 +}
15320 +
15321 +int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t **new_fq)
15322 +{
15323 + return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
15324 +}
15325 +
15326 +/* A wrapper around reiser4_fq_by_atom for getting a flush queue
15327 + object for the current atom; on success fq->atom remains locked. */
15328 +flush_queue_t *get_fq_for_current_atom(void)
15329 +{
15330 + flush_queue_t *fq = NULL;
15331 + txn_atom *atom;
15332 + int ret;
15333 +
15334 + do {
15335 + atom = get_current_atom_locked();
15336 + ret = reiser4_fq_by_atom(atom, &fq);
15337 + } while (ret == -E_REPEAT);
15338 +
15339 + if (ret)
15340 + return ERR_PTR(ret);
15341 + return fq;
15342 +}
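/*
 * Editor's sketch tying the pieces together: obtain a queue for the
 * current atom, write its prepped jnodes, then release it. Error handling
 * is reduced to the minimum; WRITEOUT_SINGLE_STREAM comes from writeout.h.
 */
static int write_current_atom_fq_sketch(long *nr_submitted)
{
	flush_queue_t *fq;
	int ret;

	fq = get_fq_for_current_atom();	/* fq->atom comes back locked */
	if (IS_ERR(fq))
		return PTR_ERR(fq);
	spin_unlock_atom(fq->atom);	/* only the queue itself is needed */

	ret = reiser4_write_fq(fq, nr_submitted, WRITEOUT_SINGLE_STREAM);
	reiser4_fq_put(fq);		/* end exclusive use of the queue */
	return ret;
}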
15343 +
15344 +/* Releasing flush queue object after exclusive use */
15345 +void reiser4_fq_put_nolock(flush_queue_t *fq)
15346 +{
15347 + assert("zam-747", fq->atom != NULL);
15348 + assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15349 + mark_fq_ready(fq);
15350 + assert("vs-1245", fq->owner == current);
15351 + ON_DEBUG(fq->owner = NULL);
15352 +}
15353 +
15354 +void reiser4_fq_put(flush_queue_t *fq)
15355 +{
15356 + txn_atom *atom;
15357 +
15358 + spin_lock(&(fq->guard));
15359 + atom = atom_locked_by_fq_nolock(fq);
15360 +
15361 + assert("zam-746", atom != NULL);
15362 +
15363 + reiser4_fq_put_nolock(fq);
15364 + reiser4_atom_send_event(atom);
15365 +
15366 + spin_unlock(&(fq->guard));
15367 + spin_unlock_atom(atom);
15368 +}
15369 +
15370 +/* A part of atom object initialization related to the embedded flush queue
15371 + list head */
15372 +
15373 +void init_atom_fq_parts(txn_atom *atom)
15374 +{
15375 + INIT_LIST_HEAD(&atom->flush_queues);
15376 +}
15377 +
15378 +#if REISER4_DEBUG
15379 +
15380 +void reiser4_check_fq(const txn_atom *atom)
15381 +{
15382 + /* check number of nodes on all atom's flush queues */
15383 + flush_queue_t *fq;
15384 + int count;
15385 + struct list_head *pos;
15386 +
15387 + count = 0;
15388 + list_for_each_entry(fq, &atom->flush_queues, alink) {
15389 + spin_lock(&(fq->guard));
15390 + /* calculate number of jnodes on fq's list of prepped jnodes */
15391 + list_for_each(pos, ATOM_FQ_LIST(fq))
15392 + count++;
15393 + spin_unlock(&(fq->guard));
15394 + }
15395 + if (count != atom->fq)
15396 + warning("", "fq counter %d, real %d\n", atom->fq, count);
15397 +
15398 +}
15399 +
15400 +#endif
15401 +
15402 +/*
15403 + * Local variables:
15404 + * c-indentation-style: "K&R"
15405 + * mode-name: "LC"
15406 + * c-basic-offset: 8
15407 + * tab-width: 8
15408 + * fill-column: 79
15409 + * scroll-step: 1
15410 + * End:
15411 + */
15412 diff -urN linux-2.6.33.orig/fs/reiser4/forward.h linux-2.6.33/fs/reiser4/forward.h
15413 --- linux-2.6.33.orig/fs/reiser4/forward.h 1970-01-01 01:00:00.000000000 +0100
15414 +++ linux-2.6.33/fs/reiser4/forward.h 2010-03-04 19:33:22.000000000 +0100
15415 @@ -0,0 +1,256 @@
15416 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
15417 + reiser4/README */
15418 +
15419 +/* Forward declarations. Thank you Kernighan. */
15420 +
15421 +#if !defined(__REISER4_FORWARD_H__)
15422 +#define __REISER4_FORWARD_H__
15423 +
15424 +#include <asm/errno.h>
15425 +#include <linux/types.h>
15426 +
15427 +typedef struct zlock zlock;
15428 +typedef struct lock_stack lock_stack;
15429 +typedef struct lock_handle lock_handle;
15430 +typedef struct znode znode;
15431 +typedef struct flow flow_t;
15432 +typedef struct coord coord_t;
15433 +typedef struct tree_access_pointer tap_t;
15434 +typedef struct reiser4_object_create_data reiser4_object_create_data;
15435 +typedef union reiser4_plugin reiser4_plugin;
15436 +typedef __u16 reiser4_plugin_id;
15437 +typedef __u64 reiser4_plugin_groups;
15438 +typedef struct item_plugin item_plugin;
15439 +typedef struct jnode_plugin jnode_plugin;
15440 +typedef struct reiser4_item_data reiser4_item_data;
15441 +typedef union reiser4_key reiser4_key;
15442 +typedef struct reiser4_tree reiser4_tree;
15443 +typedef struct carry_cut_data carry_cut_data;
15444 +typedef struct carry_kill_data carry_kill_data;
15445 +typedef struct carry_tree_op carry_tree_op;
15446 +typedef struct carry_tree_node carry_tree_node;
15447 +typedef struct carry_plugin_info carry_plugin_info;
15448 +typedef struct reiser4_journal reiser4_journal;
15449 +typedef struct txn_atom txn_atom;
15450 +typedef struct txn_handle txn_handle;
15451 +typedef struct txn_mgr txn_mgr;
15452 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15453 +typedef struct reiser4_context reiser4_context;
15454 +typedef struct carry_level carry_level;
15455 +typedef struct blocknr_set_entry blocknr_set_entry;
15456 +/* super_block->s_fs_info points to this */
15457 +typedef struct reiser4_super_info_data reiser4_super_info_data;
15458 +/* next two objects are fields of reiser4_super_info_data */
15459 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15460 +typedef struct reiser4_space_allocator reiser4_space_allocator;
15461 +
15462 +typedef struct flush_scan flush_scan;
15463 +typedef struct flush_position flush_pos_t;
15464 +
15465 +typedef unsigned short pos_in_node_t;
15466 +#define MAX_POS_IN_NODE 65535
15467 +
15468 +typedef struct jnode jnode;
15469 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15470 +
15471 +typedef struct uf_coord uf_coord_t;
15472 +typedef struct hint hint_t;
15473 +
15474 +typedef struct ktxnmgrd_context ktxnmgrd_context;
15475 +
15476 +struct inode;
15477 +struct page;
15478 +struct file;
15479 +struct dentry;
15480 +struct super_block;
15481 +
15482 +/* return values of coord_by_key(). cbk == coord_by_key */
15483 +typedef enum {
15484 + CBK_COORD_FOUND = 0,
15485 + CBK_COORD_NOTFOUND = -ENOENT,
15486 +} lookup_result;
15487 +
15488 +/* results of lookup with directory file */
15489 +typedef enum {
15490 + FILE_NAME_FOUND = 0,
15491 + FILE_NAME_NOTFOUND = -ENOENT,
15492 + FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM,
15493 + IO_ERROR return codes for each search. */
15494 + FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM,
15495 + IO_ERROR return codes for each search. */
15496 +} file_lookup_result;
15497 +
15498 +/* behaviors of lookup. If the coord we are looking for is actually in the
15499 + tree, both coincide. */
15500 +typedef enum {
15501 + /* search exactly for the coord with key given */
15502 + FIND_EXACT,
15503 + /* search for coord with the maximal key not greater than one
15504 + given */
15505 + FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15506 +} lookup_bias;
15507 +
15508 +typedef enum {
15509 + /* level number of the leaf level of the tree.
15510 + The fake root has (tree_level=0). */
15511 + LEAF_LEVEL = 1,
15512 +
15513 + /* level number of the level one above the leaf level of the tree.
15514 +
15515 + It is supposed that the internal tree used by reiser4 to store file
15516 + system data and metadata will have height 2 initially (when
15517 + created by mkfs).
15518 + */
15519 + TWIG_LEVEL = 2,
15520 +} tree_level;
15521 +
15522 +/* The "real" maximum ztree height is the 0-origin size of any per-level
15523 + array, since the zero'th level is not used. */
15524 +#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15525 +
15526 +/* enumeration of possible mutual positions of item and coord. This enum is
15527 + the return type of the ->is_in_item() item plugin method (q.v.). */
15528 +typedef enum {
15529 + /* coord is on the left of an item */
15530 + IP_ON_THE_LEFT,
15531 + /* coord is inside item */
15532 + IP_INSIDE,
15533 + /* coord is inside item, but to the right of the rightmost unit of
15534 + this item */
15535 + IP_RIGHT_EDGE,
15536 + /* coord is on the right of an item */
15537 + IP_ON_THE_RIGHT
15538 +} interposition;
15539 +
15540 +/* type of lock to acquire on znode before returning it to caller */
15541 +typedef enum {
15542 + ZNODE_NO_LOCK = 0,
15543 + ZNODE_READ_LOCK = 1,
15544 + ZNODE_WRITE_LOCK = 2,
15545 +} znode_lock_mode;
15546 +
15547 +/* type of lock request */
15548 +typedef enum {
15549 + ZNODE_LOCK_LOPRI = 0,
15550 + ZNODE_LOCK_HIPRI = (1 << 0),
15551 +
15552 + /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to
15553 + longterm_lock_znode will not sleep waiting for the lock to become
15554 + available. If the lock is unavailable, reiser4_znode_lock will
15555 + immediately return the value -E_REPEAT. */
15556 + ZNODE_LOCK_NONBLOCK = (1 << 1),
15557 + /* An option for longterm_lock_znode which prevents atom fusion */
15558 + ZNODE_LOCK_DONT_FUSE = (1 << 2)
15559 +} znode_lock_request;
15560 +
15561 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15562 +
15563 +/* used to specify direction of shift. These must be -1 and 1 */
15564 +typedef enum {
15565 + SHIFT_LEFT = 1,
15566 + SHIFT_RIGHT = -1
15567 +} shift_direction;
15568 +
15569 +typedef enum {
15570 + LEFT_SIDE,
15571 + RIGHT_SIDE
15572 +} sideof;
15573 +
15574 +#define round_up(value, order) \
15575 + ((typeof(value))(((long) (value) + (order) - 1U) & \
15576 + ~((order) - 1)))
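/*
 * Editor's example: round_up() rounds to the next multiple of a
 * power-of-two order, e.g. round_up(5, 4) == 8 and round_up(8, 4) == 8.
 */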
15577 +
15578 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
15579 +typedef enum {
15580 + /* unit of internal item is moved */
15581 + SUBTREE_MOVED = 0,
15582 + /* nothing else can be squeezed into left neighbor */
15583 + SQUEEZE_TARGET_FULL = 1,
15584 + /* all content of node is squeezed into its left neighbor */
15585 + SQUEEZE_SOURCE_EMPTY = 2,
15586 + /* one more item is copied (this is only returned by
15587 + allocate_and_copy_extent to squalloc_twig) */
15588 + SQUEEZE_CONTINUE = 3
15589 +} squeeze_result;
15590 +
15591 +/* Do not change items ids. If you do - there will be format change */
15592 +typedef enum {
15593 + STATIC_STAT_DATA_ID = 0x0,
15594 + SIMPLE_DIR_ENTRY_ID = 0x1,
15595 + COMPOUND_DIR_ID = 0x2,
15596 + NODE_POINTER_ID = 0x3,
15597 + EXTENT_POINTER_ID = 0x5,
15598 + FORMATTING_ID = 0x6,
15599 + CTAIL_ID = 0x7,
15600 + BLACK_BOX_ID = 0x8,
15601 + LAST_ITEM_ID = 0x9
15602 +} item_id;
15603 +
15604 +/* Flags passed to jnode_flush() to allow it to distinguish default settings
15605 + based on whether commit() was called or VM memory pressure was applied. */
15606 +typedef enum {
15607 + /* submit flush queue to disk at jnode_flush completion */
15608 + JNODE_FLUSH_WRITE_BLOCKS = 1,
15609 +
15610 + /* flush is called for commit */
15611 + JNODE_FLUSH_COMMIT = 2,
15612 + /* not implemented */
15613 + JNODE_FLUSH_MEMORY_FORMATTED = 4,
15614 +
15615 + /* not implemented */
15616 + JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15617 +} jnode_flush_flags;
15618 +
15619 +/* Flags to insert/paste carry operations. Currently they are only used in
15620 + the flushing code, but in the future they can be used to optimize for
15621 + repetitive accesses. */
15622 +typedef enum {
15623 + /* carry is not allowed to shift data to the left when trying to find
15624 + free space */
15625 + COPI_DONT_SHIFT_LEFT = (1 << 0),
15626 + /* carry is not allowed to shift data to the right when trying to find
15627 + free space */
15628 + COPI_DONT_SHIFT_RIGHT = (1 << 1),
15629 + /* carry is not allowed to allocate new node(s) when trying to find
15630 + free space */
15631 + COPI_DONT_ALLOCATE = (1 << 2),
15632 + /* try to load left neighbor if it's not in a cache */
15633 + COPI_LOAD_LEFT = (1 << 3),
15634 + /* try to load right neighbor if it's not in a cache */
15635 + COPI_LOAD_RIGHT = (1 << 4),
15636 + /* shift insertion point to the left neighbor */
15637 + COPI_GO_LEFT = (1 << 5),
15638 + /* shift insertion point to the right neighbor */
15639 + COPI_GO_RIGHT = (1 << 6),
15640 + /* try to step back into original node if insertion into new node
15641 + fails after shifting data there. */
15642 + COPI_STEP_BACK = (1 << 7)
15643 +} cop_insert_flag;
15644 +
15645 +typedef enum {
15646 + SAFE_UNLINK, /* safe-link for unlink */
15647 + SAFE_TRUNCATE /* safe-link for truncate */
15648 +} reiser4_safe_link_t;
15649 +
15650 +/* this is to show on which of the atom's lists a jnode is */
15651 +typedef enum {
15652 + NOT_CAPTURED,
15653 + DIRTY_LIST,
15654 + CLEAN_LIST,
15655 + FQ_LIST,
15656 + WB_LIST,
15657 + OVRWR_LIST
15658 +} atom_list;
15659 +
15660 +/* __REISER4_FORWARD_H__ */
15661 +#endif
15662 +
15663 +/* Make Linus happy.
15664 + Local variables:
15665 + c-indentation-style: "K&R"
15666 + mode-name: "LC"
15667 + c-basic-offset: 8
15668 + tab-width: 8
15669 + fill-column: 120
15670 + End:
15671 +*/
15672 diff -urN linux-2.6.33.orig/fs/reiser4/fsdata.c linux-2.6.33/fs/reiser4/fsdata.c
15673 --- linux-2.6.33.orig/fs/reiser4/fsdata.c 1970-01-01 01:00:00.000000000 +0100
15674 +++ linux-2.6.33/fs/reiser4/fsdata.c 2010-03-04 19:33:22.000000000 +0100
15675 @@ -0,0 +1,804 @@
15676 +/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15677 + * reiser4/README */
15678 +
15679 +#include "fsdata.h"
15680 +#include "inode.h"
15681 +
15682 +
15683 +/* cache of dir_cursors */
15684 +static struct kmem_cache *d_cursor_cache;
15685 +
15686 +/* list of unused cursors */
15687 +static LIST_HEAD(cursor_cache);
15688 +
15689 +/* number of cursors in list of unused cursors */
15690 +static unsigned long d_cursor_unused = 0;
15691 +
15692 +/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15693 +DEFINE_SPINLOCK(d_lock);
15694 +
15695 +static reiser4_file_fsdata *create_fsdata(struct file *file);
15696 +static int file_is_stateless(struct file *file);
15697 +static void free_fsdata(reiser4_file_fsdata *fsdata);
15698 +static void kill_cursor(dir_cursor *);
15699 +
15700 +/**
15701 + * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15702 + * @nr: number of objects to free
15703 + * @mask: GFP mask
15704 + *
15705 + * Shrinks d_cursor_cache: scans the LRU list of unused cursors, freeing the
15706 + * requested number. Returns the number of still freeable cursors.
15707 + */
15708 +static int d_cursor_shrink(int nr, gfp_t mask)
15709 +{
15710 + if (nr != 0) {
15711 + dir_cursor *scan;
15712 + int killed;
15713 +
15714 + killed = 0;
15715 + spin_lock(&d_lock);
15716 + while (!list_empty(&cursor_cache)) {
15717 + scan = list_entry(cursor_cache.next, dir_cursor, alist);
15718 + assert("nikita-3567", scan->ref == 0);
15719 + kill_cursor(scan);
15720 + ++killed;
15721 + --nr;
15722 + if (nr == 0)
15723 + break;
15724 + }
15725 + spin_unlock(&d_lock);
15726 + }
15727 + return d_cursor_unused;
15728 +}
15729 +
15730 +/*
15731 + * actually, d_cursors are "priceless", because there is no way to
15732 + * recover information stored in them. On the other hand, we don't
15733 + * want to consume all kernel memory with them. As a compromise, just
15734 + * assign a higher "seeks" value to the d_cursor cache, so that it will
15735 + * be shrunk only if the system is really tight on memory.
15736 + */
15737 +static struct shrinker d_cursor_shrinker = {
15738 + .shrink = d_cursor_shrink,
15739 + .seeks = DEFAULT_SEEKS << 3,
15740 +};
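/*
 * Editor's note on the shrinker contract assumed by d_cursor_shrink()
 * above (2.6.33-era API): the VM calls ->shrink(0, mask) merely to query
 * how many objects are freeable, and ->shrink(nr, mask) to actually free
 * up to nr of them; both calls return the remaining freeable count, which
 * here is d_cursor_unused.
 */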
15741 +
15742 +/**
15743 + * reiser4_init_d_cursor - create d_cursor cache
15744 + *
15745 + * Initializes slab cache of d_cursors. It is part of reiser4 module
15746 + * initialization.
15747 + */
15748 +int reiser4_init_d_cursor(void)
15749 +{
15750 + d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15751 + SLAB_HWCACHE_ALIGN, NULL);
15752 + if (d_cursor_cache == NULL)
15753 + return RETERR(-ENOMEM);
15754 +
15755 + register_shrinker(&d_cursor_shrinker);
15756 + return 0;
15757 +}
15758 +
15759 +/**
15760 + * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15761 + *
15762 + * This is called on reiser4 module unloading or system shutdown.
15763 + */
15764 +void reiser4_done_d_cursor(void)
15765 +{
15766 + unregister_shrinker(&d_cursor_shrinker);
15767 +
15768 + destroy_reiser4_cache(&d_cursor_cache);
15769 +}
15770 +
15771 +#define D_CURSOR_TABLE_SIZE (256)
15772 +
15773 +static inline unsigned long
15774 +d_cursor_hash(d_cursor_hash_table * table, const struct d_cursor_key *key)
15775 +{
15776 + assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15777 + return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15778 +}
15779 +
15780 +static inline int d_cursor_eq(const struct d_cursor_key *k1,
15781 + const struct d_cursor_key *k2)
15782 +{
15783 + return k1->cid == k2->cid && k1->oid == k2->oid;
15784 +}
15785 +
15786 +/*
15787 + * define functions to manipulate reiser4 super block's hash table of
15788 + * dir_cursors
15789 + */
15790 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15791 +#define KFREE(ptr, size) kfree(ptr)
15792 +TYPE_SAFE_HASH_DEFINE(d_cursor,
15793 + dir_cursor,
15794 + struct d_cursor_key,
15795 + key, hash, d_cursor_hash, d_cursor_eq);
15796 +#undef KFREE
15797 +#undef KMALLOC
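/*
 * Editor's note: TYPE_SAFE_HASH_DEFINE() above generates the typed helpers
 * used later in this file -- d_cursor_hash_init(), d_cursor_hash_done(),
 * d_cursor_hash_insert() and d_cursor_hash_remove() -- keyed by
 * struct d_cursor_key through d_cursor_hash()/d_cursor_eq().
 */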
15798 +
15799 +/**
15800 + * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15801 + * @super: super block to initialize
15802 + *
15803 + * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15804 + * of mount.
15805 + */
15806 +int reiser4_init_super_d_info(struct super_block *super)
15807 +{
15808 + struct d_cursor_info *p;
15809 +
15810 + p = &get_super_private(super)->d_info;
15811 +
15812 + INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15813 + return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15814 +}
15815 +
15816 +/**
15817 + * reiser4_done_super_d_info - release per-super-block d_cursor resources
15818 + * @super: super block being umounted
15819 + *
15820 + * It is called on umount. Kills all directory cursors attached to the super
15821 + */
15822 +void reiser4_done_super_d_info(struct super_block *super)
15823 +{
15824 + struct d_cursor_info *d_info;
15825 + dir_cursor *cursor, *next;
15826 +
15827 + d_info = &get_super_private(super)->d_info;
15828 + for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15829 + kill_cursor(cursor);
15830 +
15831 + BUG_ON(d_info->tree.rnode != NULL);
15832 + d_cursor_hash_done(&d_info->table);
15833 +}
15834 +
15835 +/**
15836 + * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15837 + * @cursor: cursor to free
15838 + *
15839 + * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15840 + * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from from
15841 + * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from
15842 + */
15843 +static void kill_cursor(dir_cursor *cursor)
15844 +{
15845 + unsigned long index;
15846 +
15847 + assert("nikita-3566", cursor->ref == 0);
15848 + assert("nikita-3572", cursor->fsdata != NULL);
15849 +
15850 + index = (unsigned long)cursor->key.oid;
15851 + list_del_init(&cursor->fsdata->dir.linkage);
15852 + free_fsdata(cursor->fsdata);
15853 + cursor->fsdata = NULL;
15854 +
15855 + if (list_empty_careful(&cursor->list))
15856 + /* this is last cursor for a file. Kill radix-tree entry */
15857 + radix_tree_delete(&cursor->info->tree, index);
15858 + else {
15859 + void **slot;
15860 +
15861 + /*
15862 + * there are other cursors for the same oid.
15863 + */
15864 +
15865 + /*
15866 + * if the radix tree points to the cursor being removed, re-target
15867 + * the radix tree slot to the next cursor in the (non-empty, as was
15868 + * checked above) circular list of all cursors for this oid.
15870 + */
15871 + slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15872 + assert("nikita-3571", *slot != NULL);
15873 + if (*slot == cursor)
15874 + *slot = list_entry(cursor->list.next, dir_cursor, list);
15875 + /* remove cursor from circular list */
15876 + list_del_init(&cursor->list);
15877 + }
15878 + /* remove cursor from the list of unused cursors */
15879 + list_del_init(&cursor->alist);
15880 + /* remove cursor from the hash table */
15881 + d_cursor_hash_remove(&cursor->info->table, cursor);
15882 + /* and free it */
15883 + kmem_cache_free(d_cursor_cache, cursor);
15884 + --d_cursor_unused;
15885 +}
15886 +
15887 +/* possible actions that can be performed on all cursors for the given file */
15888 +enum cursor_action {
15889 + /*
15890 + * load all detached state: this is called when stat-data is loaded
15891 + * from the disk to recover information about all pending readdirs
15892 + */
15893 + CURSOR_LOAD,
15894 + /*
15895 + * detach all state from inode, leaving it in the cache. This is called
15896 + * when inode is removed from memory by memory pressure
15897 + */
15898 + CURSOR_DISPOSE,
15899 + /*
15900 + * detach cursors from the inode, and free them. This is called when
15901 + * inode is destroyed
15902 + */
15903 + CURSOR_KILL
15904 +};
15905 +
15906 +/*
15907 + * return d_cursor data for the file system @inode is in.
15908 + */
15909 +static inline struct d_cursor_info *d_info(struct inode *inode)
15910 +{
15911 + return &get_super_private(inode->i_sb)->d_info;
15912 +}
15913 +
15914 +/*
15915 + * lookup d_cursor in the per-super-block radix tree.
15916 + */
15917 +static inline dir_cursor *lookup(struct d_cursor_info *info,
15918 + unsigned long index)
15919 +{
15920 + return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15921 +}
15922 +
15923 +/*
15924 + * attach @cursor to the radix tree. There may be multiple cursors for the
15925 + * same oid; they are chained into a circular list.
15926 + */
15927 +static void bind_cursor(dir_cursor * cursor, unsigned long index)
15928 +{
15929 + dir_cursor *head;
15930 +
15931 + head = lookup(cursor->info, index);
15932 + if (head == NULL) {
15933 + /* this is the first cursor for this index */
15934 + INIT_LIST_HEAD(&cursor->list);
15935 + radix_tree_insert(&cursor->info->tree, index, cursor);
15936 + } else {
15937 + /* some cursor already exists. Chain ours */
15938 + list_add(&cursor->list, &head->list);
15939 + }
15940 +}
15941 +
15942 +/*
15943 + * detach fsdata (if detachable) from file descriptor, and put cursor on the
15944 + * "unused" list. Called when file descriptor is no longer in active use.
15945 + */
15946 +static void clean_fsdata(struct file *file)
15947 +{
15948 + dir_cursor *cursor;
15949 + reiser4_file_fsdata *fsdata;
15950 +
15951 + assert("nikita-3570", file_is_stateless(file));
15952 +
15953 + fsdata = (reiser4_file_fsdata *) file->private_data;
15954 + if (fsdata != NULL) {
15955 + cursor = fsdata->cursor;
15956 + if (cursor != NULL) {
15957 + spin_lock(&d_lock);
15958 + --cursor->ref;
15959 + if (cursor->ref == 0) {
15960 + list_add_tail(&cursor->alist, &cursor_cache);
15961 + ++d_cursor_unused;
15962 + }
15963 + spin_unlock(&d_lock);
15964 + file->private_data = NULL;
15965 + }
15966 + }
15967 +}
15968 +
15969 +/*
15970 + * global counter used to generate "client ids". These ids are encoded into
15971 + * high bits of fpos.
15972 + */
15973 +static __u32 cid_counter = 0;
15974 +#define CID_SHIFT (20)
15975 +#define CID_MASK (0xfffffull)
15976 +
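/*
 * Editor's illustration of the f_pos encoding implied by CID_SHIFT and
 * CID_MASK: the client id sits in the high bits, the readdir position in
 * the low CID_SHIFT bits. Only the encoding side appears in
 * insert_cursor() below; the decoding helpers are assumptions.
 */
static inline __u64 cid_encode_sketch(__u32 cid)
{
	return ((__u64)cid) << CID_SHIFT;	/* as in insert_cursor() */
}

static inline __u32 cid_decode_sketch(__u64 fpos)
{
	return (__u32)(fpos >> CID_SHIFT);	/* recover the client id */
}

static inline __u64 dirpos_decode_sketch(__u64 fpos)
{
	return fpos & CID_MASK;			/* low 20 bits: position */
}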
15977 +static void free_file_fsdata_nolock(struct file *);
15978 +
15979 +/**
15980 + * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15981 + * @cursor: cursor to insert
15982 + * @file: file to attach the cursor to
15983 + * @inode: inode of the directory being read
15984 + *
15985 + * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
15986 + * reiser4 super block's hash table and radix tree, and adds detachable
15987 + * readdir state to @file; the new cursor's client id is encoded into
15988 + * file->f_pos.
15989 + */
15990 +static int insert_cursor(dir_cursor *cursor, struct file *file,
15991 + struct inode *inode)
15992 +{
15993 + int result;
15994 + reiser4_file_fsdata *fsdata;
15995 +
15996 + memset(cursor, 0, sizeof *cursor);
15997 +
15998 + /* this is either the first call to readdir, or a rewind. Either way,
15999 + * create a new cursor. */
16000 + fsdata = create_fsdata(NULL);
16001 + if (fsdata != NULL) {
16002 + result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
16003 + if (result == 0) {
16004 + struct d_cursor_info *info;
16005 + oid_t oid;
16006 +
16007 + info = d_info(inode);
16008 + oid = get_inode_oid(inode);
16009 + /* cid occupies the bits of f->f_pos above CID_SHIFT. Keep
16010 + * only 11 bits (0x7ff) so that f_pos can never become
16011 + * negative: a negative f_pos confuses nfsd_readdir() */
16012 + cursor->key.cid = (++cid_counter) & 0x7ff;
16013 + cursor->key.oid = oid;
16014 + cursor->fsdata = fsdata;
16015 + cursor->info = info;
16016 + cursor->ref = 1;
16017 +
16018 + spin_lock_inode(inode);
16019 + /* install cursor as @f's private_data, discarding old
16020 + * one if necessary */
16021 +#if REISER4_DEBUG
16022 + if (file->private_data)
16023 + warning("", "file has fsdata already");
16024 +#endif
16025 + clean_fsdata(file);
16026 + free_file_fsdata_nolock(file);
16027 + file->private_data = fsdata;
16028 + fsdata->cursor = cursor;
16029 + spin_unlock_inode(inode);
16030 + spin_lock(&d_lock);
16031 + /* insert cursor into hash table */
16032 + d_cursor_hash_insert(&info->table, cursor);
16033 + /* and chain it into radix-tree */
16034 + bind_cursor(cursor, (unsigned long)oid);
16035 + spin_unlock(&d_lock);
16036 + radix_tree_preload_end();
16037 + file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
16038 + }
16039 + } else
16040 + result = RETERR(-ENOMEM);
16041 + return result;
16042 +}
16043 +
16044 +/**
16045 + * process_cursors - do action on each cursor attached to inode
16046 + * @inode: inode to process cursors of
16047 + * @act: action to do
16048 + *
16049 + * Finds all cursors of @inode in reiser4's super block radix tree of cursors
16050 + * and performs the action specified by @act on each of them.
16051 + */
16052 +static void process_cursors(struct inode *inode, enum cursor_action act)
16053 +{
16054 + oid_t oid;
16055 + dir_cursor *start;
16056 + struct list_head *head;
16057 + reiser4_context *ctx;
16058 + struct d_cursor_info *info;
16059 +
16060 + /* this can be called by
16061 + *
16062 + * kswapd->...->prune_icache->..reiser4_destroy_inode
16063 + *
16064 + * without reiser4_context
16065 + */
16066 + ctx = reiser4_init_context(inode->i_sb);
16067 + if (IS_ERR(ctx)) {
16068 + warning("vs-23", "failed to init context");
16069 + return;
16070 + }
16071 +
16072 + assert("nikita-3558", inode != NULL);
16073 +
16074 + info = d_info(inode);
16075 + oid = get_inode_oid(inode);
16076 + spin_lock_inode(inode);
16077 + head = get_readdir_list(inode);
16078 + spin_lock(&d_lock);
16079 + /* find any cursor for this oid: a reference to it hangs off the
16080 + * radix tree */
16081 + start = lookup(info, (unsigned long)oid);
16082 + if (start != NULL) {
16083 + dir_cursor *scan;
16084 + reiser4_file_fsdata *fsdata;
16085 +
16086 + /* process circular list of cursors for this oid */
16087 + scan = start;
16088 + do {
16089 + dir_cursor *next;
16090 +
16091 + next = list_entry(scan->list.next, dir_cursor, list);
16092 + fsdata = scan->fsdata;
16093 + assert("nikita-3557", fsdata != NULL);
16094 + if (scan->key.oid == oid) {
16095 + switch (act) {
16096 + case CURSOR_DISPOSE:
16097 + list_del_init(&fsdata->dir.linkage);
16098 + break;
16099 + case CURSOR_LOAD:
16100 + list_add(&fsdata->dir.linkage, head);
16101 + break;
16102 + case CURSOR_KILL:
16103 + kill_cursor(scan);
16104 + break;
16105 + }
16106 + }
16107 + if (scan == next)
16108 + /* last cursor was just killed */
16109 + break;
16110 + scan = next;
16111 + } while (scan != start);
16112 + }
16113 + spin_unlock(&d_lock);
16114 + /* check that we killed 'em all */
16115 + assert("nikita-3568",
16116 + ergo(act == CURSOR_KILL,
16117 + list_empty_careful(get_readdir_list(inode))));
16118 + assert("nikita-3569",
16119 + ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
16120 + spin_unlock_inode(inode);
16121 + reiser4_exit_context(ctx);
16122 +}
16123 +
16124 +/**
16125 + * reiser4_dispose_cursors - removes cursors from inode's list
16126 + * @inode: inode to dispose cursors of
16127 + *
16128 + * For each cursor corresponding to @inode, removes the reiser4_file_fsdata
16129 + * attached to the cursor from the inode's readdir list. This is called when
16130 + * the inode is evicted from memory by memory pressure.
16131 + */
16132 +void reiser4_dispose_cursors(struct inode *inode)
16133 +{
16134 + process_cursors(inode, CURSOR_DISPOSE);
16135 +}
16136 +
16137 +/**
16138 + * reiser4_load_cursors - attach cursors to inode
16139 + * @inode: inode to load cursors to
16140 + *
16141 + * For each cursor corresponding to @inode, attaches the reiser4_file_fsdata
16142 + * of the cursor to the inode's readdir list. This is done when the inode is
16143 + * loaded into memory.
16144 + */
16145 +void reiser4_load_cursors(struct inode *inode)
16146 +{
16147 + process_cursors(inode, CURSOR_LOAD);
16148 +}
16149 +
16150 +/**
16151 + * reiser4_kill_cursors - kill all inode cursors
16152 + * @inode: inode to kill cursors of
16153 + *
16154 + * Frees all cursors for this inode. This is called when inode is destroyed.
16155 + */
16156 +void reiser4_kill_cursors(struct inode *inode)
16157 +{
16158 + process_cursors(inode, CURSOR_KILL);
16159 +}
16160 +
16161 +/**
16162 + * file_is_stateless - check whether file was opened on demand
16163 + * @file: file to check
16164 + *
16165 + * Returns true if file descriptor @file was created on demand by the NFS
16166 + * server to serve a single file system operation. This means that there may
16167 + * be "detached state" for the underlying inode.
16168 + */
16169 +static int file_is_stateless(struct file *file)
16170 +{
16171 + return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
16172 +}
16173 +
16174 +/**
16175 + * reiser4_get_dir_fpos - get logical position within a directory
16176 + * @dir: directory file being read
16177 + *
16178 + * Calculates ->fpos from the user-supplied cookie. Normally it is dir->f_pos,
16179 + * but in the case of a stateless directory operation (readdir-over-nfs), the
16180 + * client id was encoded in the high bits of the cookie and must be masked off.
16181 + */
16182 +loff_t reiser4_get_dir_fpos(struct file *dir)
16183 +{
16184 + if (file_is_stateless(dir))
16185 + return dir->f_pos & CID_MASK;
16186 + else
16187 + return dir->f_pos;
16188 +}
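
    insert_cursor() and reiser4_get_dir_fpos() are the two halves of the readdir
    cookie encoding: the client id is stored in the bits above CID_SHIFT and
    masked back off on the way in. A standalone sketch of the round trip,
    reusing the CID_SHIFT/CID_MASK values from the patch (pack_fpos is an
    illustrative name):

        #include <assert.h>
        #include <stdint.h>

        #define CID_SHIFT (20)
        #define CID_MASK  (0xfffffull)

        /* pack a client id and a directory position into one cookie */
        static uint64_t pack_fpos(uint32_t cid, uint64_t pos)
        {
                /* only 11 bits of cid are kept, so the cookie stays positive
                 * even when it lands in a signed loff_t */
                return ((uint64_t)(cid & 0x7ff) << CID_SHIFT) | (pos & CID_MASK);
        }

        int main(void)
        {
                uint64_t cookie = pack_fpos(42, 12345);

                assert((cookie & CID_MASK) == 12345); /* reiser4_get_dir_fpos() */
                assert((cookie >> CID_SHIFT) == 42);  /* reiser4_attach_fsdata() */
                return 0;
        }
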
16189 +
16190 +/**
16191 + * reiser4_attach_fsdata - try to attach fsdata
16192 + * @file: directory file being read
16193 + * @inode: inode of that directory
16194 + *
16195 + * Finds or creates cursor for readdir-over-nfs.
16196 + */
16197 +int reiser4_attach_fsdata(struct file *file, struct inode *inode)
16198 +{
16199 + loff_t pos;
16200 + int result;
16201 + dir_cursor *cursor;
16202 +
16203 + /*
16204 + * we are serialized by inode->i_mutex
16205 + */
16206 + if (!file_is_stateless(file))
16207 + return 0;
16208 +
16209 + pos = file->f_pos;
16210 + result = 0;
16211 + if (pos == 0) {
16212 + /*
16213 + * first call to readdir (or rewind to the beginning of
16214 + * directory)
16215 + */
16216 + cursor = kmem_cache_alloc(d_cursor_cache,
16217 + reiser4_ctx_gfp_mask_get());
16218 + if (cursor != NULL)
16219 + result = insert_cursor(cursor, file, inode);
16220 + else
16221 + result = RETERR(-ENOMEM);
16222 + } else {
16223 + /* try to find existing cursor */
16224 + struct d_cursor_key key;
16225 +
16226 + key.cid = pos >> CID_SHIFT;
16227 + key.oid = get_inode_oid(inode);
16228 + spin_lock(&d_lock);
16229 + cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
16230 + if (cursor != NULL) {
16231 + /* cursor was found */
16232 + if (cursor->ref == 0) {
16233 + /* move it from unused list */
16234 + list_del_init(&cursor->alist);
16235 + --d_cursor_unused;
16236 + }
16237 + ++cursor->ref;
16238 + }
16239 + spin_unlock(&d_lock);
16240 + if (cursor != NULL) {
16241 + spin_lock_inode(inode);
16242 + assert("nikita-3556", cursor->fsdata->back == NULL);
16243 + clean_fsdata(file);
16244 + free_file_fsdata_nolock(file);
16245 + file->private_data = cursor->fsdata;
16246 + spin_unlock_inode(inode);
16247 + }
16248 + }
16249 + return result;
16250 +}
16251 +
16252 +/**
16253 + * reiser4_detach_fsdata - detach readdir state from struct file
16254 + * @file: file to detach fsdata from
16255 + *
16256 + * Detaches fsdata from @file, if necessary (i.e. only for stateless files).
16257 + */
16258 +void reiser4_detach_fsdata(struct file *file)
16259 +{
16260 + struct inode *inode;
16261 +
16262 + if (!file_is_stateless(file))
16263 + return;
16264 +
16265 + inode = file->f_dentry->d_inode;
16266 + spin_lock_inode(inode);
16267 + clean_fsdata(file);
16268 + spin_unlock_inode(inode);
16269 +}
16270 +
16271 +/* slab for reiser4_dentry_fsdata */
16272 +static struct kmem_cache *dentry_fsdata_cache;
16273 +
16274 +/**
16275 + * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
16276 + *
16277 + * Initializes slab cache of structures attached to dentry->d_fsdata. It is
16278 + * part of reiser4 module initialization.
16279 + */
16280 +int reiser4_init_dentry_fsdata(void)
16281 +{
16282 + dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
16283 + sizeof(struct reiser4_dentry_fsdata),
16284 + 0,
16285 + SLAB_HWCACHE_ALIGN |
16286 + SLAB_RECLAIM_ACCOUNT,
16287 + NULL);
16288 + if (dentry_fsdata_cache == NULL)
16289 + return RETERR(-ENOMEM);
16290 + return 0;
16291 +}
16292 +
16293 +/**
16294 + * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
16295 + *
16296 + * This is called on reiser4 module unloading or system shutdown.
16297 + */
16298 +void reiser4_done_dentry_fsdata(void)
16299 +{
16300 + destroy_reiser4_cache(&dentry_fsdata_cache);
16301 +}
16302 +
16303 +/**
16304 + * reiser4_get_dentry_fsdata - get fs-specific dentry data
16305 + * @dentry: queried dentry
16306 + *
16307 + * Allocates if necessary and returns per-dentry data that we attach to each
16308 + * dentry.
16309 + */
16310 +struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
16311 +{
16312 + assert("nikita-1365", dentry != NULL);
16313 +
16314 + if (dentry->d_fsdata == NULL) {
16315 + dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
16316 + reiser4_ctx_gfp_mask_get());
16317 + if (dentry->d_fsdata == NULL)
16318 + return ERR_PTR(RETERR(-ENOMEM));
16319 + memset(dentry->d_fsdata, 0,
16320 + sizeof(struct reiser4_dentry_fsdata));
16321 + }
16322 + return dentry->d_fsdata;
16323 +}
16324 +
16325 +/**
16326 + * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
16327 + * @dentry: dentry to free fsdata of
16328 + *
16329 + * Detaches and frees fs-specific dentry data
16330 + */
16331 +void reiser4_free_dentry_fsdata(struct dentry *dentry)
16332 +{
16333 + if (dentry->d_fsdata != NULL) {
16334 + kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
16335 + dentry->d_fsdata = NULL;
16336 + }
16337 +}
16338 +
16339 +/* slab for reiser4_file_fsdata */
16340 +static struct kmem_cache *file_fsdata_cache;
16341 +
16342 +/**
16343 + * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
16344 + *
16345 + * Initializes slab cache of structures attached to file->private_data. It is
16346 + * part of reiser4 module initialization.
16347 + */
16348 +int reiser4_init_file_fsdata(void)
16349 +{
16350 + file_fsdata_cache = kmem_cache_create("file_fsdata",
16351 + sizeof(reiser4_file_fsdata),
16352 + 0,
16353 + SLAB_HWCACHE_ALIGN |
16354 + SLAB_RECLAIM_ACCOUNT, NULL);
16355 + if (file_fsdata_cache == NULL)
16356 + return RETERR(-ENOMEM);
16357 + return 0;
16358 +}
16359 +
16360 +/**
16361 + * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
16362 + *
16363 + * This is called on reiser4 module unloading or system shutdown.
16364 + */
16365 +void reiser4_done_file_fsdata(void)
16366 +{
16367 + destroy_reiser4_cache(&file_fsdata_cache);
16368 +}
16369 +
16370 +/**
16371 + * create_fsdata - allocate and initialize reiser4_file_fsdata
16372 + * @file: what to create file_fsdata for, may be NULL
16373 + *
16374 + * Allocates and initializes reiser4_file_fsdata structure.
16375 + */
16376 +static reiser4_file_fsdata *create_fsdata(struct file *file)
16377 +{
16378 + reiser4_file_fsdata *fsdata;
16379 +
16380 + fsdata = kmem_cache_alloc(file_fsdata_cache,
16381 + reiser4_ctx_gfp_mask_get());
16382 + if (fsdata != NULL) {
16383 + memset(fsdata, 0, sizeof *fsdata);
16384 + fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16385 + fsdata->back = file;
16386 + INIT_LIST_HEAD(&fsdata->dir.linkage);
16387 + }
16388 + return fsdata;
16389 +}
16390 +
16391 +/**
16392 + * free_fsdata - free reiser4_file_fsdata
16393 + * @fsdata: object to free
16394 + *
16395 + * Dual to create_fsdata(): frees reiser4_file_fsdata.
16396 + */
16397 +static void free_fsdata(reiser4_file_fsdata *fsdata)
16398 +{
16399 + BUG_ON(fsdata == NULL);
16400 + kmem_cache_free(file_fsdata_cache, fsdata);
16401 +}
16402 +
16403 +/**
16404 + * reiser4_get_file_fsdata - get fs-specific file data
16405 + * @file: queried file
16406 + *
16407 + * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16408 + * to @file.
16409 + */
16410 +reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16411 +{
16412 + assert("nikita-1603", file != NULL);
16413 +
16414 + if (file->private_data == NULL) {
16415 + reiser4_file_fsdata *fsdata;
16416 + struct inode *inode;
16417 +
16418 + fsdata = create_fsdata(file);
16419 + if (fsdata == NULL)
16420 + return ERR_PTR(RETERR(-ENOMEM));
16421 +
16422 + inode = file->f_dentry->d_inode;
16423 + spin_lock_inode(inode);
16424 + if (file->private_data == NULL) {
16425 + file->private_data = fsdata;
16426 + fsdata = NULL;
16427 + }
16428 + spin_unlock_inode(inode);
16429 + if (fsdata != NULL)
16430 + /* other thread initialized ->fsdata */
16431 + kmem_cache_free(file_fsdata_cache, fsdata);
16432 + }
16433 + assert("nikita-2665", file->private_data != NULL);
16434 + return file->private_data;
16435 +}
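
    reiser4_get_file_fsdata() allocates optimistically outside the spinlock,
    re-checks ->private_data under the lock, and frees its copy if another
    thread got there first. A minimal pthread sketch of the same pattern (the
    struct and names are illustrative, not from the patch):

        #include <pthread.h>
        #include <stdlib.h>

        struct object {
                pthread_mutex_t lock;
                void *private_data;
        };

        /* return obj->private_data, allocating it on first use */
        static void *get_private(struct object *obj)
        {
                if (obj->private_data == NULL) {
                        /* allocate outside the lock, where sleeping is fine */
                        void *fresh = calloc(1, 64);

                        if (fresh == NULL)
                                return NULL;
                        pthread_mutex_lock(&obj->lock);
                        if (obj->private_data == NULL) {
                                obj->private_data = fresh;
                                fresh = NULL; /* ownership transferred */
                        }
                        pthread_mutex_unlock(&obj->lock);
                        free(fresh); /* non-NULL only if we lost the race */
                }
                return obj->private_data;
        }

        int main(void)
        {
                struct object obj = { PTHREAD_MUTEX_INITIALIZER, NULL };

                return get_private(&obj) != NULL ? 0 : 1;
        }
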
16436 +
16437 +/**
16438 + * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16439 + * @file: file to detach fsdata from
16440 + *
16441 + * Detaches reiser4_file_fsdata from @file, removes the reiser4_file_fsdata
16442 + * from the readdir list, and frees it if it is not linked to a d_cursor.
16443 + */
16444 +static void free_file_fsdata_nolock(struct file *file)
16445 +{
16446 + reiser4_file_fsdata *fsdata;
16447 +
16448 + assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16449 + fsdata = file->private_data;
16450 + if (fsdata != NULL) {
16451 + list_del_init(&fsdata->dir.linkage);
16452 + if (fsdata->cursor == NULL)
16453 + free_fsdata(fsdata);
16454 + }
16455 + file->private_data = NULL;
16456 +}
16457 +
16458 +/**
16459 + * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16460 + * @file: file to free fsdata of
16461 + *
16462 + * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16463 + */
16464 +void reiser4_free_file_fsdata(struct file *file)
16465 +{
16466 + spin_lock_inode(file->f_dentry->d_inode);
16467 + free_file_fsdata_nolock(file);
16468 + spin_unlock_inode(file->f_dentry->d_inode);
16469 +}
16470 +
16471 +/*
16472 + * Local variables:
16473 + * c-indentation-style: "K&R"
16474 + * mode-name: "LC"
16475 + * c-basic-offset: 8
16476 + * tab-width: 8
16477 + * fill-column: 79
16478 + * End:
16479 + */
16480 diff -urN linux-2.6.33.orig/fs/reiser4/fsdata.h linux-2.6.33/fs/reiser4/fsdata.h
16481 --- linux-2.6.33.orig/fs/reiser4/fsdata.h 1970-01-01 01:00:00.000000000 +0100
16482 +++ linux-2.6.33/fs/reiser4/fsdata.h 2010-03-04 19:33:22.000000000 +0100
16483 @@ -0,0 +1,205 @@
16484 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16485 + * reiser4/README */
16486 +
16487 +#if !defined(__REISER4_FSDATA_H__)
16488 +#define __REISER4_FSDATA_H__
16489 +
16490 +#include "debug.h"
16491 +#include "kassign.h"
16492 +#include "seal.h"
16493 +#include "type_safe_hash.h"
16494 +#include "plugin/file/file.h"
16495 +#include "readahead.h"
16496 +
16497 +/*
16498 + * reiser4_dentry_fsdata (defined below) is the reiser4-specific state hung
16499 + * off dentry->d_fsdata, just as reiser4_file_fsdata is the state hung off
16500 + * file->private_data.
16501 + */
16502 +
16503 +/*
16504 + * locking: the fields of the per-file-descriptor readdir_pos and ->f_pos are
16505 + * protected by ->i_mutex on the inode. Under this lock the following
16506 + * invariant holds:
16507 + *
16508 + * the file descriptor is "looking" at the entry_no-th directory entry from
16509 + * the beginning of the directory. This entry has key dir_entry_key and is
16510 + * the pos-th entry within its duplicate-key sequence.
16511 + *
16512 + */
16513 +
16514 +/* logical position within directory */
16515 +struct dir_pos {
16516 + /* key of directory entry (actually, part of a key sufficient to
16517 + identify directory entry) */
16518 + de_id dir_entry_key;
16519 + /* ordinal number of directory entry among all entries with the same
16520 + key. (Starting from 0.) */
16521 + unsigned pos;
16522 +};
16523 +
16524 +struct readdir_pos {
16525 + /* f_pos corresponding to this readdir position */
16526 + __u64 fpos;
16527 + /* logical position within directory */
16528 + struct dir_pos position;
16529 + /* logical number of directory entry within
16530 + directory */
16531 + __u64 entry_no;
16532 +};
16533 +
16534 +/*
16535 + * this is used to speed up lookups for directory entry: on initial call to
16536 + * ->lookup() seal and coord of directory entry (if found, that is) are stored
16537 + * in struct dentry and reused later to avoid tree traversals.
16538 + */
16539 +struct de_location {
16540 + /* seal covering directory entry */
16541 + seal_t entry_seal;
16542 + /* coord of directory entry */
16543 + coord_t entry_coord;
16544 + /* ordinal number of directory entry among all entries with the same
16545 + key. (Starting from 0.) */
16546 + int pos;
16547 +};
16548 +
16549 +/**
16550 + * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16551 + *
16552 + * This is allocated dynamically and released in d_op->d_release()
16553 + *
16554 + * Currently it only contains cached location (hint) of directory entry, but
16555 + * it is expected that other information will be accumulated here.
16556 + */
16557 +struct reiser4_dentry_fsdata {
16558 + /*
16559 + * here will go fields filled by ->lookup() to speedup next
16560 + * create/unlink, like blocknr of znode with stat-data, or key of
16561 + * stat-data.
16562 + */
16563 + struct de_location dec;
16564 + int stateless; /* created through reiser4_decode_fh, needs
16565 + * special treatment in readdir. */
16566 +};
16567 +
16568 +extern int reiser4_init_dentry_fsdata(void);
16569 +extern void reiser4_done_dentry_fsdata(void);
16570 +extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16571 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16572 +
16573 +/**
16574 + * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16575 + *
16576 + * This is allocated dynamically and released in inode->i_fop->release
16577 + */
16578 +typedef struct reiser4_file_fsdata {
16579 + /*
16580 + * pointer back to the struct file which this reiser4_file_fsdata is
16581 + * part of
16582 + */
16583 + struct file *back;
16584 + /* detached cursor for stateless readdir. */
16585 + struct dir_cursor *cursor;
16586 + /*
16587 + * We need both directory and regular file parts here, because there
16588 + * are file system objects that are simultaneously files and directories.
16589 + */
16590 + struct {
16591 + /*
16592 + * position in directory. It is updated each time directory is
16593 + * modified
16594 + */
16595 + struct readdir_pos readdir;
16596 + /* head of this list is reiser4_inode->lists.readdir_list */
16597 + struct list_head linkage;
16598 + } dir;
16599 + /* hints to speed up operations with regular files: read and write. */
16600 + struct {
16601 + hint_t hint;
16602 + } reg;
16603 + struct reiser4_file_ra_state ra1;
16604 +
16605 +} reiser4_file_fsdata;
16606 +
16607 +extern int reiser4_init_file_fsdata(void);
16608 +extern void reiser4_done_file_fsdata(void);
16609 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16610 +extern void reiser4_free_file_fsdata(struct file *);
16611 +
16612 +/*
16613 + * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16614 + * used to address a problem reiser4 has with readdir accesses via NFS. See
16615 + * plugin/file_ops_readdir.c for more details.
16616 + */
16617 +struct d_cursor_key {
16618 + __u16 cid;
16619 + __u64 oid;
16620 +};
16621 +
16622 +/*
16623 + * define structures d_cursor_hash_table and d_cursor_hash_link, which are
16624 + * used to maintain the hash table of dir_cursor-s in reiser4's super block
16625 + */
16626 +typedef struct dir_cursor dir_cursor;
16627 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16628 +
16629 +struct dir_cursor {
16630 + int ref;
16631 + reiser4_file_fsdata *fsdata;
16632 +
16633 + /* link to reiser4 super block hash table of cursors */
16634 + d_cursor_hash_link hash;
16635 +
16636 + /*
16637 + * this is to link cursors to reiser4 super block's radix tree of
16638 + * cursors if there is more than one cursor with the same objectid
16639 + */
16640 + struct list_head list;
16641 + struct d_cursor_key key;
16642 + struct d_cursor_info *info;
16643 + /* list of unused cursors */
16644 + struct list_head alist;
16645 +};
16646 +
16647 +extern int reiser4_init_d_cursor(void);
16648 +extern void reiser4_done_d_cursor(void);
16649 +
16650 +extern int reiser4_init_super_d_info(struct super_block *);
16651 +extern void reiser4_done_super_d_info(struct super_block *);
16652 +
16653 +extern loff_t reiser4_get_dir_fpos(struct file *);
16654 +extern int reiser4_attach_fsdata(struct file *, struct inode *);
16655 +extern void reiser4_detach_fsdata(struct file *);
16656 +
16657 +/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16658 + more details */
16659 +void reiser4_dispose_cursors(struct inode *inode);
16660 +void reiser4_load_cursors(struct inode *inode);
16661 +void reiser4_kill_cursors(struct inode *inode);
16662 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16663 + int offset, int adj);
16664 +
16665 +/*
16666 + * this structure is embedded into reiser4_super_info_data. It maintains
16667 + * d_cursors (detached readdir state). See plugin/file_ops_readdir.c for details.
16668 + */
16669 +struct d_cursor_info {
16670 + d_cursor_hash_table table;
16671 + struct radix_tree_root tree;
16672 +};
16673 +
16674 +/* spinlock protecting readdir cursors */
16675 +extern spinlock_t d_lock;
16676 +
16677 +/* __REISER4_FSDATA_H__ */
16678 +#endif
16679 +
16680 +/*
16681 + * Local variables:
16682 + * c-indentation-style: "K&R"
16683 + * mode-name: "LC"
16684 + * c-basic-offset: 8
16685 + * tab-width: 8
16686 + * fill-column: 120
16687 + * End:
16688 + */
16689 diff -urN linux-2.6.33.orig/fs/reiser4/init_super.c linux-2.6.33/fs/reiser4/init_super.c
16690 --- linux-2.6.33.orig/fs/reiser4/init_super.c 1970-01-01 01:00:00.000000000 +0100
16691 +++ linux-2.6.33/fs/reiser4/init_super.c 2010-03-04 19:33:22.000000000 +0100
16692 @@ -0,0 +1,761 @@
16693 +/* Copyright by Hans Reiser, 2003 */
16694 +
16695 +#include "super.h"
16696 +#include "inode.h"
16697 +#include "plugin/plugin_set.h"
16698 +
16699 +#include <linux/swap.h>
16700 +
16701 +/**
16702 + * reiser4_init_fs_info - allocate reiser4 specific super block
16703 + * @super: super block of filesystem
16704 + *
16705 + * Allocates and initializes reiser4_super_info_data, attaches it to
16706 + * super->s_fs_info, initializes structures maintaining d_cursor-s.
16707 + */
16708 +int reiser4_init_fs_info(struct super_block *super)
16709 +{
16710 + reiser4_super_info_data *sbinfo;
16711 +
16712 + sbinfo = kzalloc(sizeof(reiser4_super_info_data),
16713 + reiser4_ctx_gfp_mask_get());
16714 + if (!sbinfo)
16715 + return RETERR(-ENOMEM);
16716 +
16717 + super->s_fs_info = sbinfo;
16718 + super->s_op = NULL;
16719 +
16720 + ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16721 + ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16722 +
16723 + mutex_init(&sbinfo->delete_mutex);
16724 + spin_lock_init(&(sbinfo->guard));
16725 +
16726 + /* initialize per-super-block d_cursor resources */
16727 + reiser4_init_super_d_info(super);
16728 +
16729 + return 0;
16730 +}
16731 +
16732 +/**
16733 + * reiser4_done_fs_info - free reiser4 specific super block
16734 + * @super: super block of filesystem
16735 + *
16736 + * Performs some sanity checks, releases structures maintaining d_cursor-s,
16737 + * frees reiser4_super_info_data.
16738 + */
16739 +void reiser4_done_fs_info(struct super_block *super)
16740 +{
16741 + assert("zam-990", super->s_fs_info != NULL);
16742 +
16743 + /* release per-super-block d_cursor resources */
16744 + reiser4_done_super_d_info(super);
16745 +
16746 + /* make sure that there are no jnodes left */
16747 + assert("", list_empty(&get_super_private(super)->all_jnodes));
16748 + assert("", get_current_context()->trans->atom == NULL);
16749 + reiser4_check_block_counters(super);
16750 + kfree(super->s_fs_info);
16751 + super->s_fs_info = NULL;
16752 +}
16753 +
16754 +/* type of option parseable by parse_option() */
16755 +typedef enum {
16756 + /* value of option is arbitrary string */
16757 + OPT_STRING,
16758 +
16759 + /*
16760 + * option specifies bit in a bitmask. When option is set - bit in
16761 + * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16762 + * dont_load_bitmap, atomic_write.
16763 + */
16764 + OPT_BIT,
16765 +
16766 + /*
16767 + * value of option should conform to sprintf() format. Examples are
16768 + * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16769 + */
16770 + OPT_FORMAT,
16771 +
16772 + /*
16773 + * option can take one of predefined values. Example is onerror=panic or
16774 + * onerror=remount-ro
16775 + */
16776 + OPT_ONEOF,
16777 +} opt_type_t;
16778 +
16779 +#if 0
16780 +struct opt_bitmask_bit {
16781 + const char *bit_name;
16782 + int bit_nr;
16783 +};
16784 +#endif
16785 +
16786 +/* description of option parseable by parse_option() */
16787 +struct opt_desc {
16788 + /* option name.
16789 +
16790 + parsed portion of string has a form "name=value".
16791 + */
16792 + const char *name;
16793 + /* type of option */
16794 + opt_type_t type;
16795 + union {
16796 + /* where to store value of string option (type == OPT_STRING) */
16797 + char **string;
16798 + /* description of bits for bit option (type == OPT_BIT) */
16799 + struct {
16800 + int nr;
16801 + void *addr;
16802 + } bit;
16803 + /* description of format and targets for format option (type
16804 + == OPT_FORMAT) */
16805 + struct {
16806 + const char *format;
16807 + int nr_args;
16808 + void *arg1;
16809 + void *arg2;
16810 + void *arg3;
16811 + void *arg4;
16812 + } f;
16813 + struct {
16814 + int *result;
16815 + const char *list[10];
16816 + } oneof;
16817 + struct {
16818 + void *addr;
16819 + int nr_bits;
16820 + /* struct opt_bitmask_bit *bits; */
16821 + } bitmask;
16822 + } u;
16823 +};
16824 +
16825 +/**
16826 + * parse_option - parse one option
16827 + * @opt_string: starting point of parsing
16828 + * @opt: option description
16829 + *
16830 + * foo=bar,
16831 + * ^ ^ ^
16832 + * | | +-- replaced to '\0'
16833 + * | +-- val_start
16834 + * +-- opt_string
16835 + * Figures out option type and handles option correspondingly.
16836 + */
16837 +static int parse_option(char *opt_string, struct opt_desc *opt)
16838 +{
16839 + char *val_start;
16840 + int result;
16841 + const char *err_msg;
16842 +
16843 + /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16844 +
16845 + val_start = strchr(opt_string, '=');
16846 + if (val_start != NULL) {
16847 + *val_start = '\0';
16848 + ++val_start;
16849 + }
16850 +
16851 + err_msg = NULL;
16852 + result = 0;
16853 + switch (opt->type) {
16854 + case OPT_STRING:
16855 + if (val_start == NULL) {
16856 + err_msg = "String arg missing";
16857 + result = RETERR(-EINVAL);
16858 + } else
16859 + *opt->u.string = val_start;
16860 + break;
16861 + case OPT_BIT:
16862 + if (val_start != NULL)
16863 + err_msg = "Value ignored";
16864 + else
16865 + set_bit(opt->u.bit.nr, opt->u.bit.addr);
16866 + break;
16867 + case OPT_FORMAT:
16868 + if (val_start == NULL) {
16869 + err_msg = "Formatted arg missing";
16870 + result = RETERR(-EINVAL);
16871 + break;
16872 + }
16873 + if (sscanf(val_start, opt->u.f.format,
16874 + opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16875 + opt->u.f.arg4) != opt->u.f.nr_args) {
16876 + err_msg = "Wrong conversion";
16877 + result = RETERR(-EINVAL);
16878 + }
16879 + break;
16880 + case OPT_ONEOF:
16881 + {
16882 + int i = 0;
16883 +
16884 + if (val_start == NULL) {
16885 + err_msg = "Value is missing";
16886 + result = RETERR(-EINVAL);
16887 + break;
16888 + }
16889 + err_msg = "Wrong option value";
16890 + result = RETERR(-EINVAL);
16891 + while (opt->u.oneof.list[i]) {
16892 + if (!strcmp(opt->u.oneof.list[i], val_start)) {
16893 + result = 0;
16894 + err_msg = NULL;
16895 + *opt->u.oneof.result = i;
16896 + break;
16897 + }
16898 + i++;
16899 + }
16900 + break;
16901 + }
16902 + default:
16903 + wrong_return_value("nikita-2100", "opt -> type");
16904 + break;
16905 + }
16906 + if (err_msg != NULL) {
16907 + warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16908 + err_msg, opt->name, val_start ? "=" : "",
16909 + val_start ? : "");
16910 + }
16911 + return result;
16912 +}
16913 +
16914 +/**
16915 + * parse_options - parse reiser4 mount options
16916 + * @opt_string: starting point
16917 + * @opts: array of option description
16918 + * @nr_opts: number of elements in @opts
16919 + *
16920 + * Parses comma separated list of reiser4 mount options.
16921 + */
16922 +static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
16923 +{
16924 + int result;
16925 +
16926 + result = 0;
16927 + while ((result == 0) && opt_string && *opt_string) {
16928 + int j;
16929 + char *next;
16930 +
16931 + next = strchr(opt_string, ',');
16932 + if (next != NULL) {
16933 + *next = '\0';
16934 + ++next;
16935 + }
16936 + for (j = 0; j < nr_opts; ++j) {
16937 + if (!strncmp(opt_string, opts[j].name,
16938 + strlen(opts[j].name))) {
16939 + result = parse_option(opt_string, &opts[j]);
16940 + break;
16941 + }
16942 + }
16943 + if (j == nr_opts) {
16944 + warning("nikita-2307", "Unrecognized option: \"%s\"",
16945 + opt_string);
16946 + /* traditionally, -EINVAL is returned on wrong mount
16947 + option */
16948 + result = RETERR(-EINVAL);
16949 + }
16950 + opt_string = next;
16951 + }
16952 + return result;
16953 +}
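
    parse_options() is a destructive tokenizer: it writes '\0' over each ','
    and each '=' and hands the resulting name/value pair to parse_option(). A
    self-contained sketch of the same splitting (the sample option string is
    made up from options this patch defines):

        #include <stdio.h>
        #include <string.h>

        /* split "a=1,b,c=xyz" in place and report each name/value pair */
        static void parse(char *s)
        {
                while (s && *s) {
                        char *next = strchr(s, ',');
                        char *val;

                        if (next != NULL)
                                *next++ = '\0';
                        val = strchr(s, '=');
                        if (val != NULL)
                                *val++ = '\0';
                        printf("option \"%s\", value \"%s\"\n",
                               s, val != NULL ? val : "(none)");
                        s = next;
                }
        }

        int main(void)
        {
                char opts[] = "tmgr.atom_max_age=600,bsdgroups,onerror=remount-ro";

                parse(opts);
                return 0;
        }
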
16954 +
16955 +#define NUM_OPT(label, fmt, addr) \
16956 + { \
16957 + .name = (label), \
16958 + .type = OPT_FORMAT, \
16959 + .u = { \
16960 + .f = { \
16961 + .format = (fmt), \
16962 + .nr_args = 1, \
16963 + .arg1 = (addr), \
16964 + .arg2 = NULL, \
16965 + .arg3 = NULL, \
16966 + .arg4 = NULL \
16967 + } \
16968 + } \
16969 + }
16970 +
16971 +#define SB_FIELD_OPT(field, fmt) NUM_OPT(#field, fmt, &sbinfo->field)
16972 +
16973 +#define BIT_OPT(label, bitnr) \
16974 + { \
16975 + .name = label, \
16976 + .type = OPT_BIT, \
16977 + .u = { \
16978 + .bit = { \
16979 + .nr = bitnr, \
16980 + .addr = &sbinfo->fs_flags \
16981 + } \
16982 + } \
16983 + }
16984 +
16985 +#define MAX_NR_OPTIONS (30)
16986 +
16987 +#if REISER4_DEBUG
16988 +# define OPT_ARRAY_CHECK(opt, array) \
16989 + if ((opt) >= (array) + MAX_NR_OPTIONS) { \
16990 + warning("zam-1046", "opt array is overloaded"); break; \
16991 + }
16992 +#else
16993 +# define OPT_ARRAY_CHECK(opt, array) noop
16994 +#endif
16995 +
16996 +#define PUSH_OPT(opt, array, ...) \
16997 +do { \
16998 + struct opt_desc o = __VA_ARGS__; \
16999 + OPT_ARRAY_CHECK(opt, array); \
17000 + *(opt)++ = o; \
17001 +} while (0)
17002 +
17003 +static noinline void push_sb_field_opts(struct opt_desc **p,
17004 + struct opt_desc *opts,
17005 + reiser4_super_info_data *sbinfo)
17006 +{
17007 +#define PUSH_SB_FIELD_OPT(field, format) \
17008 + PUSH_OPT(*p, opts, SB_FIELD_OPT(field, format))
17009 + /*
17010 + * tmgr.atom_max_size=N
17011 + * Atoms containing more than N blocks will be forced to commit. N is
17012 + * decimal.
17013 + */
17014 + PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
17015 + /*
17016 + * tmgr.atom_max_age=N
17017 + * Atoms older than N seconds will be forced to commit. N is decimal.
17018 + */
17019 + PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
17020 + /*
17021 + * tmgr.atom_min_size=N
17022 + * In committing an atom to free dirty pages, force the atom less than
17023 + * N in size to fuse with another one.
17024 + */
17025 + PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
17026 + /*
17027 + * tmgr.atom_max_flushers=N
17028 + * limit of concurrent flushers for one atom. 0 means no limit.
17029 + */
17030 + PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
17031 + /*
17032 + * tree.cbk_cache_slots=N
17033 + * Number of slots in the cbk cache.
17034 + */
17035 + PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
17036 + /*
17037 + * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
17038 + * leaf-level blocks it will force them to be relocated.
17039 + */
17040 + PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
17041 + /*
17042 + * If flush can find a block allocation within
17043 + * FLUSH_RELOCATE_DISTANCE of the preceder, it will relocate to
17044 + * that position.
17045 + */
17046 + PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
17047 + /*
17048 + * If we have written this much or more blocks before encountering busy
17049 + * jnode in flush list - abort flushing hoping that next time we get
17050 + * called this jnode will be clean already, and we will save some
17051 + * seeks.
17052 + */
17053 + PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
17054 + /* The maximum number of nodes to scan left on a level during flush. */
17055 + PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
17056 + /* preferred IO size */
17057 + PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
17058 + /* carry flags used for insertion of new nodes */
17059 + PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
17060 + /* carry flags used for insertion of new extents */
17061 + PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
17062 + /* carry flags used for paste operations */
17063 + PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
17064 + /* carry flags used for insert operations */
17065 + PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
17066 +
17067 +#ifdef CONFIG_REISER4_BADBLOCKS
17068 + /*
17069 + * Alternative master superblock location in case its original
17070 + * location is not writable/accessible. This is an offset in BYTES.
17071 + */
17072 + PUSH_SB_FIELD_OPT(altsuper, "%lu");
17073 +#endif
17074 +}
17075 +
17076 +/**
17077 + * reiser4_init_super_data - initialize reiser4 private super block
17078 + * @super: super block to initialize
17079 + * @opt_string: list of reiser4 mount options
17080 + *
17081 + * Sets various reiser4 parameters to default values. Parses mount options and
17082 + * overwrites default settings.
17083 + */
17084 +int reiser4_init_super_data(struct super_block *super, char *opt_string)
17085 +{
17086 + int result;
17087 + struct opt_desc *opts, *p;
17088 + reiser4_super_info_data *sbinfo = get_super_private(super);
17089 +
17090 + /* initialize super, export, dentry operations */
17091 + sbinfo->ops.super = reiser4_super_operations;
17092 + sbinfo->ops.export = reiser4_export_operations;
17093 + sbinfo->ops.dentry = reiser4_dentry_operations;
17094 + super->s_op = &sbinfo->ops.super;
17095 + super->s_export_op = &sbinfo->ops.export;
17096 +
17097 + /* initialize transaction manager parameters to default values */
17098 + sbinfo->tmgr.atom_max_size = totalram_pages / 4;
17099 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
17100 + sbinfo->tmgr.atom_min_size = 256;
17101 + sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
17102 +
17103 + /* initialize cbk cache parameter */
17104 + sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
17105 +
17106 + /* initialize flush parameters */
17107 + sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
17108 + sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
17109 + sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
17110 + sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
17111 +
17112 + sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
17113 +
17114 + /* preliminary tree initializations */
17115 + sbinfo->tree.super = super;
17116 + sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
17117 + sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
17118 + sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
17119 + sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
17120 + rwlock_init(&(sbinfo->tree.tree_lock));
17121 + spin_lock_init(&(sbinfo->tree.epoch_lock));
17122 +
17123 + /* initialize default readahead params */
17124 + sbinfo->ra_params.max = num_physpages / 4;
17125 + sbinfo->ra_params.flags = 0;
17126 +
17127 + /* allocate memory for structure describing reiser4 mount options */
17128 + opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
17129 + reiser4_ctx_gfp_mask_get());
17130 + if (opts == NULL)
17131 + return RETERR(-ENOMEM);
17132 +
17133 + /* initialize structure describing reiser4 mount options */
17134 + p = opts;
17135 +
17136 + push_sb_field_opts(&p, opts, sbinfo);
17137 + /* turn on BSD-style gid assignment */
17138 +
17139 +#define PUSH_BIT_OPT(name, bit) \
17140 + PUSH_OPT(p, opts, BIT_OPT(name, bit))
17141 +
17142 + PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
17143 + /* turn on 32 bit times */
17144 + PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
17145 + /*
17146 + * Don't load all bitmap blocks at mount time, it is useful for
17147 + * machines with tiny RAM and large disks.
17148 + */
17149 + PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
17150 + /* disable transaction commits during write() */
17151 + PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
17152 + /* disable use of write barriers in the reiser4 log writer. */
17153 + PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
17154 +
17155 + PUSH_OPT(p, opts,
17156 + {
17157 + /*
17158 + * tree traversal readahead parameters:
17159 + * -o readahead:MAXNUM:FLAGS
17160 + * MAXNUM - max number of nodes to request readahead for: -1UL
17161 + * will set it to max_sane_readahead()
17162 + * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
17163 + * CONTINUE_ON_PRESENT
17164 + */
17165 + .name = "readahead",
17166 + .type = OPT_FORMAT,
17167 + .u = {
17168 + .f = {
17169 + .format = "%u:%u",
17170 + .nr_args = 2,
17171 + .arg1 = &sbinfo->ra_params.max,
17172 + .arg2 = &sbinfo->ra_params.flags,
17173 + .arg3 = NULL,
17174 + .arg4 = NULL
17175 + }
17176 + }
17177 + }
17178 + );
17179 +
17180 + /* What to do in case of fs error */
17181 + PUSH_OPT(p, opts,
17182 + {
17183 + .name = "onerror",
17184 + .type = OPT_ONEOF,
17185 + .u = {
17186 + .oneof = {
17187 + .result = &sbinfo->onerror,
17188 + .list = {
17189 + "panic", "remount-ro", NULL
17190 + },
17191 + }
17192 + }
17193 + }
17194 + );
17195 +
17196 + /* modify default settings to values set by mount options */
17197 + result = parse_options(opt_string, opts, p - opts);
17198 + kfree(opts);
17199 + if (result != 0)
17200 + return result;
17201 +
17202 + /* correct settings to sanity values */
17203 + sbinfo->tmgr.atom_max_age *= HZ;
17204 + if (sbinfo->tmgr.atom_max_age <= 0)
17205 + /* overflow */
17206 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
17207 +
17208 + /* round optimal io size down to a multiple of 512 bytes */
17209 + sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
17210 + sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
17211 + if (sbinfo->optimal_io_size == 0) {
17212 + warning("nikita-2497", "optimal_io_size is too small");
17213 + return RETERR(-EINVAL);
17214 + }
17215 + return result;
17216 +}
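
    The shift pair at the end of reiser4_init_super_data() rounds
    optimal_io_size down to a multiple of the VFS block size (512 bytes if
    VFS_BLKSIZE_BITS is 9 — an assumed value here), and a value that rounds to
    zero is rejected. The arithmetic in isolation:

        #include <assert.h>

        #define VFS_BLKSIZE_BITS 9 /* assumed: 1 << 9 == 512 */

        /* round x down to a multiple of 512 by dropping the low bits */
        static unsigned round_down_to_blk(unsigned x)
        {
                return (x >> VFS_BLKSIZE_BITS) << VFS_BLKSIZE_BITS;
        }

        int main(void)
        {
                assert(round_down_to_blk(65536) == 65536);
                assert(round_down_to_blk(1000) == 512);
                assert(round_down_to_blk(511) == 0); /* would fail with -EINVAL */
                return 0;
        }
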
17217 +
17218 +/**
17219 + * reiser4_init_read_super - read reiser4 master super block
17220 + * @super: super block to fill
17221 + * @silent: if 0 - print warnings
17222 + *
17223 + * Reads reiser4 master super block either from predefined location or from
17224 + * location specified by altsuper mount option, initializes disk format plugin.
17225 + */
17226 +int reiser4_init_read_super(struct super_block *super, int silent)
17227 +{
17228 + struct buffer_head *super_bh;
17229 + struct reiser4_master_sb *master_sb;
17230 + reiser4_super_info_data *sbinfo = get_super_private(super);
17231 + unsigned long blocksize;
17232 +
17233 + read_super_block:
17234 +#ifdef CONFIG_REISER4_BADBLOCKS
17235 + if (sbinfo->altsuper)
17236 + /*
17237 + * read reiser4 master super block at position specified by
17238 + * mount option
17239 + */
17240 + super_bh = sb_bread(super,
17241 + (sector_t)(sbinfo->altsuper / super->s_blocksize));
17242 + else
17243 +#endif
17244 + /* read reiser4 master super block at the 16th 4096-byte block */
17245 + super_bh = sb_bread(super,
17246 + (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
17247 + if (!super_bh)
17248 + return RETERR(-EIO);
17249 +
17250 + master_sb = (struct reiser4_master_sb *)super_bh->b_data;
17251 + /* check reiser4 magic string */
17252 + if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
17253 + sizeof(REISER4_SUPER_MAGIC_STRING))) {
17254 + /* reiser4 master super block contains filesystem blocksize */
17255 + blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
17256 +
17257 + if (blocksize != PAGE_CACHE_SIZE) {
17258 + /*
17259 + * currently reiser4's blocksize must be equal to
17260 + * pagesize
17261 + */
17262 + if (!silent)
17263 + warning("nikita-2609",
17264 + "%s: wrong block size %ld\n", super->s_id,
17265 + blocksize);
17266 + brelse(super_bh);
17267 + return RETERR(-EINVAL);
17268 + }
17269 + if (blocksize != super->s_blocksize) {
17270 + /*
17271 + * filesystem uses different blocksize. Reread master
17272 + * super block with correct blocksize
17273 + */
17274 + brelse(super_bh);
17275 + if (!sb_set_blocksize(super, (int)blocksize))
17276 + return RETERR(-EINVAL);
17277 + goto read_super_block;
17278 + }
17279 +
17280 + sbinfo->df_plug =
17281 + disk_format_plugin_by_id(
17282 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17283 + if (sbinfo->df_plug == NULL) {
17284 + if (!silent)
17285 + warning("nikita-26091",
17286 + "%s: unknown disk format plugin %d\n",
17287 + super->s_id,
17288 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17289 + brelse(super_bh);
17290 + return RETERR(-EINVAL);
17291 + }
17292 + sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
17293 + brelse(super_bh);
17294 + return 0;
17295 + }
17296 +
17297 + /* there is no reiser4 on the device */
17298 + if (!silent)
17299 + warning("nikita-2608",
17300 + "%s: wrong master super block magic", super->s_id);
17301 + brelse(super_bh);
17302 + return RETERR(-EINVAL);
17303 +}
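
    reiser4_init_read_super() probes a fixed byte offset (the 16th 4096-byte
    block, i.e. 65536, per the comment above), compares a magic string, and
    pulls the little-endian on-disk blocksize out of the master superblock. A
    user-space probe in the same spirit; the magic string, offset, and struct
    layout below are assumptions based on the comments, not taken from reiser4
    headers:

        #include <endian.h>
        #include <fcntl.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        #define MASTER_OFFSET (16 * 4096) /* "16th 4096-byte block" */

        struct master_sb {                /* assumed on-disk layout */
                char magic[16];           /* e.g. "ReIsEr4" */
                uint16_t disk_plugin_id;  /* little-endian */
                uint16_t blocksize;       /* little-endian */
        };

        int main(int argc, char **argv)
        {
                struct master_sb m;
                int fd;

                if (argc != 2)
                        return 1;
                fd = open(argv[1], O_RDONLY);
                if (fd < 0 || pread(fd, &m, sizeof(m), MASTER_OFFSET)
                    != (ssize_t)sizeof(m))
                        return 1;
                if (strncmp(m.magic, "ReIsEr4", 7) != 0) {
                        fprintf(stderr, "no reiser4 master super block\n");
                        close(fd);
                        return 1;
                }
                printf("blocksize %u\n", (unsigned)le16toh(m.blocksize));
                close(fd);
                return 0;
        }
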
17304 +
17305 +static struct {
17306 + reiser4_plugin_type type;
17307 + reiser4_plugin_id id;
17308 +} default_plugins[PSET_LAST] = {
17309 + [PSET_FILE] = {
17310 + .type = REISER4_FILE_PLUGIN_TYPE,
17311 + .id = UNIX_FILE_PLUGIN_ID
17312 + },
17313 + [PSET_DIR] = {
17314 + .type = REISER4_DIR_PLUGIN_TYPE,
17315 + .id = HASHED_DIR_PLUGIN_ID
17316 + },
17317 + [PSET_HASH] = {
17318 + .type = REISER4_HASH_PLUGIN_TYPE,
17319 + .id = R5_HASH_ID
17320 + },
17321 + [PSET_FIBRATION] = {
17322 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
17323 + .id = FIBRATION_DOT_O
17324 + },
17325 + [PSET_PERM] = {
17326 + .type = REISER4_PERM_PLUGIN_TYPE,
17327 + .id = NULL_PERM_ID
17328 + },
17329 + [PSET_FORMATTING] = {
17330 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
17331 + .id = SMALL_FILE_FORMATTING_ID
17332 + },
17333 + [PSET_SD] = {
17334 + .type = REISER4_ITEM_PLUGIN_TYPE,
17335 + .id = STATIC_STAT_DATA_ID
17336 + },
17337 + [PSET_DIR_ITEM] = {
17338 + .type = REISER4_ITEM_PLUGIN_TYPE,
17339 + .id = COMPOUND_DIR_ID
17340 + },
17341 + [PSET_CIPHER] = {
17342 + .type = REISER4_CIPHER_PLUGIN_TYPE,
17343 + .id = NONE_CIPHER_ID
17344 + },
17345 + [PSET_DIGEST] = {
17346 + .type = REISER4_DIGEST_PLUGIN_TYPE,
17347 + .id = SHA256_32_DIGEST_ID
17348 + },
17349 + [PSET_COMPRESSION] = {
17350 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17351 + .id = LZO1_COMPRESSION_ID
17352 + },
17353 + [PSET_COMPRESSION_MODE] = {
17354 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17355 + .id = CONVX_COMPRESSION_MODE_ID
17356 + },
17357 + [PSET_CLUSTER] = {
17358 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
17359 + .id = CLUSTER_64K_ID
17360 + },
17361 + [PSET_CREATE] = {
17362 + .type = REISER4_FILE_PLUGIN_TYPE,
17363 + .id = UNIX_FILE_PLUGIN_ID
17364 + }
17365 +};
17366 +
17367 +/* access to default plugin table */
17368 +reiser4_plugin *get_default_plugin(pset_member memb)
17369 +{
17370 + return plugin_by_id(default_plugins[memb].type,
17371 + default_plugins[memb].id);
17372 +}
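
    default_plugins[] plus get_default_plugin() is the classic enum-indexed
    defaults table: designated initializers tie each row to its pset member, so
    source order does not matter. The idiom reduced to its bones (types and
    values below are placeholders):

        #include <stdio.h>

        enum member { M_FILE, M_DIR, M_HASH, M_LAST };

        struct dflt {
                int type;
                int id;
        };

        /* designated initializers: each row is tied to its enum index */
        static const struct dflt defaults[M_LAST] = {
                [M_FILE] = { .type = 1, .id = 10 },
                [M_DIR]  = { .type = 2, .id = 20 },
                [M_HASH] = { .type = 3, .id = 30 },
        };

        static const struct dflt *get_default(enum member m)
        {
                return &defaults[m];
        }

        int main(void)
        {
                const struct dflt *d = get_default(M_DIR);

                printf("type %d id %d\n", d->type, d->id);
                return 0;
        }
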
17373 +
17374 +/**
17375 + * reiser4_init_root_inode - obtain inode of root directory
17376 + * @super: super block of filesystem
17377 + *
17378 + * Obtains inode of root directory (reading it from disk), initializes plugin
17379 + * set if it was not initialized.
17380 + */
17381 +int reiser4_init_root_inode(struct super_block *super)
17382 +{
17383 + reiser4_super_info_data *sbinfo = get_super_private(super);
17384 + struct inode *inode;
17385 + int result = 0;
17386 +
17387 + inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17388 + if (IS_ERR(inode))
17389 + return RETERR(PTR_ERR(inode));
17390 +
17391 + super->s_root = d_alloc_root(inode);
17392 + if (!super->s_root) {
17393 + iput(inode);
17394 + return RETERR(-ENOMEM);
17395 + }
17396 +
17397 + super->s_root->d_op = &sbinfo->ops.dentry;
17398 +
17399 + if (!is_inode_loaded(inode)) {
17400 + pset_member memb;
17401 + plugin_set *pset;
17402 +
17403 + pset = reiser4_inode_data(inode)->pset;
17404 + for (memb = 0; memb < PSET_LAST; ++memb) {
17405 +
17406 + if (aset_get(pset, memb) != NULL)
17407 + continue;
17408 +
17409 + result = grab_plugin_pset(inode, NULL, memb);
17410 + if (result != 0)
17411 + break;
17412 +
17413 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17414 + }
17415 +
17416 + if (result == 0) {
17417 + if (REISER4_DEBUG) {
17418 + for (memb = 0; memb < PSET_LAST; ++memb)
17419 + assert("nikita-3500",
17420 + aset_get(pset, memb) != NULL);
17421 + }
17422 + } else
17423 + warning("nikita-3448", "Cannot set plugins of root: %i",
17424 + result);
17425 + reiser4_iget_complete(inode);
17426 +
17427 + /* As the default pset kept in the root dir may have been changed
17428 + (length is unknown), call update_sd. */
17429 + if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17430 + result = reiser4_grab_space(
17431 + inode_file_plugin(inode)->estimate.update(inode),
17432 + BA_CAN_COMMIT);
17433 +
17434 + if (result == 0)
17435 + result = reiser4_update_sd(inode);
17436 +
17437 + all_grabbed2free();
17438 + }
17439 + }
17440 +
17441 + super->s_maxbytes = MAX_LFS_FILESIZE;
17442 + return result;
17443 +}
17444 +
17445 +/*
17446 + * Local variables:
17447 + * c-indentation-style: "K&R"
17448 + * mode-name: "LC"
17449 + * c-basic-offset: 8
17450 + * tab-width: 8
17451 + * fill-column: 79
17452 + * End:
17453 + */
17454 diff -urN linux-2.6.33.orig/fs/reiser4/inode.c linux-2.6.33/fs/reiser4/inode.c
17455 --- linux-2.6.33.orig/fs/reiser4/inode.c 1970-01-01 01:00:00.000000000 +0100
17456 +++ linux-2.6.33/fs/reiser4/inode.c 2010-03-04 19:33:22.000000000 +0100
17457 @@ -0,0 +1,711 @@
17458 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
17459 + reiser4/README */
17460 +
17461 +/* Inode specific operations. */
17462 +
17463 +#include "forward.h"
17464 +#include "debug.h"
17465 +#include "key.h"
17466 +#include "kassign.h"
17467 +#include "coord.h"
17468 +#include "seal.h"
17469 +#include "dscale.h"
17470 +#include "plugin/item/item.h"
17471 +#include "plugin/security/perm.h"
17472 +#include "plugin/plugin.h"
17473 +#include "plugin/object.h"
17474 +#include "znode.h"
17475 +#include "vfs_ops.h"
17476 +#include "inode.h"
17477 +#include "super.h"
17478 +#include "reiser4.h"
17479 +
17480 +#include <linux/fs.h> /* for struct super_block, address_space */
17481 +
17482 +/* return reiser4 internal tree which inode belongs to */
17483 +/* Audited by: green(2002.06.17) */
17484 +reiser4_tree *reiser4_tree_by_inode(const struct inode *inode/* inode queried*/)
17485 +{
17486 + assert("nikita-256", inode != NULL);
17487 + assert("nikita-257", inode->i_sb != NULL);
17488 + return reiser4_get_tree(inode->i_sb);
17489 +}
17490 +
17491 +/* return reiser4-specific inode flags */
17492 +static inline unsigned long *inode_flags(const struct inode *const inode)
17493 +{
17494 + assert("nikita-2842", inode != NULL);
17495 + return &reiser4_inode_data(inode)->flags;
17496 +}
17497 +
17498 +/* set reiser4-specific flag @f in @inode */
17499 +void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17500 +{
17501 + assert("nikita-2248", inode != NULL);
17502 + set_bit((int)f, inode_flags(inode));
17503 +}
17504 +
17505 +/* clear reiser4-specific flag @f in @inode */
17506 +void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17507 +{
17508 + assert("nikita-2250", inode != NULL);
17509 + clear_bit((int)f, inode_flags(inode));
17510 +}
17511 +
17512 +/* true if reiser4-specific flag @f is set in @inode */
17513 +int reiser4_inode_get_flag(const struct inode *inode,
17514 + reiser4_file_plugin_flags f)
17515 +{
17516 + assert("nikita-2251", inode != NULL);
17517 + return test_bit((int)f, inode_flags(inode));
17518 +}
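
    The three flag helpers above are thin wrappers around the kernel's atomic
    bitops on a flags word kept in the reiser4-specific inode data. A
    user-space stand-in with C11 atomics behaves the same way (names are
    illustrative):

        #include <assert.h>
        #include <stdatomic.h>

        static atomic_ulong flags; /* stands in for reiser4_inode_data()->flags */

        static void flag_set(int f)  { atomic_fetch_or(&flags, 1UL << f); }
        static void flag_clr(int f)  { atomic_fetch_and(&flags, ~(1UL << f)); }
        static int  flag_test(int f) { return (atomic_load(&flags) >> f) & 1; }

        int main(void)
        {
                flag_set(3);
                assert(flag_test(3));
                flag_clr(3);
                assert(!flag_test(3));
                return 0;
        }
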
17519 +
17520 +/* convert oid to inode number */
17521 +ino_t oid_to_ino(oid_t oid)
17522 +{
17523 + return (ino_t) oid;
17524 +}
17525 +
17526 +/* convert oid to user visible inode number */
17527 +ino_t oid_to_uino(oid_t oid)
17528 +{
17529 + /* reiser4 object is uniquely identified by oid which is 64 bit
17530 + quantity. Kernel in-memory inode is indexed (in the hash table) by
17531 + 32 bit i_ino field, but this is not a problem, because there is a
17532 + way to further distinguish inodes with identical inode numbers
17533 + (find_actor supplied to iget()).
17534 +
17535 + But user space expects a unique 32 bit inode number. Obviously this
17536 + is impossible. The work-around is to hash the oid into a user
17537 + visible inode number.
17538 + */
17539 + oid_t max_ino = (ino_t) ~0;
17540 +
17541 + if (REISER4_INO_IS_OID || (oid <= max_ino))
17542 + return oid;
17543 + else
17544 + /* this is remotely similar to the algorithm used to find the
17545 + next pid for a process: after wrap-around, start from some
17546 + offset rather than from 0. The idea is that there are some
17547 + long-lived objects with which we don't want to collide.
17548 + */
17549 + return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17550 +}
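
    oid_to_uino() passes oids that fit in 32 bits through unchanged and folds
    larger ones into a window above REISER4_UINO_SHIFT. The arithmetic,
    stand-alone; the UINO_SHIFT value below (0x80000000) is an assumption for
    illustration only:

        #include <stdint.h>
        #include <stdio.h>

        #define UINO_SHIFT 0x80000000ul /* assumed REISER4_UINO_SHIFT */

        /* fold a 64-bit object id into a 32-bit user-visible inode number */
        static uint32_t fold_oid(uint64_t oid)
        {
                const uint64_t max_ino = 0xffffffffull; /* (ino_t)~0, 32-bit */

                if (oid <= max_ino)
                        return (uint32_t)oid;
                /* collisions are possible here; the kernel side resolves them
                 * with the find-actor passed to iget5_locked() */
                return (uint32_t)(UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1)));
        }

        int main(void)
        {
                printf("%u\n", fold_oid(42));             /* 42 */
                printf("%u\n", fold_oid(0x100000001ull)); /* 2147483650 */
                return 0;
        }
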
17551 +
17552 +/* check that "inode" is on reiser4 file-system */
17553 +int is_reiser4_inode(const struct inode *inode/* inode queried */)
17554 +{
17555 + return inode != NULL && is_reiser4_super(inode->i_sb);
17556 +}
17557 +
17558 +/* Maximal length of a name that can be stored in directory @inode.
17559 +
17560 + This is used in checks during file creation and lookup. */
17561 +int reiser4_max_filename_len(const struct inode *inode/* inode queried */)
17562 +{
17563 + assert("nikita-287", is_reiser4_inode(inode));
17564 + assert("nikita-1710", inode_dir_item_plugin(inode));
17565 + if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17566 + return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17567 + else
17568 + return 255;
17569 +}
17570 +
17571 +#if REISER4_USE_COLLISION_LIMIT
17572 +/* Maximal number of hash collisions for this directory. */
17573 +int max_hash_collisions(const struct inode *dir/* inode queried */)
17574 +{
17575 + assert("nikita-1711", dir != NULL);
17576 + return reiser4_inode_data(dir)->plugin.max_collisions;
17577 +}
17578 +#endif /* REISER4_USE_COLLISION_LIMIT */
17579 +
17580 +/* Install file, inode, and address_space operation on @inode, depending on
17581 + its mode. */
17582 +int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17583 + reiser4_object_create_data * data /* parameters to create
17584 + * object */ )
17585 +{
17586 + reiser4_super_info_data *sinfo;
17587 + file_plugin *fplug;
17588 + dir_plugin *dplug;
17589 +
17590 + fplug = inode_file_plugin(inode);
17591 + dplug = inode_dir_plugin(inode);
17592 +
17593 + sinfo = get_super_private(inode->i_sb);
17594 +
17595 + switch (inode->i_mode & S_IFMT) {
17596 + case S_IFSOCK:
17597 + case S_IFBLK:
17598 + case S_IFCHR:
17599 + case S_IFIFO:
17600 + {
17601 + dev_t rdev; /* to keep gcc happy */
17602 +
17603 + assert("vs-46", fplug != NULL);
17604 + /* ugly hack with rdev */
17605 + if (data == NULL) {
17606 + rdev = inode->i_rdev;
17607 + inode->i_rdev = 0;
17608 + } else
17609 + rdev = data->rdev;
17610 + inode->i_blocks = 0;
17611 + assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17612 + inode->i_op = file_plugins[fplug->h.id].inode_ops;
17613 + /* initialize inode->i_fop and inode->i_rdev for block
17614 + and char devices */
17615 + init_special_inode(inode, inode->i_mode, rdev);
17616 + /* all address space operations are null */
17617 + inode->i_mapping->a_ops =
17618 + file_plugins[fplug->h.id].as_ops;
17619 + break;
17620 + }
17621 + case S_IFLNK:
17622 + assert("vs-46", fplug != NULL);
17623 + assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17624 + inode->i_op = file_plugins[fplug->h.id].inode_ops;
17625 + inode->i_fop = NULL;
17626 + /* all address space operations are null */
17627 + inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17628 + break;
17629 + case S_IFDIR:
17630 + assert("vs-46", dplug != NULL);
17631 + assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17632 + dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17633 + inode->i_op = dir_plugins[dplug->h.id].inode_ops;
17634 + inode->i_fop = dir_plugins[dplug->h.id].file_ops;
17635 + inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops;
17636 + break;
17637 + case S_IFREG:
17638 + assert("vs-46", fplug != NULL);
17639 + assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17640 + fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17641 + inode->i_op = file_plugins[fplug->h.id].inode_ops;
17642 + inode->i_fop = file_plugins[fplug->h.id].file_ops;
17643 + inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17644 + break;
17645 + default:
17646 + warning("nikita-291", "wrong file mode: %o for %llu",
17647 + inode->i_mode,
17648 + (unsigned long long)get_inode_oid(inode));
17649 + reiser4_make_bad_inode(inode);
17650 + return RETERR(-EINVAL);
17651 + }
17652 + return 0;
17653 +}
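
    setup_inode_ops() selects the operation vectors by switching on
    (i_mode & S_IFMT). The same dispatch against a path name, using lstat(2) so
    that symlinks are reported as symlinks rather than followed:

        #include <stdio.h>
        #include <sys/stat.h>

        int main(int argc, char **argv)
        {
                struct stat st;

                if (argc != 2 || lstat(argv[1], &st) != 0)
                        return 1;
                switch (st.st_mode & S_IFMT) {
                case S_IFSOCK:
                case S_IFBLK:
                case S_IFCHR:
                case S_IFIFO:
                        puts("special file");
                        break;
                case S_IFLNK:
                        puts("symlink");
                        break;
                case S_IFDIR:
                        puts("directory");
                        break;
                case S_IFREG:
                        puts("regular file");
                        break;
                default:
                        puts("unknown mode"); /* setup_inode_ops(): -EINVAL */
                        return 1;
                }
                return 0;
        }
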
17654 +
17655 +/* Initialize inode from disk data. Called with inode locked.
17656 + Return inode locked. */
17657 +static int init_inode(struct inode *inode /* inode to initialise */ ,
17658 + coord_t *coord/* coord of stat data */)
17659 +{
17660 + int result;
17661 + item_plugin *iplug;
17662 + void *body;
17663 + int length;
17664 + reiser4_inode *state;
17665 +
17666 + assert("nikita-292", coord != NULL);
17667 + assert("nikita-293", inode != NULL);
17668 +
17669 + coord_clear_iplug(coord);
17670 + result = zload(coord->node);
17671 + if (result)
17672 + return result;
17673 + iplug = item_plugin_by_coord(coord);
17674 + body = item_body_by_coord(coord);
17675 + length = item_length_by_coord(coord);
17676 +
17677 + assert("nikita-295", iplug != NULL);
17678 + assert("nikita-296", body != NULL);
17679 + assert("nikita-297", length > 0);
17680 +
17681 + /* inode is under I_LOCK now */
17682 +
17683 + state = reiser4_inode_data(inode);
17684 + /* call stat-data plugin method to load sd content into inode */
17685 + result = iplug->s.sd.init_inode(inode, body, length);
17686 + set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
17687 + if (result == 0) {
17688 + result = setup_inode_ops(inode, NULL);
17689 + if (result == 0 && inode->i_sb->s_root &&
17690 + inode->i_sb->s_root->d_inode)
17691 + result = finish_pset(inode);
17692 + }
17693 + zrelse(coord->node);
17694 + return result;
17695 +}
17696 +
17697 +/* read `inode' from the disk. This is what was previously in
17698 + reiserfs_read_inode2().
17699 +
17700 + Must be called with inode locked. Return inode still locked.
17701 +*/
17702 +static int read_inode(struct inode *inode /* inode to read from disk */ ,
17703 + const reiser4_key * key /* key of stat data */ ,
17704 + int silent)
17705 +{
17706 + int result;
17707 + lock_handle lh;
17708 + reiser4_inode *info;
17709 + coord_t coord;
17710 +
17711 + assert("nikita-298", inode != NULL);
17712 + assert("nikita-1945", !is_inode_loaded(inode));
17713 +
17714 + info = reiser4_inode_data(inode);
17715 + assert("nikita-300", info->locality_id != 0);
17716 +
17717 + coord_init_zero(&coord);
17718 + init_lh(&lh);
17719 + /* locate stat-data in a tree and return znode locked */
17720 + result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17721 + assert("nikita-301", !is_inode_loaded(inode));
17722 + if (result == 0) {
17723 + /* use stat-data plugin to load sd into inode. */
17724 + result = init_inode(inode, &coord);
17725 + if (result == 0) {
17726 + /* initialize stat-data seal */
17727 + spin_lock_inode(inode);
17728 + reiser4_seal_init(&info->sd_seal, &coord, key);
17729 + info->sd_coord = coord;
17730 + spin_unlock_inode(inode);
17731 +
17732 + /* call file plugin's method to initialize plugin
17733 + * specific part of inode */
17734 + if (inode_file_plugin(inode)->init_inode_data)
17735 + inode_file_plugin(inode)->init_inode_data(inode,
17736 + NULL,
17737 + 0);
17738 + /* load detached directory cursors for stateless
17739 + * directory readers (NFS). */
17740 + reiser4_load_cursors(inode);
17741 +
17742 + /* Check the opened inode for consistency. */
17743 + result =
17744 + get_super_private(inode->i_sb)->df_plug->
17745 + check_open(inode);
17746 + }
17747 + }
17748 + /* lookup_sd() doesn't release coord because we want the znode to
17749 + stay read-locked while stat-data fields are accessed in
17750 + init_inode() */
17751 + done_lh(&lh);
17752 +
17753 + if (result != 0)
17754 + reiser4_make_bad_inode(inode);
17755 + return result;
17756 +}
17757 +
17758 +/* initialise new reiser4 inode being inserted into hash table. */
17759 +static int init_locked_inode(struct inode *inode /* new inode */ ,
17760 + void *opaque /* key of stat data passed to
17761 + * the iget5_locked as cookie */)
17762 +{
17763 + reiser4_key *key;
17764 +
17765 + assert("nikita-1995", inode != NULL);
17766 + assert("nikita-1996", opaque != NULL);
17767 + key = opaque;
17768 + set_inode_oid(inode, get_key_objectid(key));
17769 + reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17770 + return 0;
17771 +}
17772 +
17773 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to
17774 + iget5_locked().
17775 +
17776 + This function is called by iget5_locked() to distinguish reiser4 inodes
17777 + having the same inode numbers. Such inodes can only exist due to some error
17778 + condition. One of them should be bad. Inodes with identical inode numbers
17779 + (objectids) are distinguished by their packing locality.
17780 +
17781 +*/
17782 +static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table
17783 + * to check */ ,
17784 + void *opaque /* "cookie" passed to
17785 + * iget5_locked(). This
17786 + * is stat-data key */)
17787 +{
17788 + reiser4_key *key;
17789 +
17790 + key = opaque;
17791 + return
17792 + /* oid is unique, so first term is enough, actually. */
17793 + get_inode_oid(inode) == get_key_objectid(key) &&
17794 + /*
17795 + * also, locality should be checked, but locality is stored in
17796 + * the reiser4-specific part of the inode, and the actor can
17797 + * be called against an arbitrary inode that happens to be in
17798 + * this hash chain. Hence we first have to check that this is
17799 + * a reiser4 inode at least. is_reiser4_inode() is probably
17800 + * too early to call, as the inode may not yet have ->i_op
17801 + * initialised.
17802 + */
17803 + is_reiser4_super(inode->i_sb) &&
17804 + /*
17805 + * usually the objectid is unique, but pseudo files use a
17806 + * counter to generate objectids. All pseudo files are placed
17807 + * into a special (otherwise unused) locality.
17808 + */
17809 + reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17810 +}
17811 +
17812 +/* hook for kmem_cache_create */
17813 +void loading_init_once(reiser4_inode * info)
17814 +{
17815 + mutex_init(&info->loading);
17816 +}
17817 +
17818 +/* for reiser4_alloc_inode */
17819 +void loading_alloc(reiser4_inode * info)
17820 +{
17821 + assert("vs-1717", !mutex_is_locked(&info->loading));
17822 +}
17823 +
17824 +/* for reiser4_destroy */
17825 +void loading_destroy(reiser4_inode * info)
17826 +{
17827 + assert("vs-1717a", !mutex_is_locked(&info->loading));
17828 +}
17829 +
17830 +static void loading_begin(reiser4_inode * info)
17831 +{
17832 + mutex_lock(&info->loading);
17833 +}
17834 +
17835 +static void loading_end(reiser4_inode * info)
17836 +{
17837 + mutex_unlock(&info->loading);
17838 +}
17839 +
17840 +/**
17841 + * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17842 + * @super: super block of filesystem
17843 + * @key: key of inode's stat-data
17844 + * @silent:
17845 + *
17846 + * This is our helper function a la iget(). It is called by
17847 + * lookup_common() and reiser4_read_super(). Returns the inode locked, or
17848 + * the error encountered.
17849 + */
17850 +struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17851 + int silent)
17852 +{
17853 + struct inode *inode;
17854 + int result;
17855 + reiser4_inode *info;
17856 +
17857 + assert("nikita-302", super != NULL);
17858 + assert("nikita-303", key != NULL);
17859 +
17860 + result = 0;
17861 +
17862 + /* call iget(). Our ->read_inode() is dummy, so this will either
17863 + find inode in cache or return uninitialised inode */
17864 + inode = iget5_locked(super,
17865 + (unsigned long)get_key_objectid(key),
17866 + reiser4_inode_find_actor,
17867 + init_locked_inode, (reiser4_key *) key);
17868 + if (inode == NULL)
17869 + return ERR_PTR(RETERR(-ENOMEM));
17870 + if (is_bad_inode(inode)) {
17871 + warning("nikita-304", "Bad inode found");
17872 + reiser4_print_key("key", key);
17873 + iput(inode);
17874 + return ERR_PTR(RETERR(-EIO));
17875 + }
17876 +
17877 + info = reiser4_inode_data(inode);
17878 +
17879 + /* The reiser4 inode state bit REISER4_LOADED is used to distinguish a
17880 + fully loaded and initialized inode from a just-allocated one. If the
17881 + REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17882 + info->loading. The place in reiser4 which uses a not yet initialized
17883 + inode is the reiser4 repacker, see repacker-related functions in
17884 + plugin/item/extent.c */
17885 + if (!is_inode_loaded(inode)) {
17886 + loading_begin(info);
17887 + if (!is_inode_loaded(inode)) {
17888 + /* locking: iget5_locked returns locked inode */
17889 + assert("nikita-1941", !is_inode_loaded(inode));
17890 + assert("nikita-1949",
17891 + reiser4_inode_find_actor(inode,
17892 + (reiser4_key *) key));
17893 + /* now, inode has objectid as ->i_ino and locality in
17894 + reiser4-specific part. This is enough for
17895 + read_inode() to read stat data from the disk */
17896 + result = read_inode(inode, key, silent);
17897 + } else
17898 + loading_end(info);
17899 + }
17900 +
17901 + if (inode->i_state & I_NEW)
17902 + unlock_new_inode(inode);
17903 +
17904 + if (is_bad_inode(inode)) {
17905 + assert("vs-1717", result != 0);
17906 + loading_end(info);
17907 + iput(inode);
17908 + inode = ERR_PTR(result);
17909 + } else if (REISER4_DEBUG) {
17910 + reiser4_key found_key;
17911 +
17912 + assert("vs-1717", result == 0);
17913 + build_sd_key(inode, &found_key);
17914 + if (!keyeq(&found_key, key)) {
17915 + warning("nikita-305", "Wrong key in sd");
17916 + reiser4_print_key("sought for", key);
17917 + reiser4_print_key("found", &found_key);
17918 + }
17919 + if (inode->i_nlink == 0) {
17920 + warning("nikita-3559", "Unlinked inode found: %llu\n",
17921 + (unsigned long long)get_inode_oid(inode));
17922 + }
17923 + }
17924 + return inode;
17925 +}
17926 +
17927 +/* reiser4_iget() may return a not fully initialized inode; this function
17928 + * should be called once reiser4-specific inode initialization completes. */
17929 +void reiser4_iget_complete(struct inode *inode)
17930 +{
17931 + assert("zam-988", is_reiser4_inode(inode));
17932 +
17933 + if (!is_inode_loaded(inode)) {
17934 + reiser4_inode_set_flag(inode, REISER4_LOADED);
17935 + loading_end(reiser4_inode_data(inode));
17936 + }
17937 +}
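To make the loading protocol above concrete, here is a minimal caller-side sketch (the wrapper name open_by_key() is hypothetical, not part of reiser4): reiser4_iget() returns with info->loading still held whenever it loaded the inode itself, so the caller finishes with reiser4_iget_complete().

    static struct inode *open_by_key(struct super_block *super,
                                     const reiser4_key *key)
    {
            struct inode *inode;

            inode = reiser4_iget(super, key, 0);
            if (IS_ERR(inode))
                    return inode;   /* error path already dropped ->loading */
            /* ... caller-specific initialization would go here ... */
            reiser4_iget_complete(inode); /* sets REISER4_LOADED, unlocks */
            return inode;
    }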
17938 +
17939 +void reiser4_make_bad_inode(struct inode *inode)
17940 +{
17941 + assert("nikita-1934", inode != NULL);
17942 +
17943 + /* clear LOADED bit */
17944 + reiser4_inode_clr_flag(inode, REISER4_LOADED);
17945 + make_bad_inode(inode);
17946 + return;
17947 +}
17948 +
17949 +file_plugin *inode_file_plugin(const struct inode *inode)
17950 +{
17951 + assert("nikita-1997", inode != NULL);
17952 + return reiser4_inode_data(inode)->pset->file;
17953 +}
17954 +
17955 +dir_plugin *inode_dir_plugin(const struct inode *inode)
17956 +{
17957 + assert("nikita-1998", inode != NULL);
17958 + return reiser4_inode_data(inode)->pset->dir;
17959 +}
17960 +
17961 +formatting_plugin *inode_formatting_plugin(const struct inode *inode)
17962 +{
17963 + assert("nikita-2000", inode != NULL);
17964 + return reiser4_inode_data(inode)->pset->formatting;
17965 +}
17966 +
17967 +hash_plugin *inode_hash_plugin(const struct inode *inode)
17968 +{
17969 + assert("nikita-2001", inode != NULL);
17970 + return reiser4_inode_data(inode)->pset->hash;
17971 +}
17972 +
17973 +fibration_plugin *inode_fibration_plugin(const struct inode *inode)
17974 +{
17975 + assert("nikita-2001", inode != NULL);
17976 + return reiser4_inode_data(inode)->pset->fibration;
17977 +}
17978 +
17979 +cipher_plugin *inode_cipher_plugin(const struct inode *inode)
17980 +{
17981 + assert("edward-36", inode != NULL);
17982 + return reiser4_inode_data(inode)->pset->cipher;
17983 +}
17984 +
17985 +compression_plugin *inode_compression_plugin(const struct inode *inode)
17986 +{
17987 + assert("edward-37", inode != NULL);
17988 + return reiser4_inode_data(inode)->pset->compression;
17989 +}
17990 +
17991 +compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17992 + inode)
17993 +{
17994 + assert("edward-1330", inode != NULL);
17995 + return reiser4_inode_data(inode)->pset->compression_mode;
17996 +}
17997 +
17998 +cluster_plugin *inode_cluster_plugin(const struct inode *inode)
17999 +{
18000 + assert("edward-1328", inode != NULL);
18001 + return reiser4_inode_data(inode)->pset->cluster;
18002 +}
18003 +
18004 +file_plugin *inode_create_plugin(const struct inode *inode)
18005 +{
18006 + assert("edward-1329", inode != NULL);
18007 + return reiser4_inode_data(inode)->pset->create;
18008 +}
18009 +
18010 +digest_plugin *inode_digest_plugin(const struct inode *inode)
18011 +{
18012 + assert("edward-86", inode != NULL);
18013 + return reiser4_inode_data(inode)->pset->digest;
18014 +}
18015 +
18016 +item_plugin *inode_sd_plugin(const struct inode *inode)
18017 +{
18018 + assert("vs-534", inode != NULL);
18019 + return reiser4_inode_data(inode)->pset->sd;
18020 +}
18021 +
18022 +item_plugin *inode_dir_item_plugin(const struct inode *inode)
18023 +{
18024 + assert("vs-534", inode != NULL);
18025 + return reiser4_inode_data(inode)->pset->dir_item;
18026 +}
18027 +
18028 +file_plugin *child_create_plugin(const struct inode *inode)
18029 +{
18030 + assert("edward-1329", inode != NULL);
18031 + return reiser4_inode_data(inode)->hset->create;
18032 +}
18033 +
18034 +void inode_set_extension(struct inode *inode, sd_ext_bits ext)
18035 +{
18036 + reiser4_inode *state;
18037 +
18038 + assert("nikita-2716", inode != NULL);
18039 + assert("nikita-2717", ext < LAST_SD_EXTENSION);
18040 + assert("nikita-3491", spin_inode_is_locked(inode));
18041 +
18042 + state = reiser4_inode_data(inode);
18043 + state->extmask |= 1 << ext;
18044 + /* force re-calculation of stat-data length on next call to
18045 + update_sd(). */
18046 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18047 +}
18048 +
18049 +void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
18050 +{
18051 + reiser4_inode *state;
18052 +
18053 + assert("vpf-1926", inode != NULL);
18054 + assert("vpf-1927", ext < LAST_SD_EXTENSION);
18055 + assert("vpf-1928", spin_inode_is_locked(inode));
18056 +
18057 + state = reiser4_inode_data(inode);
18058 + state->extmask &= ~(1 << ext);
18059 + /* force re-calculation of stat-data length on next call to
18060 + update_sd(). */
18061 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18062 +}
18063 +
18064 +void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
18065 +{
18066 + assert("edward-1287", inode != NULL);
18067 + if (!dscale_fit(old, new))
18068 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18069 + return;
18070 +}
18071 +
18072 +void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
18073 +{
18074 + assert("nikita-2875", inode != NULL);
18075 + spin_lock_inode(inode);
18076 + inode_check_scale_nolock(inode, old, new);
18077 + spin_unlock_inode(inode);
18078 +}
18079 +
18080 +/*
18081 + * initialize ->ordering field of inode. This field defines how file stat-data
18082 + * and body are ordered within a tree with respect to other objects within the
18083 + * same parent directory.
18084 + */
18085 +void
18086 +init_inode_ordering(struct inode *inode,
18087 + reiser4_object_create_data * crd, int create)
18088 +{
18089 + reiser4_key key;
18090 +
18091 + if (create) {
18092 + struct inode *parent;
18093 +
18094 + parent = crd->parent;
18095 + assert("nikita-3224", inode_dir_plugin(parent) != NULL);
18096 + inode_dir_plugin(parent)->build_entry_key(parent,
18097 + &crd->dentry->d_name,
18098 + &key);
18099 + } else {
18100 + coord_t *coord;
18101 +
18102 + coord = &reiser4_inode_data(inode)->sd_coord;
18103 + coord_clear_iplug(coord);
18104 + /* safe to use ->sd_coord, because node is under long term
18105 + * lock */
18106 + WITH_DATA(coord->node, item_key_by_coord(coord, &key));
18107 + }
18108 +
18109 + set_inode_ordering(inode, get_key_ordering(&key));
18110 +}
18111 +
18112 +znode *inode_get_vroot(struct inode *inode)
18113 +{
18114 + reiser4_block_nr blk;
18115 + znode *result;
18116 +
18117 + spin_lock_inode(inode);
18118 + blk = reiser4_inode_data(inode)->vroot;
18119 + spin_unlock_inode(inode);
18120 + if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
18121 + result = zlook(reiser4_tree_by_inode(inode), &blk);
18122 + else
18123 + result = NULL;
18124 + return result;
18125 +}
18126 +
18127 +void inode_set_vroot(struct inode *inode, znode *vroot)
18128 +{
18129 + spin_lock_inode(inode);
18130 + reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
18131 + spin_unlock_inode(inode);
18132 +}
18133 +
18134 +#if REISER4_DEBUG
18135 +
18136 +void reiser4_inode_invariant(const struct inode *inode)
18137 +{
18138 + assert("nikita-3077", spin_inode_is_locked(inode));
18139 +}
18140 +
18141 +int inode_has_no_jnodes(reiser4_inode * r4_inode)
18142 +{
18143 + return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
18144 + r4_inode->nr_jnodes == 0;
18145 +}
18146 +
18147 +#endif
18148 +
18149 +/* true if directory is empty (only contains dot and dotdot) */
18150 +/* FIXME: shouldn't it be dir plugin method? */
18151 +int is_dir_empty(const struct inode *dir)
18152 +{
18153 + assert("nikita-1976", dir != NULL);
18154 +
18155 + /* rely on our method to maintain directory i_size being equal to the
18156 + number of entries. */
18157 + return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
18158 +}
18159 +
18160 +/* Make Linus happy.
18161 + Local variables:
18162 + c-indentation-style: "K&R"
18163 + mode-name: "LC"
18164 + c-basic-offset: 8
18165 + tab-width: 8
18166 + fill-column: 120
18167 + End:
18168 +*/
18169 diff -urN linux-2.6.33.orig/fs/reiser4/inode.h linux-2.6.33/fs/reiser4/inode.h
18170 --- linux-2.6.33.orig/fs/reiser4/inode.h 1970-01-01 01:00:00.000000000 +0100
18171 +++ linux-2.6.33/fs/reiser4/inode.h 2010-03-04 19:33:22.000000000 +0100
18172 @@ -0,0 +1,453 @@
18173 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18174 + reiser4/README */
18175 +
18176 +/* Inode functions. */
18177 +
18178 +#if !defined(__REISER4_INODE_H__)
18179 +#define __REISER4_INODE_H__
18180 +
18181 +#include "forward.h"
18182 +#include "debug.h"
18183 +#include "key.h"
18184 +#include "seal.h"
18185 +#include "plugin/plugin.h"
18186 +#include "plugin/file/cryptcompress.h"
18187 +#include "plugin/file/file.h"
18188 +#include "plugin/dir/dir.h"
18189 +#include "plugin/plugin_set.h"
18190 +#include "plugin/security/perm.h"
18191 +#include "vfs_ops.h"
18192 +#include "jnode.h"
18193 +#include "fsdata.h"
18194 +
18195 +#include <linux/types.h> /* for __u?? , ino_t */
18196 +#include <linux/fs.h> /* for struct super_block, struct
18197 + * rw_semaphore, etc */
18198 +#include <linux/spinlock.h>
18199 +#include <asm/types.h>
18200 +
18201 +/* reiser4-specific inode flags. They are "transient" and are not
18202 + supposed to be stored on disk. Used to track the "state" of an
18203 + inode
18204 +*/
18205 +typedef enum {
18206 + /* this is light-weight inode, inheriting some state from its
18207 + parent */
18208 + REISER4_LIGHT_WEIGHT = 0,
18209 + /* stat data wasn't yet created */
18210 + REISER4_NO_SD = 1,
18211 + /* internal immutable flag. Currently is only used
18212 + to avoid race condition during file creation.
18213 + See comment in create_object(). */
18214 + REISER4_IMMUTABLE = 2,
18215 + /* inode was read from storage */
18216 + REISER4_LOADED = 3,
18217 + /* this bit is set for symlinks. inode->i_private points to target
18218 + name of symlink. */
18219 + REISER4_GENERIC_PTR_USED = 4,
18220 + /* set if size of stat-data item for this inode is known. If this is
18221 + * set we can avoid recalculating size of stat-data on each update. */
18222 + REISER4_SDLEN_KNOWN = 5,
18223 + /* reiser4_inode->crypt points to the crypto stat */
18224 + REISER4_CRYPTO_STAT_LOADED = 6,
18225 + /* cryptcompress_inode_data points to the secret key */
18226 + REISER4_SECRET_KEY_INSTALLED = 7,
18227 + /* File (possibly) has pages corresponding to the tail items, that
18228 + * were created by ->readpage. It is set by mmap_unix_file() and
18229 + * sendfile_unix_file(). This bit is inspected by write_unix_file and
18230 + * kill-hook of tail items. It is never cleared once set. This bit is
18231 + * modified and inspected under i_mutex. */
18232 + REISER4_HAS_MMAP = 8,
18233 + REISER4_PART_MIXED = 9,
18234 + REISER4_PART_IN_CONV = 10,
18235 + /* This flag indicates that file plugin conversion is in progress */
18236 + REISER4_FILE_CONV_IN_PROGRESS = 11
18237 +} reiser4_file_plugin_flags;
18238 +
18239 +/* state associated with each inode.
18240 + reiser4 inode.
18241 +
18242 + NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
18243 + be of the same size. File-system allocates inodes by itself through
18244 + s_op->allocate_inode() method. So, it is possible to adjust size of inode
18245 + at the time of its creation.
18246 +
18247 + Invariants involving parts of this data-type:
18248 +
18249 + [inode->eflushed]
18250 +
18251 +*/
18252 +
18253 +typedef struct reiser4_inode reiser4_inode;
18254 +/* return pointer to reiser4-specific part of inode */
18255 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18256 + /* inode queried */ );
18257 +
18258 +#if BITS_PER_LONG == 64
18259 +
18260 +#define REISER4_INO_IS_OID (1)
18261 +typedef struct {;
18262 +} oid_hi_t;
18263 +
18264 +/* BITS_PER_LONG == 64 */
18265 +#else
18266 +
18267 +#define REISER4_INO_IS_OID (0)
18268 +typedef __u32 oid_hi_t;
18269 +
18270 +/* BITS_PER_LONG == 64 */
18271 +#endif
18272 +
18273 +struct reiser4_inode {
18274 + /* spin lock protecting fields of this structure. */
18275 + spinlock_t guard;
18276 + /* main plugin set that controls the file
18277 + (see comments in plugin/plugin_set.c) */
18278 + plugin_set *pset;
18279 + /* plugin set for inheritance
18280 + (see comments in plugin/plugin_set.c) */
18281 + plugin_set *hset;
18282 + /* high 32 bits of object id */
18283 + oid_hi_t oid_hi;
18284 + /* seal for stat-data */
18285 + seal_t sd_seal;
18286 + /* locality id for this file */
18287 + oid_t locality_id;
18288 +#if REISER4_LARGE_KEY
18289 + __u64 ordering;
18290 +#endif
18291 + /* coord of stat-data in sealed node */
18292 + coord_t sd_coord;
18293 + /* bit-mask of stat-data extensions used by this file */
18294 + __u64 extmask;
18295 + /* bitmask of non-default plugins for this inode */
18296 + __u16 plugin_mask;
18297 + /* bitmask of set heir plugins for this inode. */
18298 + __u16 heir_mask;
18299 + union {
18300 + struct list_head readdir_list;
18301 + struct list_head not_used;
18302 + } lists;
18303 + /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
18304 + unsigned long flags;
18305 + union {
18306 + /* fields specific to unix_file plugin */
18307 + struct unix_file_info unix_file_info;
18308 + /* fields specific to cryptcompress file plugin */
18309 + struct cryptcompress_info cryptcompress_info;
18310 + } file_plugin_data;
18311 +
18312 + /* this semaphore is to serialize readers and writers of @pset->file
18313 + * when file plugin conversion is enabled
18314 + */
18315 + struct rw_semaphore conv_sem;
18316 +
18317 + /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
18318 + tagged in that tree by EFLUSH_TAG_ANONYMOUS */
18319 + struct radix_tree_root jnodes_tree;
18320 +#if REISER4_DEBUG
18321 + /* number of unformatted node jnodes of this file in jnode hash table */
18322 + unsigned long nr_jnodes;
18323 +#endif
18324 +
18325 + /* block number of virtual root for this object. See comment above
18326 + * fs/reiser4/search.c:handle_vroot() */
18327 + reiser4_block_nr vroot;
18328 + struct mutex loading;
18329 +};
18330 +
18331 +void loading_init_once(reiser4_inode *);
18332 +void loading_alloc(reiser4_inode *);
18333 +void loading_destroy(reiser4_inode *);
18334 +
18335 +struct reiser4_inode_object {
18336 + /* private part */
18337 + reiser4_inode p;
18338 + /* generic fields not specific to reiser4, but used by VFS */
18339 + struct inode vfs_inode;
18340 +};
18341 +
18342 +/* return pointer to the reiser4 specific portion of @inode */
18343 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18344 + /* inode queried */ )
18345 +{
18346 + assert("nikita-254", inode != NULL);
18347 + return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p;
18348 +}
18349 +
18350 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18351 + r4_inode /* inode queried */
18352 + )
18353 +{
18354 + return &container_of(r4_inode, struct reiser4_inode_object,
18355 + p)->vfs_inode;
18356 +}
18357 +
18358 +/*
18359 + * reiser4 inodes are identified by a 64-bit object-id (oid_t), but the
18360 + * ->i_ino field in struct inode is of type ino_t (long), which can be
18361 + * either 32 or 64 bits wide.
18362 + *
18363 + * If ->i_ino is 32 bits we store the remaining 32 bits in the reiser4
18364 + * specific part of the inode; otherwise the whole oid is stored in i_ino.
18365 + *
18366 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18367 + */
18368 +
18369 +#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18370 +
18371 +#if REISER4_INO_IS_OID
18372 +
18373 +static inline oid_t get_inode_oid(const struct inode *inode)
18374 +{
18375 + return inode->i_ino;
18376 +}
18377 +
18378 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18379 +{
18380 + inode->i_ino = oid;
18381 +}
18382 +
18383 +/* REISER4_INO_IS_OID */
18384 +#else
18385 +
18386 +static inline oid_t get_inode_oid(const struct inode *inode)
18387 +{
18388 + return
18389 + ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18390 + inode->i_ino;
18391 +}
18392 +
18393 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18394 +{
18395 + assert("nikita-2519", inode != NULL);
18396 + inode->i_ino = (ino_t) (oid);
18397 + reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18398 + assert("nikita-2521", get_inode_oid(inode) == (oid));
18399 +}
18400 +
18401 +/* REISER4_INO_IS_OID */
18402 +#endif
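A quick worked example of the split performed by the wrappers above when ino_t is 32 bits (so OID_HI_SHIFT == 32); the value is arbitrary and for illustration only:

    oid_t oid = 0x0000002a00000007ULL;
    /* set_inode_oid() stores the halves:                  */
    /*   inode->i_ino                      = 0x00000007    */
    /*   reiser4_inode_data(inode)->oid_hi = 0x0000002a    */
    /* get_inode_oid() reassembles:                        */
    /*   ((__u64)oid_hi << OID_HI_SHIFT) | i_ino == oid    */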
18403 +
18404 +static inline oid_t get_inode_locality(const struct inode *inode)
18405 +{
18406 + return reiser4_inode_data(inode)->locality_id;
18407 +}
18408 +
18409 +#if REISER4_LARGE_KEY
18410 +static inline __u64 get_inode_ordering(const struct inode *inode)
18411 +{
18412 + return reiser4_inode_data(inode)->ordering;
18413 +}
18414 +
18415 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18416 +{
18417 + reiser4_inode_data(inode)->ordering = ordering;
18418 +}
18419 +
18420 +#else
18421 +
18422 +#define get_inode_ordering(inode) (0)
18423 +#define set_inode_ordering(inode, val) noop
18424 +
18425 +#endif
18426 +
18427 +/* return inode in which @uf_info is embedded */
18428 +static inline struct inode *
18429 +unix_file_info_to_inode(const struct unix_file_info *uf_info)
18430 +{
18431 + return &container_of(uf_info, struct reiser4_inode_object,
18432 + p.file_plugin_data.unix_file_info)->vfs_inode;
18433 +}
18434 +
18435 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18436 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18437 +
18438 +extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18439 +
18440 +#if REISER4_DEBUG
18441 +extern void reiser4_inode_invariant(const struct inode *inode);
18442 +extern int inode_has_no_jnodes(reiser4_inode *);
18443 +#else
18444 +#define reiser4_inode_invariant(inode) noop
18445 +#endif
18446 +
18447 +static inline int spin_inode_is_locked(const struct inode *inode)
18448 +{
18449 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18450 + return 1;
18451 +}
18452 +
18453 +/**
18454 + * spin_lock_inode - lock reiser4_inode's embedded spinlock
18455 + * @inode: inode to lock
18456 + *
18457 + * In debug mode it checks that lower priority locks are not held and
18458 + * increments reiser4_context's lock counters on which lock ordering checking
18459 + * is based.
18460 + */
18461 +static inline void spin_lock_inode(struct inode *inode)
18462 +{
18463 + assert("", LOCK_CNT_NIL(spin_locked));
18464 + /* check lock ordering */
18465 + assert_spin_not_locked(&d_lock);
18466 +
18467 + spin_lock(&reiser4_inode_data(inode)->guard);
18468 +
18469 + LOCK_CNT_INC(spin_locked_inode);
18470 + LOCK_CNT_INC(spin_locked);
18471 +
18472 + reiser4_inode_invariant(inode);
18473 +}
18474 +
18475 +/**
18476 + * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18477 + * @inode: inode to unlock
18478 + *
18479 + * In debug mode it checks that spinlock is held and decrements
18480 + * reiser4_context's lock counters on which lock ordering checking is based.
18481 + */
18482 +static inline void spin_unlock_inode(struct inode *inode)
18483 +{
18484 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18485 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18486 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18487 +
18488 + reiser4_inode_invariant(inode);
18489 +
18490 + LOCK_CNT_DEC(spin_locked_inode);
18491 + LOCK_CNT_DEC(spin_locked);
18492 +
18493 + spin_unlock(&reiser4_inode_data(inode)->guard);
18494 +}
18495 +
18496 +extern znode *inode_get_vroot(struct inode *inode);
18497 +extern void inode_set_vroot(struct inode *inode, znode * vroot);
18498 +
18499 +extern int reiser4_max_filename_len(const struct inode *inode);
18500 +extern int max_hash_collisions(const struct inode *dir);
18501 +extern void reiser4_unlock_inode(struct inode *inode);
18502 +extern int is_reiser4_inode(const struct inode *inode);
18503 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18504 +extern struct inode *reiser4_iget(struct super_block *super,
18505 + const reiser4_key * key, int silent);
18506 +extern void reiser4_iget_complete(struct inode *inode);
18507 +extern void reiser4_inode_set_flag(struct inode *inode,
18508 + reiser4_file_plugin_flags f);
18509 +extern void reiser4_inode_clr_flag(struct inode *inode,
18510 + reiser4_file_plugin_flags f);
18511 +extern int reiser4_inode_get_flag(const struct inode *inode,
18512 + reiser4_file_plugin_flags f);
18513 +
18514 +/* has inode been initialized? */
18515 +static inline int
18516 +is_inode_loaded(const struct inode *inode/* inode queried */)
18517 +{
18518 + assert("nikita-1120", inode != NULL);
18519 + return reiser4_inode_get_flag(inode, REISER4_LOADED);
18520 +}
18521 +
18522 +extern file_plugin *inode_file_plugin(const struct inode *inode);
18523 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18524 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18525 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18526 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18527 +extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18528 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18529 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18530 +extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18531 + *inode);
18532 +extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18533 +extern file_plugin *inode_create_plugin(const struct inode *inode);
18534 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
18535 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18536 +extern file_plugin *child_create_plugin(const struct inode *inode);
18537 +
18538 +extern void reiser4_make_bad_inode(struct inode *inode);
18539 +
18540 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18541 +extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18542 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18543 +extern void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new);
18544 +
18545 +#define INODE_SET_SIZE(i, value) \
18546 +({ \
18547 + struct inode *__i; \
18548 + typeof(value) __v; \
18549 + \
18550 + __i = (i); \
18551 + __v = (value); \
18552 + inode_check_scale(__i, __i->i_size, __v); \
18553 + i_size_write(__i, __v); \
18554 +})
18555 +
18556 +/*
18557 + * update field @field in inode @i to contain value @value.
18558 + */
18559 +#define INODE_SET_FIELD(i, field, value) \
18560 +({ \
18561 + struct inode *__i; \
18562 + typeof(value) __v; \
18563 + \
18564 + __i = (i); \
18565 + __v = (value); \
18566 + inode_check_scale(__i, __i->field, __v); \
18567 + __i->field = __v; \
18568 +})
18569 +
18570 +#define INODE_INC_FIELD(i, field) \
18571 +({ \
18572 + struct inode *__i; \
18573 + \
18574 + __i = (i); \
18575 + inode_check_scale(__i, __i->field, __i->field + 1); \
18576 + ++ __i->field; \
18577 +})
18578 +
18579 +#define INODE_DEC_FIELD(i, field) \
18580 +({ \
18581 + struct inode *__i; \
18582 + \
18583 + __i = (i); \
18584 + inode_check_scale(__i, __i->field, __i->field - 1); \
18585 + -- __i->field; \
18586 +})
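A short usage sketch for the accessors above (the field and values are arbitrary): each update is routed through inode_check_scale(), so REISER4_SDLEN_KNOWN is cleared whenever the new value would no longer fit the old dscale encoding.

    INODE_SET_SIZE(inode, (loff_t)1 << 20);  /* i_size via i_size_write() */
    INODE_SET_FIELD(inode, i_generation, 7); /* arbitrary field and value */
    INODE_INC_FIELD(inode, i_nlink);
    INODE_DEC_FIELD(inode, i_nlink);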
18587 +
18588 +/* See comment before reiser4_readdir_common() for description. */
18589 +static inline struct list_head *get_readdir_list(const struct inode *inode)
18590 +{
18591 + return &reiser4_inode_data(inode)->lists.readdir_list;
18592 +}
18593 +
18594 +extern void init_inode_ordering(struct inode *inode,
18595 + reiser4_object_create_data * crd, int create);
18596 +
18597 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18598 +{
18599 + return &reiser4_inode_data(inode)->jnodes_tree;
18600 +}
18601 +
18602 +static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18603 + *r4_inode)
18604 +{
18605 + return &r4_inode->jnodes_tree;
18606 +}
18607 +
18608 +#if REISER4_DEBUG
18609 +extern void print_inode(const char *prefix, const struct inode *i);
18610 +#endif
18611 +
18612 +int is_dir_empty(const struct inode *);
18613 +
18614 +/* __REISER4_INODE_H__ */
18615 +#endif
18616 +
18617 +/* Make Linus happy.
18618 + Local variables:
18619 + c-indentation-style: "K&R"
18620 + mode-name: "LC"
18621 + c-basic-offset: 8
18622 + tab-width: 8
18623 + fill-column: 120
18624 + End:
18625 +*/
18626 diff -urN linux-2.6.33.orig/fs/reiser4/ioctl.h linux-2.6.33/fs/reiser4/ioctl.h
18627 --- linux-2.6.33.orig/fs/reiser4/ioctl.h 1970-01-01 01:00:00.000000000 +0100
18628 +++ linux-2.6.33/fs/reiser4/ioctl.h 2010-03-04 19:33:22.000000000 +0100
18629 @@ -0,0 +1,41 @@
18630 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18631 + * reiser4/README */
18632 +
18633 +#if !defined(__REISER4_IOCTL_H__)
18634 +#define __REISER4_IOCTL_H__
18635 +
18636 +#include <linux/fs.h>
18637 +
18638 +/*
18639 + * ioctl(2) command used to "unpack" a reiser4 file, that is, convert it into
18640 + * extents and fix it in that state. This is used by applications that rely on
18641 + *
18642 + * . files being block aligned, and
18643 + *
18644 + * . files never migrating on disk
18645 + *
18646 + * for example, boot loaders (LILO) need this.
18647 + *
18648 + * This ioctl should be used as
18649 + *
18650 + * result = ioctl(fd, REISER4_IOC_UNPACK);
18651 + *
18652 + * The file behind the fd descriptor will be converted to extents (if
18653 + * necessary), and its stat-data will be updated so that it will never be
18654 + * converted back into tails again.
18655 + */
18656 +#define REISER4_IOC_UNPACK _IOW(0xCD, 1, long)
18657 +
18658 +/* __REISER4_IOCTL_H__ */
18659 +#endif
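A minimal userspace sketch of the call sequence described in the comment above (the path is arbitrary and error handling is trimmed; only the ioctl itself is taken from this header):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    /* assumes REISER4_IOC_UNPACK from fs/reiser4/ioctl.h */

    int main(void)
    {
            /* unpack a file so a boot loader can rely on its layout */
            int fd = open("/boot/vmlinuz", O_RDONLY);

            if (fd < 0 || ioctl(fd, REISER4_IOC_UNPACK) < 0)
                    perror("REISER4_IOC_UNPACK");
            if (fd >= 0)
                    close(fd);
            return 0;
    }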
18660 +
18661 +/* Make Linus happy.
18662 + Local variables:
18663 + c-indentation-style: "K&R"
18664 + mode-name: "LC"
18665 + c-basic-offset: 8
18666 + tab-width: 8
18667 + fill-column: 120
18668 + scroll-step: 1
18669 + End:
18670 +*/
18671 diff -urN linux-2.6.33.orig/fs/reiser4/jnode.c linux-2.6.33/fs/reiser4/jnode.c
18672 --- linux-2.6.33.orig/fs/reiser4/jnode.c 1970-01-01 01:00:00.000000000 +0100
18673 +++ linux-2.6.33/fs/reiser4/jnode.c 2010-03-04 19:33:22.000000000 +0100
18674 @@ -0,0 +1,1923 @@
18675 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18676 + * reiser4/README */
18677 +/* Jnode manipulation functions. */
18678 +/* Jnode is entity used to track blocks with data and meta-data in reiser4.
18679 +
18680 + In particular, jnodes are used to track transactional information
18681 + associated with each block. Each znode contains jnode as ->zjnode field.
18682 +
18683 + Jnode stands for either Josh or Journal node.
18684 +*/
18685 +
18686 +/*
18687 + * Taxonomy.
18688 + *
18689 + * Jnode represents block containing data or meta-data. There are jnodes
18690 + * for:
18691 + *
18692 + * unformatted blocks (jnodes proper). There are plans, however, to
18693 + * have a handle per extent unit rather than one per unformatted
18694 + * block, because there are so many of them.
18695 + *
18696 + * For bitmaps. Each bitmap is actually represented by two jnodes--one
18697 + * for working data and another for "commit" data, together forming a bnode.
18698 + *
18699 + * For io-heads. These are used by log writer.
18700 + *
18701 + * For formatted nodes (znode). See comment at the top of znode.c for
18702 + * details specific to the formatted nodes (znodes).
18703 + *
18704 + * Node data.
18705 + *
18706 + * Jnode provides access to the data of node it represents. Data are
18707 + * stored in a page. The page is kept in the page cache. This means that jnodes
18708 + * are highly interconnected with page cache and VM internals.
18709 + *
18710 + * jnode has a pointer to page (->pg) containing its data. Pointer to data
18711 + * themselves is cached in ->data field to avoid frequent calls to
18712 + * page_address().
18713 + *
18714 + * jnode and page are attached to each other by jnode_attach_page(). This
18715 + * function places pointer to jnode in set_page_private(), sets PG_private
18716 + * flag and increments page counter.
18717 + *
18718 + * Opposite operation is performed by page_clear_jnode().
18719 + *
18720 + * jnode->pg is protected by jnode spin lock, and page->private is
18721 + * protected by page lock. See comment at the top of page_cache.c for
18722 + * more.
18723 + *
18724 + * page can be detached from jnode for two reasons:
18725 + *
18726 + * . jnode is removed from a tree (file is truncated, or formatted
18727 + * node is removed by balancing).
18728 + *
18729 + * . during memory pressure, VM calls ->releasepage() method
18730 + * (reiser4_releasepage()) to evict page from memory.
18731 + *
18732 + * (there, of course, is also umount, but this is special case we are not
18733 + * concerned with here).
18734 + *
18735 + * To protect jnode page from eviction, one calls jload() function that
18736 + * "pins" page in memory (loading it if necessary), increments
18737 + * jnode->d_count, and kmap()s page. Page is unpinned through call to
18738 + * jrelse().
18739 + *
18740 + * Jnode life cycle.
18741 + *
18742 + * jnode is created, placed in hash table, and, optionally, in per-inode
18743 + * radix tree. Page can be attached to jnode, pinned, released, etc.
18744 + *
18745 + * When jnode is captured into atom its reference counter is
18746 + * increased. While being part of an atom, jnode can be "early
18747 + * flushed". This means that as part of flush procedure, jnode is placed
18748 + * into "relocate set", and its page is submitted to the disk. After io
18749 + * completes, page can be detached, then loaded again, re-dirtied, etc.
18750 + *
18751 + * A thread acquires a reference to a jnode by calling jref() and releases it
18752 + * by jput(). When the last reference is removed, the jnode is still retained
18753 + * in memory (cached) if it has a page attached, _unless_ it is scheduled for
18754 + * destruction (has JNODE_HEARD_BANSHEE bit set).
18755 + *
18756 + * The tree read-write lock was used as an "existential" lock for jnodes.
18757 + * That is, jnode->x_count could be changed from 0 to 1 only under the tree
18758 + * write lock; the tree lock thus protected unreferenced jnodes stored in
18759 + * the hash table from recycling.
18760 + *
18761 + * This resulted in high contention on the tree lock, because jref()/jput()
18762 + * is a frequent operation. To ameliorate this problem, RCU is used: when
18763 + * jput() is just about to release the last reference on a jnode it sets the
18764 + * JNODE_RIP bit on it, and then proceeds with jnode destruction (removing
18765 + * the jnode from the hash table and cbk_cache, detaching the page, etc.).
18766 + * All places that change the jnode reference counter from 0 to 1 (jlookup(),
18767 + * zlook(), zget(), and cbk_cache_scan_slots()) check for the JNODE_RIP bit
18768 + * (this is done by the jnode_rip_check() function), and pretend that nothing
18769 + * was found in the hash table if the bit is set.
18770 + *
18771 + * jput defers the actual return of the jnode to the slab cache to some later
18772 + * time (via call_rcu()); this guarantees that other threads can safely
18773 + * continue working with a JNODE_RIP-ped jnode.
18774 + *
18775 + */
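The reader side of this RCU protocol, condensed from jlookup() further below (illustrative only; per the comment above, jnode_rip_check() is expected to return NULL, as if nothing was found, when the JNODE_RIP bit is set):

    rcu_read_lock();
    node = j_hash_find(&tree->jhash_table, &jkey);
    if (node != NULL) {
            jref(node);                         /* pin before leaving RCU */
            node = jnode_rip_check(tree, node); /* NULL if being destroyed */
    }
    rcu_read_unlock();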
18776 +
18777 +#include "reiser4.h"
18778 +#include "debug.h"
18779 +#include "dformat.h"
18780 +#include "jnode.h"
18781 +#include "plugin/plugin_header.h"
18782 +#include "plugin/plugin.h"
18783 +#include "txnmgr.h"
18784 +/*#include "jnode.h"*/
18785 +#include "znode.h"
18786 +#include "tree.h"
18787 +#include "tree_walk.h"
18788 +#include "super.h"
18789 +#include "inode.h"
18790 +#include "page_cache.h"
18791 +
18792 +#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18793 +#include <linux/types.h>
18794 +#include <linux/slab.h>
18795 +#include <linux/pagemap.h>
18796 +#include <linux/swap.h>
18797 +#include <linux/fs.h> /* for struct address_space */
18798 +#include <linux/writeback.h> /* for inode_lock */
18799 +
18800 +static struct kmem_cache *_jnode_slab = NULL;
18801 +
18802 +static void jnode_set_type(jnode * node, jnode_type type);
18803 +static int jdelete(jnode * node);
18804 +static int jnode_try_drop(jnode * node);
18805 +
18806 +#if REISER4_DEBUG
18807 +static int jnode_invariant(jnode * node, int tlocked, int jlocked);
18808 +#endif
18809 +
18810 +/* true if valid page is attached to jnode */
18811 +static inline int jnode_is_parsed(jnode * node)
18812 +{
18813 + return JF_ISSET(node, JNODE_PARSED);
18814 +}
18815 +
18816 +/* hash table support */
18817 +
18818 +/* compare two jnode keys for equality. Used by hash-table macros */
18819 +static inline int jnode_key_eq(const struct jnode_key *k1,
18820 + const struct jnode_key *k2)
18821 +{
18822 + assert("nikita-2350", k1 != NULL);
18823 + assert("nikita-2351", k2 != NULL);
18824 +
18825 + return (k1->index == k2->index && k1->objectid == k2->objectid);
18826 +}
18827 +
18828 +/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18829 +static inline __u32 jnode_key_hashfn(j_hash_table * table,
18830 + const struct jnode_key *key)
18831 +{
18832 + assert("nikita-2352", key != NULL);
18833 + assert("nikita-3346", IS_POW(table->_buckets));
18834 +
18835 + /* yes, this is a remarkably simple (if not simplistic) hash function. */
18836 + return (key->objectid + key->index) & (table->_buckets - 1);
18837 +}
18838 +
18839 +/* The hash table definition */
18840 +#define KMALLOC(size) reiser4_vmalloc(size)
18841 +#define KFREE(ptr, size) vfree(ptr)
18842 +TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
18843 + jnode_key_hashfn, jnode_key_eq);
18844 +#undef KFREE
18845 +#undef KMALLOC
18846 +
18847 +/* call this to initialise jnode hash table */
18848 +int jnodes_tree_init(reiser4_tree * tree/* tree to initialise jnodes for */)
18849 +{
18850 + assert("nikita-2359", tree != NULL);
18851 + return j_hash_init(&tree->jhash_table, 16384);
18852 +}
18853 +
18854 +/* call this to destroy jnode hash table. This is called during umount. */
18855 +int jnodes_tree_done(reiser4_tree * tree/* tree to destroy jnodes for */)
18856 +{
18857 + j_hash_table *jtable;
18858 + jnode *node;
18859 + jnode *next;
18860 +
18861 + assert("nikita-2360", tree != NULL);
18862 +
18863 + /*
18864 + * Scan hash table and free all jnodes.
18865 + */
18866 + jtable = &tree->jhash_table;
18867 + if (jtable->_table) {
18868 + for_all_in_htable(jtable, j, node, next) {
18869 + assert("nikita-2361", !atomic_read(&node->x_count));
18870 + jdrop(node);
18871 + }
18872 +
18873 + j_hash_done(&tree->jhash_table);
18874 + }
18875 + return 0;
18876 +}
18877 +
18878 +/**
18879 + * init_jnodes - create jnode cache
18880 + *
18881 + * Initializes the jnode slab cache. It is part of reiser4 module initialization.
18882 + */
18883 +int init_jnodes(void)
18884 +{
18885 + assert("umka-168", _jnode_slab == NULL);
18886 +
18887 + _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18888 + SLAB_HWCACHE_ALIGN |
18889 + SLAB_RECLAIM_ACCOUNT, NULL);
18890 + if (_jnode_slab == NULL)
18891 + return RETERR(-ENOMEM);
18892 +
18893 + return 0;
18894 +}
18895 +
18896 +/**
18897 + * done_jnodes - delete jnode cache
18898 + *
18899 + * This is called on reiser4 module unloading or system shutdown.
18900 + */
18901 +void done_jnodes(void)
18902 +{
18903 + destroy_reiser4_cache(&_jnode_slab);
18904 +}
18905 +
18906 +/* Initialize a jnode. */
18907 +void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18908 +{
18909 + assert("umka-175", node != NULL);
18910 +
18911 + memset(node, 0, sizeof(jnode));
18912 + ON_DEBUG(node->magic = JMAGIC);
18913 + jnode_set_type(node, type);
18914 + atomic_set(&node->d_count, 0);
18915 + atomic_set(&node->x_count, 0);
18916 + spin_lock_init(&node->guard);
18917 + spin_lock_init(&node->load);
18918 + node->atom = NULL;
18919 + node->tree = tree;
18920 + INIT_LIST_HEAD(&node->capture_link);
18921 +
18922 + ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18923 +
18924 + INIT_RCU_HEAD(&node->rcu);
18925 +
18926 +#if REISER4_DEBUG
18927 + {
18928 + reiser4_super_info_data *sbinfo;
18929 +
18930 + sbinfo = get_super_private(tree->super);
18931 + spin_lock_irq(&sbinfo->all_guard);
18932 + list_add(&node->jnodes, &sbinfo->all_jnodes);
18933 + spin_unlock_irq(&sbinfo->all_guard);
18934 + }
18935 +#endif
18936 +}
18937 +
18938 +#if REISER4_DEBUG
18939 +/*
18940 + * Remove jnode from ->all_jnodes list.
18941 + */
18942 +static void jnode_done(jnode * node, reiser4_tree * tree)
18943 +{
18944 + reiser4_super_info_data *sbinfo;
18945 +
18946 + sbinfo = get_super_private(tree->super);
18947 +
18948 + spin_lock_irq(&sbinfo->all_guard);
18949 + assert("nikita-2422", !list_empty(&node->jnodes));
18950 + list_del_init(&node->jnodes);
18951 + spin_unlock_irq(&sbinfo->all_guard);
18952 +}
18953 +#endif
18954 +
18955 +/* return already existing jnode of page */
18956 +jnode *jnode_by_page(struct page *pg)
18957 +{
18958 + assert("nikita-2066", pg != NULL);
18959 + assert("nikita-2400", PageLocked(pg));
18960 + assert("nikita-2068", PagePrivate(pg));
18961 + assert("nikita-2067", jprivate(pg) != NULL);
18962 + return jprivate(pg);
18963 +}
18964 +
18965 +/* exported functions to allocate/free jnode objects outside this file */
18966 +jnode *jalloc(void)
18967 +{
18968 + jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
18969 + return jal;
18970 +}
18971 +
18972 +/* return jnode back to the slab allocator */
18973 +inline void jfree(jnode * node)
18974 +{
18975 + assert("zam-449", node != NULL);
18976 +
18977 + assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18978 + NODE_LIST(node) == NOT_CAPTURED));
18979 + assert("nikita-3222", list_empty(&node->jnodes));
18980 + assert("nikita-3221", jnode_page(node) == NULL);
18981 +
18982 + /* not yet phash_jnode_destroy(node); */
18983 +
18984 + kmem_cache_free(_jnode_slab, node);
18985 +}
18986 +
18987 +/*
18988 + * This function is supplied as RCU callback. It actually frees jnode when
18989 + * last reference to it is gone.
18990 + */
18991 +static void jnode_free_actor(struct rcu_head *head)
18992 +{
18993 + jnode *node;
18994 + jnode_type jtype;
18995 +
18996 + node = container_of(head, jnode, rcu);
18997 + jtype = jnode_get_type(node);
18998 +
18999 + ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
19000 +
19001 + switch (jtype) {
19002 + case JNODE_IO_HEAD:
19003 + case JNODE_BITMAP:
19004 + case JNODE_UNFORMATTED_BLOCK:
19005 + jfree(node);
19006 + break;
19007 + case JNODE_FORMATTED_BLOCK:
19008 + zfree(JZNODE(node));
19009 + break;
19010 + case JNODE_INODE:
19011 + default:
19012 + wrong_return_value("nikita-3197", "Wrong jnode type");
19013 + }
19014 +}
19015 +
19016 +/*
19017 + * Free a jnode. Post a callback to be executed later through RCU when all
19018 + * references to @node are released.
19019 + */
19020 +static inline void jnode_free(jnode * node, jnode_type jtype)
19021 +{
19022 + if (jtype != JNODE_INODE) {
19023 + /*assert("nikita-3219", list_empty(&node->rcu.list)); */
19024 + call_rcu(&node->rcu, jnode_free_actor);
19025 + } else
19026 + jnode_list_remove(node);
19027 +}
19028 +
19029 +/* allocate new unformatted jnode */
19030 +static jnode *jnew_unformatted(void)
19031 +{
19032 + jnode *jal;
19033 +
19034 + jal = jalloc();
19035 + if (jal == NULL)
19036 + return NULL;
19037 +
19038 + jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
19039 + jal->key.j.mapping = NULL;
19040 + jal->key.j.index = (unsigned long)-1;
19041 + jal->key.j.objectid = 0;
19042 + return jal;
19043 +}
19044 +
19045 +/* look for jnode with given mapping and offset within hash table */
19046 +jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
19047 +{
19048 + struct jnode_key jkey;
19049 + jnode *node;
19050 +
19051 + assert("nikita-2353", tree != NULL);
19052 +
19053 + jkey.objectid = objectid;
19054 + jkey.index = index;
19055 +
19056 + /*
19057 + * hash table is _not_ protected by any lock during lookups. All we
19058 + * have to do is to disable preemption to keep RCU happy.
19059 + */
19060 +
19061 + rcu_read_lock();
19062 + node = j_hash_find(&tree->jhash_table, &jkey);
19063 + if (node != NULL) {
19064 + /* protect @node from recycling */
19065 + jref(node);
19066 + assert("nikita-2955", jnode_invariant(node, 0, 0));
19067 + node = jnode_rip_check(tree, node);
19068 + }
19069 + rcu_read_unlock();
19070 + return node;
19071 +}
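A hedged usage sketch for jlookup(): the reference taken by jref() inside the lookup must be balanced by jput() in the caller (oid and index are placeholders):

    jnode *node = jlookup(current_tree, oid, index);
    if (node != NULL) {
            /* ... inspect or pin the block tracked by node ... */
            jput(node); /* balance the jref() taken by jlookup() */
    }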
19072 +
19073 +/* per inode radix tree of jnodes is protected by tree's read write spin lock */
19074 +static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
19075 +{
19076 + assert("vs-1694", mapping->host != NULL);
19077 +
19078 + return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
19079 +}
19080 +
19081 +jnode *jfind(struct address_space *mapping, unsigned long index)
19082 +{
19083 + reiser4_tree *tree;
19084 + jnode *node;
19085 +
19086 + assert("vs-1694", mapping->host != NULL);
19087 + tree = reiser4_tree_by_inode(mapping->host);
19088 +
19089 + read_lock_tree(tree);
19090 + node = jfind_nolock(mapping, index);
19091 + if (node != NULL)
19092 + jref(node);
19093 + read_unlock_tree(tree);
19094 + return node;
19095 +}
19096 +
19097 +static void inode_attach_jnode(jnode * node)
19098 +{
19099 + struct inode *inode;
19100 + reiser4_inode *info;
19101 + struct radix_tree_root *rtree;
19102 +
19103 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19104 + assert("zam-1043", node->key.j.mapping != NULL);
19105 + inode = node->key.j.mapping->host;
19106 + info = reiser4_inode_data(inode);
19107 + rtree = jnode_tree_by_reiser4_inode(info);
19108 + if (rtree->rnode == NULL) {
19109 + /* prevent inode from being pruned when it has jnodes attached
19110 + to it */
19111 + spin_lock_irq(&inode->i_data.tree_lock);
19112 + inode->i_data.nrpages++;
19113 + spin_unlock_irq(&inode->i_data.tree_lock);
19114 + }
19115 + assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
19116 + check_me("zam-1045",
19117 + !radix_tree_insert(rtree, node->key.j.index, node));
19118 + ON_DEBUG(info->nr_jnodes++);
19119 +}
19120 +
19121 +static void inode_detach_jnode(jnode * node)
19122 +{
19123 + struct inode *inode;
19124 + reiser4_inode *info;
19125 + struct radix_tree_root *rtree;
19126 +
19127 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19128 + assert("zam-1044", node->key.j.mapping != NULL);
19129 + inode = node->key.j.mapping->host;
19130 + info = reiser4_inode_data(inode);
19131 + rtree = jnode_tree_by_reiser4_inode(info);
19132 +
19133 + assert("zam-1051", info->nr_jnodes != 0);
19134 + assert("zam-1052", rtree->rnode != NULL);
19135 + ON_DEBUG(info->nr_jnodes--);
19136 +
19137 + /* delete jnode from inode's radix tree of jnodes */
19138 + check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
19139 + if (rtree->rnode == NULL) {
19140 + /* inode can be pruned now */
19141 + spin_lock_irq(&inode->i_data.tree_lock);
19142 + inode->i_data.nrpages--;
19143 + spin_unlock_irq(&inode->i_data.tree_lock);
19144 + }
19145 +}
19146 +
19147 +/* put jnode into the hash table (where it can be found by flush, which does
19148 + not know the mapping) and into the inode's tree of jnodes (where it can be
19149 + found, hopefully faster, in places where the mapping is known). Currently
19150 + used by fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when a
19151 + new jnode is created */
19152 +static void
19153 +hash_unformatted_jnode(jnode * node, struct address_space *mapping,
19154 + unsigned long index)
19155 +{
19156 + j_hash_table *jtable;
19157 +
19158 + assert("vs-1446", jnode_is_unformatted(node));
19159 + assert("vs-1442", node->key.j.mapping == 0);
19160 + assert("vs-1443", node->key.j.objectid == 0);
19161 + assert("vs-1444", node->key.j.index == (unsigned long)-1);
19162 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19163 +
19164 + node->key.j.mapping = mapping;
19165 + node->key.j.objectid = get_inode_oid(mapping->host);
19166 + node->key.j.index = index;
19167 +
19168 + jtable = &jnode_get_tree(node)->jhash_table;
19169 +
19170 + /* race with some other thread inserting jnode into the hash table is
19171 + * impossible, because we keep the page lock. */
19172 + /*
19173 + * following assertion no longer holds because of RCU: it is possible
19174 + * jnode is in the hash table, but with JNODE_RIP bit set.
19175 + */
19176 + /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
19177 + j_hash_insert_rcu(jtable, node);
19178 + inode_attach_jnode(node);
19179 +}
19180 +
19181 +static void unhash_unformatted_node_nolock(jnode * node)
19182 +{
19183 + assert("vs-1683", node->key.j.mapping != NULL);
19184 + assert("vs-1684",
19185 + node->key.j.objectid ==
19186 + get_inode_oid(node->key.j.mapping->host));
19187 +
19188 + /* remove jnode from hash-table */
19189 + j_hash_remove_rcu(&node->tree->jhash_table, node);
19190 + inode_detach_jnode(node);
19191 + node->key.j.mapping = NULL;
19192 + node->key.j.index = (unsigned long)-1;
19193 + node->key.j.objectid = 0;
19194 +
19195 +}
19196 +
19197 +/* remove jnode from hash table and from inode's tree of jnodes. This is used in
19198 + reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
19199 + reiser4_uncapture_jnode */
19200 +void unhash_unformatted_jnode(jnode * node)
19201 +{
19202 + assert("vs-1445", jnode_is_unformatted(node));
19203 +
19204 + write_lock_tree(node->tree);
19205 + unhash_unformatted_node_nolock(node);
19206 + write_unlock_tree(node->tree);
19207 +}
19208 +
19209 +/*
19210 + * search hash table for a jnode with given oid and index. If not found,
19211 + * allocate new jnode, insert it, and also insert into radix tree for the
19212 + * given inode/mapping.
19213 + */
19214 +static jnode *find_get_jnode(reiser4_tree * tree,
19215 + struct address_space *mapping,
19216 + oid_t oid, unsigned long index)
19217 +{
19218 + jnode *result;
19219 + jnode *shadow;
19220 + int preload;
19221 +
19222 + result = jnew_unformatted();
19223 +
19224 + if (unlikely(result == NULL))
19225 + return ERR_PTR(RETERR(-ENOMEM));
19226 +
19227 + preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
19228 + if (preload != 0)
19229 + return ERR_PTR(preload);
19230 +
19231 + write_lock_tree(tree);
19232 + shadow = jfind_nolock(mapping, index);
19233 + if (likely(shadow == NULL)) {
19234 + /* add new jnode to hash table and inode's radix tree of
19235 + * jnodes */
19236 + jref(result);
19237 + hash_unformatted_jnode(result, mapping, index);
19238 + } else {
19239 + /* jnode is found in inode's radix tree of jnodes */
19240 + jref(shadow);
19241 + jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19242 + assert("vs-1498", shadow->key.j.mapping == mapping);
19243 + result = shadow;
19244 + }
19245 + write_unlock_tree(tree);
19246 +
19247 + assert("nikita-2955",
19248 + ergo(result != NULL, jnode_invariant(result, 0, 0)));
19249 + radix_tree_preload_end();
19250 + return result;
19251 +}
19252 +
19253 +/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
19254 + creates) jnode corresponding to page @pg. jnode is attached to page and
19255 + inserted into jnode hash-table. */
19256 +static jnode *do_jget(reiser4_tree * tree, struct page *pg)
19257 +{
19258 + /*
19259 + * There are two ways to create jnode: starting with pre-existing page
19260 + * and without page.
19261 + *
19262 + * When page already exists, jnode is created
19263 + * (jnode_of_page()->do_jget()) under page lock. This is done in
19264 + * ->writepage(), or when capturing anonymous page dirtied through
19265 + * mmap.
19266 + *
19267 + * Jnode without page is created by index_extent_jnode().
19268 + *
19269 + */
19270 +
19271 + jnode *result;
19272 + oid_t oid = get_inode_oid(pg->mapping->host);
19273 +
19274 + assert("umka-176", pg != NULL);
19275 + assert("nikita-2394", PageLocked(pg));
19276 +
19277 + result = jprivate(pg);
19278 + if (likely(result != NULL))
19279 + return jref(result);
19280 +
19281 + tree = reiser4_tree_by_page(pg);
19282 +
19283 + /* check hash-table first */
19284 + result = jfind(pg->mapping, pg->index);
19285 + if (unlikely(result != NULL)) {
19286 + spin_lock_jnode(result);
19287 + jnode_attach_page(result, pg);
19288 + spin_unlock_jnode(result);
19289 + result->key.j.mapping = pg->mapping;
19290 + return result;
19291 + }
19292 +
19293 + /* since page is locked, jnode should be allocated with GFP_NOFS flag */
19294 + reiser4_ctx_gfp_mask_force(GFP_NOFS);
19295 + result = find_get_jnode(tree, pg->mapping, oid, pg->index);
19296 + if (unlikely(IS_ERR(result)))
19297 + return result;
19298 + /* attach jnode to page */
19299 + spin_lock_jnode(result);
19300 + jnode_attach_page(result, pg);
19301 + spin_unlock_jnode(result);
19302 + return result;
19303 +}
19304 +
19305 +/*
19306 + * return jnode for @pg, creating it if necessary.
19307 + */
19308 +jnode *jnode_of_page(struct page *pg)
19309 +{
19310 + jnode *result;
19311 +
19312 + assert("umka-176", pg != NULL);
19313 + assert("nikita-2394", PageLocked(pg));
19314 +
19315 + result = do_jget(reiser4_tree_by_page(pg), pg);
19316 +
19317 + if (REISER4_DEBUG && !IS_ERR(result)) {
19318 + assert("nikita-3210", result == jprivate(pg));
19319 + assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
19320 + if (jnode_is_unformatted(jprivate(pg))) {
19321 + assert("nikita-2364",
19322 + jprivate(pg)->key.j.index == pg->index);
19323 + assert("nikita-2367",
19324 + jprivate(pg)->key.j.mapping == pg->mapping);
19325 + assert("nikita-2365",
19326 + jprivate(pg)->key.j.objectid ==
19327 + get_inode_oid(pg->mapping->host));
19328 + assert("vs-1200",
19329 + jprivate(pg)->key.j.objectid ==
19330 + pg->mapping->host->i_ino);
19331 + assert("nikita-2356",
19332 + jnode_is_unformatted(jnode_by_page(pg)));
19333 + }
19334 + assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
19335 + }
19336 + return result;
19337 +}
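+
+/*
+ * Editorial sketch, not part of the original patch: typical use of
+ * jnode_of_page() from a context that holds the page lock (e.g. a
+ * ->writepage()-style path). capture_page_sketch() is a hypothetical
+ * caller; jnode_of_page() and jput() are the real entry points.
+ *
+ *	static int capture_page_sketch(struct page *page)
+ *	{
+ *		jnode *node;
+ *
+ *		assert("", PageLocked(page));
+ *		node = jnode_of_page(page);
+ *		if (IS_ERR(node))
+ *			return PTR_ERR(node);
+ *		... work with the attached jnode ...
+ *		jput(node);
+ *		return 0;
+ *	}
+ */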
19338 +
19339 +/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
19340 + * page.*/
19341 +void jnode_attach_page(jnode * node, struct page *pg)
19342 +{
19343 + assert("nikita-2060", node != NULL);
19344 + assert("nikita-2061", pg != NULL);
19345 +
19346 + assert("nikita-2050", jprivate(pg) == 0ul);
19347 + assert("nikita-2393", !PagePrivate(pg));
19348 + assert("vs-1741", node->pg == NULL);
19349 +
19350 + assert("nikita-2396", PageLocked(pg));
19351 + assert_spin_locked(&(node->guard));
19352 +
19353 + page_cache_get(pg);
19354 + set_page_private(pg, (unsigned long)node);
19355 + node->pg = pg;
19356 + SetPagePrivate(pg);
19357 +}
19358 +
19359 +/* Dual to jnode_attach_page: break a binding between page and jnode */
19360 +void page_clear_jnode(struct page *page, jnode * node)
19361 +{
19362 + assert("nikita-2424", page != NULL);
19363 + assert("nikita-2425", PageLocked(page));
19364 + assert("nikita-2426", node != NULL);
19365 + assert_spin_locked(&(node->guard));
19366 + assert("nikita-2428", PagePrivate(page));
19367 +
19368 + assert("nikita-3551", !PageWriteback(page));
19369 +
19370 + JF_CLR(node, JNODE_PARSED);
19371 + set_page_private(page, 0ul);
19372 + ClearPagePrivate(page);
19373 + node->pg = NULL;
19374 + page_cache_release(page);
19375 +}
19376 +
19377 +#if 0
19378 +/* it is only used in one place to handle error */
19379 +void
19380 +page_detach_jnode(struct page *page, struct address_space *mapping,
19381 + unsigned long index)
19382 +{
19383 + assert("nikita-2395", page != NULL);
19384 +
19385 + lock_page(page);
19386 + if ((page->mapping == mapping) && (page->index == index)
19387 + && PagePrivate(page)) {
19388 + jnode *node;
19389 +
19390 + node = jprivate(page);
19391 + spin_lock_jnode(node);
19392 + page_clear_jnode(page, node);
19393 + spin_unlock_jnode(node);
19394 + }
19395 + unlock_page(page);
19396 +}
19397 +#endif /* 0 */
19398 +
19399 +/* return @node page locked.
19400 +
19401 + Lock ordering requires that one first take the page lock and afterwards
19402 + the spin lock on the node attached to this page. Sometimes it is necessary
19403 + to go in the opposite direction. This is done through a standard
19404 + trylock-and-release loop.
19405 +*/
19406 +static struct page *jnode_lock_page(jnode * node)
19407 +{
19408 + struct page *page;
19409 +
19410 + assert("nikita-2052", node != NULL);
19411 + assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19412 +
19413 + while (1) {
19414 +
19415 + spin_lock_jnode(node);
19416 + page = jnode_page(node);
19417 + if (page == NULL)
19418 + break;
19419 +
19420 + /* no need to page_cache_get( page ) here, because page cannot
19421 + be evicted from memory without detaching it from jnode and
19422 + this requires spin lock on jnode that we already hold.
19423 + */
19424 + if (trylock_page(page)) {
19425 + /* We won a lock on jnode page, proceed. */
19426 + break;
19427 + }
19428 +
19429 + /* Page is locked by someone else. */
19430 + page_cache_get(page);
19431 + spin_unlock_jnode(node);
19432 + wait_on_page_locked(page);
19433 + /* it is possible that page was detached from jnode and
19434 + returned to the free pool, or re-assigned while we were
19435 + waiting on locked bit. This will be rechecked on the next
19436 + loop iteration.
19437 + */
19438 + page_cache_release(page);
19439 +
19440 + /* try again */
19441 + }
19442 + return page;
19443 +}
19444 +
19445 +/*
19446 + * if the JNODE_PARSED bit is not set, call the ->parse() method of the
19447 + * jnode to verify the validity of its content.
19448 + */
19449 +static inline int jparse(jnode * node)
19450 +{
19451 + int result;
19452 +
19453 + assert("nikita-2466", node != NULL);
19454 +
19455 + spin_lock_jnode(node);
19456 + if (likely(!jnode_is_parsed(node))) {
19457 + result = jnode_ops(node)->parse(node);
19458 + if (likely(result == 0))
19459 + JF_SET(node, JNODE_PARSED);
19460 + } else
19461 + result = 0;
19462 + spin_unlock_jnode(node);
19463 + return result;
19464 +}
19465 +
19466 +/* Lock the page attached to jnode; create and attach a page to the jnode
19467 + * if it has none. */
19468 +static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19469 +{
19470 + struct page *page;
19471 +
19472 + spin_lock_jnode(node);
19473 + page = jnode_page(node);
19474 +
19475 + if (page == NULL) {
19476 + spin_unlock_jnode(node);
19477 + page = find_or_create_page(jnode_get_mapping(node),
19478 + jnode_get_index(node), gfp_flags);
19479 + if (page == NULL)
19480 + return ERR_PTR(RETERR(-ENOMEM));
19481 + } else {
19482 + if (trylock_page(page)) {
19483 + spin_unlock_jnode(node);
19484 + return page;
19485 + }
19486 + page_cache_get(page);
19487 + spin_unlock_jnode(node);
19488 + lock_page(page);
19489 + assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19490 + }
19491 +
19492 + spin_lock_jnode(node);
19493 + if (!jnode_page(node))
19494 + jnode_attach_page(node, page);
19495 + spin_unlock_jnode(node);
19496 +
19497 + page_cache_release(page);
19498 + assert("zam-894", jnode_page(node) == page);
19499 + return page;
19500 +}
19501 +
19502 +/* Start read operation for jnode's page if page is not up-to-date. */
19503 +static int jnode_start_read(jnode * node, struct page *page)
19504 +{
19505 + assert("zam-893", PageLocked(page));
19506 +
19507 + if (PageUptodate(page)) {
19508 + unlock_page(page);
19509 + return 0;
19510 + }
19511 + return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19512 +}
19513 +
19514 +#if REISER4_DEBUG
19515 +static void check_jload(jnode * node, struct page *page)
19516 +{
19517 + if (jnode_is_znode(node)) {
19518 + node40_header *nh;
19519 + znode *z;
19520 +
19521 + z = JZNODE(node);
19522 + if (znode_is_any_locked(z)) {
19523 + nh = (node40_header *) kmap(page);
19524 + /* this only works for node40-only file systems. For
19525 + * debugging. */
19526 + assert("nikita-3253",
19527 + z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19528 + kunmap(page);
19529 + }
19530 + assert("nikita-3565", znode_invariant(z));
19531 + }
19532 +}
19533 +#else
19534 +#define check_jload(node, page) noop
19535 +#endif
19536 +
19537 +/* prefetch jnode to speed up the next call to jload. Call this when you are
19538 + * going to call jload() shortly. This will bring the appropriate portion of
19539 + * the jnode into the CPU cache. */
19540 +void jload_prefetch(jnode * node)
19541 +{
19542 + prefetchw(&node->x_count);
19543 +}
19544 +
19545 +/* load jnode's data into memory */
19546 +int jload_gfp(jnode * node /* node to load */ ,
19547 + gfp_t gfp_flags /* allocation flags */ ,
19548 + int do_kmap/* true if page should be kmapped */)
19549 +{
19550 + struct page *page;
19551 + int result = 0;
19552 + int parsed;
19553 +
19554 + assert("nikita-3010", reiser4_schedulable());
19555 +
19556 + prefetchw(&node->pg);
19557 +
19558 + /* taking d-reference implies taking x-reference. */
19559 + jref(node);
19560 +
19561 + /*
19562 + * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19563 + * should be atomic, otherwise there is a race against
19564 + * reiser4_releasepage().
19565 + */
19566 + spin_lock(&(node->load));
19567 + add_d_ref(node);
19568 + parsed = jnode_is_parsed(node);
19569 + spin_unlock(&(node->load));
19570 +
19571 + if (unlikely(!parsed)) {
19572 + page = jnode_get_page_locked(node, gfp_flags);
19573 + if (unlikely(IS_ERR(page))) {
19574 + result = PTR_ERR(page);
19575 + goto failed;
19576 + }
19577 +
19578 + result = jnode_start_read(node, page);
19579 + if (unlikely(result != 0))
19580 + goto failed;
19581 +
19582 + wait_on_page_locked(page);
19583 + if (unlikely(!PageUptodate(page))) {
19584 + result = RETERR(-EIO);
19585 + goto failed;
19586 + }
19587 +
19588 + if (do_kmap)
19589 + node->data = kmap(page);
19590 +
19591 + result = jparse(node);
19592 + if (unlikely(result != 0)) {
19593 + if (do_kmap)
19594 + kunmap(page);
19595 + goto failed;
19596 + }
19597 + check_jload(node, page);
19598 + } else {
19599 + page = jnode_page(node);
19600 + check_jload(node, page);
19601 + if (do_kmap)
19602 + node->data = kmap(page);
19603 + }
19604 +
19605 + if (!is_writeout_mode())
19606 +		/* We do not mark pages active if jload is called as a part of
19607 +		 * jnode_flush() or reiser4_write_logs(). Neither jnode_flush()
19608 +		 * nor write_logs() adds value to the cached data, so there is
19609 +		 * no sense in marking pages as active when they go to disk; it
19610 +		 * just confuses the vm scanning routines, because a clean page
19611 +		 * could be moved off the inactive list as a result of this
19612 +		 * mark_page_accessed() call. */
19613 + mark_page_accessed(page);
19614 +
19615 + return 0;
19616 +
19617 +failed:
19618 + jrelse_tail(node);
19619 + return result;
19620 +
19621 +}
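+
+/*
+ * Editorial sketch, not part of the original patch: the canonical
+ * jload()/jrelse() bracket around access to a jnode's data. jload() takes a
+ * d-reference (which implies an x-reference) and kmaps the page; jrelse()
+ * undoes both. read_node_sketch() is a hypothetical caller.
+ *
+ *	static int read_node_sketch(jnode *node)
+ *	{
+ *		int ret;
+ *
+ *		ret = jload(node);
+ *		if (ret != 0)
+ *			return ret;
+ *		... read through jdata(node) ...
+ *		jrelse(node);
+ *		return 0;
+ *	}
+ */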
19622 +
19623 +/* start asynchronous reading for given jnode's page. */
19624 +int jstartio(jnode * node)
19625 +{
19626 + struct page *page;
19627 +
19628 + page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19629 + if (IS_ERR(page))
19630 + return PTR_ERR(page);
19631 +
19632 + return jnode_start_read(node, page);
19633 +}
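+
+/*
+ * Editorial sketch, not part of the original patch: jstartio() pairs with
+ * jwait_io() (defined below) to split read submission from completion, e.g.
+ * to batch several reads before waiting:
+ *
+ *	for each node: jstartio(node);
+ *	for each node: err = jwait_io(node, READ);
+ */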
19634 +
19635 +/* Initialize a node by calling appropriate plugin instead of reading
19636 + * node from disk as in jload(). */
19637 +int jinit_new(jnode * node, gfp_t gfp_flags)
19638 +{
19639 + struct page *page;
19640 + int result;
19641 +
19642 + jref(node);
19643 + add_d_ref(node);
19644 +
19645 + page = jnode_get_page_locked(node, gfp_flags);
19646 + if (IS_ERR(page)) {
19647 + result = PTR_ERR(page);
19648 + goto failed;
19649 + }
19650 +
19651 + SetPageUptodate(page);
19652 + unlock_page(page);
19653 +
19654 + node->data = kmap(page);
19655 +
19656 + if (!jnode_is_parsed(node)) {
19657 + jnode_plugin *jplug = jnode_ops(node);
19658 + spin_lock_jnode(node);
19659 + result = jplug->init(node);
19660 + spin_unlock_jnode(node);
19661 + if (result) {
19662 + kunmap(page);
19663 + goto failed;
19664 + }
19665 + JF_SET(node, JNODE_PARSED);
19666 + }
19667 +
19668 + return 0;
19669 +
19670 +failed:
19671 + jrelse(node);
19672 + return result;
19673 +}
19674 +
19675 +/* release a reference to jnode acquired by jload(), decrement ->d_count */
19676 +void jrelse_tail(jnode * node/* jnode to release references to */)
19677 +{
19678 + assert("nikita-489", atomic_read(&node->d_count) > 0);
19679 + atomic_dec(&node->d_count);
19680 + /* release reference acquired in jload_gfp() or jinit_new() */
19681 + jput(node);
19682 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
19683 + LOCK_CNT_DEC(d_refs);
19684 +}
19685 +
19686 +/* drop reference to node data. When last reference is dropped, data are
19687 + unloaded. */
19688 +void jrelse(jnode * node/* jnode to release references to */)
19689 +{
19690 + struct page *page;
19691 +
19692 + assert("nikita-487", node != NULL);
19693 + assert_spin_not_locked(&(node->guard));
19694 +
19695 + page = jnode_page(node);
19696 + if (likely(page != NULL)) {
19697 + /*
19698 + * it is safe not to lock jnode here, because at this point
19699 + * @node->d_count is greater than zero (if jrelse() is used
19700 +	 * correctly, that is). JNODE_PARSED may not be set yet if,
19701 + * for example, we got here as a result of error handling path
19702 + * in jload(). Anyway, page cannot be detached by
19703 + * reiser4_releasepage(). truncate will invalidate page
19704 + * regardless, but this should not be a problem.
19705 + */
19706 + kunmap(page);
19707 + }
19708 + jrelse_tail(node);
19709 +}
19710 +
19711 +/* called from jput() to wait for io completion */
19712 +static void jnode_finish_io(jnode * node)
19713 +{
19714 + struct page *page;
19715 +
19716 + assert("nikita-2922", node != NULL);
19717 +
19718 + spin_lock_jnode(node);
19719 + page = jnode_page(node);
19720 + if (page != NULL) {
19721 + page_cache_get(page);
19722 + spin_unlock_jnode(node);
19723 + wait_on_page_writeback(page);
19724 + page_cache_release(page);
19725 + } else
19726 + spin_unlock_jnode(node);
19727 +}
19728 +
19729 +/*
19730 + * This is called by jput() when last reference to jnode is released. This is
19731 + * separate function, because we want fast path of jput() to be inline and,
19732 + * therefore, small.
19733 + */
19734 +void jput_final(jnode * node)
19735 +{
19736 + int r_i_p;
19737 +
19738 + /* A fast check for keeping node in cache. We always keep node in cache
19739 + * if its page is present and node was not marked for deletion */
19740 + if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19741 + rcu_read_unlock();
19742 + return;
19743 + }
19744 + r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19745 + /*
19746 + * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19747 + * this case it is safe to access node after unlock.
19748 + */
19749 + rcu_read_unlock();
19750 + if (r_i_p) {
19751 + jnode_finish_io(node);
19752 + if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19753 + /* node is removed from the tree. */
19754 + jdelete(node);
19755 + else
19756 + jnode_try_drop(node);
19757 + }
19758 + /* if !r_i_p some other thread is already killing it */
19759 +}
19760 +
19761 +int jwait_io(jnode * node, int rw)
19762 +{
19763 + struct page *page;
19764 + int result;
19765 +
19766 + assert("zam-447", node != NULL);
19767 + assert("zam-448", jnode_page(node) != NULL);
19768 +
19769 + page = jnode_page(node);
19770 +
19771 + result = 0;
19772 + if (rw == READ) {
19773 + wait_on_page_locked(page);
19774 + } else {
19775 + assert("nikita-2227", rw == WRITE);
19776 + wait_on_page_writeback(page);
19777 + }
19778 + if (PageError(page))
19779 + result = RETERR(-EIO);
19780 +
19781 + return result;
19782 +}
19783 +
19784 +/*
19785 + * jnode types and plugins.
19786 + *
19787 + * jnode by itself is a "base type". There are several different jnode
19788 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19789 + * has to do different things based on jnode type. In the standard reiser4 way
19790 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19791 + *
19792 + * Functions below deal with jnode types and define methods of jnode plugin.
19793 + *
19794 + */
19795 +
19796 +/* set jnode type. This is done during jnode initialization. */
19797 +static void jnode_set_type(jnode * node, jnode_type type)
19798 +{
19799 + static unsigned long type_to_mask[] = {
19800 + [JNODE_UNFORMATTED_BLOCK] = 1,
19801 + [JNODE_FORMATTED_BLOCK] = 0,
19802 + [JNODE_BITMAP] = 2,
19803 + [JNODE_IO_HEAD] = 6,
19804 + [JNODE_INODE] = 4
19805 + };
19806 +
19807 + assert("zam-647", type < LAST_JNODE_TYPE);
19808 + assert("nikita-2815", !jnode_is_loaded(node));
19809 + assert("nikita-3386", node->state == 0);
19810 +
19811 + node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19812 +}
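+
+/*
+ * Editorial note, not part of the original patch: type_to_mask[] above packs
+ * the jnode type into the three JNODE_TYPE_{1,2,3} state bits;
+ * jnode_get_type() in jnode.h decodes it with the inverse mask_to_type[]
+ * table, e.g. JNODE_IO_HEAD <-> binary 110 <-> 6.
+ */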
19813 +
19814 +/* ->init() method of jnode plugin for jnodes that don't require plugin
19815 + * specific initialization. */
19816 +static int init_noinit(jnode * node UNUSED_ARG)
19817 +{
19818 + return 0;
19819 +}
19820 +
19821 +/* ->parse() method of jnode plugin for jnodes that don't require plugin
19822 + * specific parsing. */
19823 +static int parse_noparse(jnode * node UNUSED_ARG)
19824 +{
19825 + return 0;
19826 +}
19827 +
19828 +/* ->mapping() method for unformatted jnode */
19829 +struct address_space *mapping_jnode(const jnode * node)
19830 +{
19831 + struct address_space *map;
19832 +
19833 + assert("nikita-2713", node != NULL);
19834 +
19835 + /* mapping is stored in jnode */
19836 +
19837 + map = node->key.j.mapping;
19838 + assert("nikita-2714", map != NULL);
19839 + assert("nikita-2897", is_reiser4_inode(map->host));
19840 + assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19841 + return map;
19842 +}
19843 +
19844 +/* ->index() method for unformatted jnodes */
19845 +unsigned long index_jnode(const jnode * node)
19846 +{
19847 + /* index is stored in jnode */
19848 + return node->key.j.index;
19849 +}
19850 +
19851 +/* ->remove() method for unformatted jnodes */
19852 +static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19853 +{
19854 + /* remove jnode from hash table and radix tree */
19855 + if (node->key.j.mapping)
19856 + unhash_unformatted_node_nolock(node);
19857 +}
19858 +
19859 +/* ->mapping() method for znodes */
19860 +static struct address_space *mapping_znode(const jnode * node)
19861 +{
19862 + /* all znodes belong to fake inode */
19863 + return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19864 +}
19865 +
19866 +/* ->index() method for znodes */
19867 +static unsigned long index_znode(const jnode * node)
19868 +{
19869 + unsigned long addr;
19870 + assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19871 +
19872 + /* index of znode is just its address (shifted) */
19873 + addr = (unsigned long)node;
19874 + return (addr - PAGE_OFFSET) >> znode_shift_order;
19875 +}
19876 +
19877 +/* ->mapping() method for bitmap jnode */
19878 +static struct address_space *mapping_bitmap(const jnode * node)
19879 +{
19880 + /* all bitmap blocks belong to special bitmap inode */
19881 + return get_super_private(jnode_get_tree(node)->super)->bitmap->
19882 + i_mapping;
19883 +}
19884 +
19885 +/* ->index() method for jnodes that are indexed by address */
19886 +static unsigned long index_is_address(const jnode * node)
19887 +{
19888 + unsigned long ind;
19889 +
19890 + ind = (unsigned long)node;
19891 + return ind - PAGE_OFFSET;
19892 +}
19893 +
19894 +/* resolve race with jput */
19895 +jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19896 +{
19897 + /*
19898 + * This is used as part of RCU-based jnode handling.
19899 + *
19900 + * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19901 +	 * with unreferenced jnodes (ones with ->x_count == 0). The hash table
19902 +	 * is not protected during this, so a concurrent thread may execute
19903 +	 * zget-set-HEARD_BANSHEE-zput, or otherwise cause the jnode to be
19904 +	 * freed in jput_final(). To avoid such races, jput_final() sets
19905 +	 * JNODE_RIP on the jnode (under tree lock). All places that work with
19906 +	 * unreferenced jnodes call this function. It checks for the JNODE_RIP
19907 +	 * bit (first without taking the tree lock) and, if that bit is set,
19908 +	 * releases the reference acquired by the current thread and returns
19909 +	 * NULL.
19909 + *
19910 + * As a result, if jnode is being concurrently freed, NULL is returned
19911 + * and caller should pretend that jnode wasn't found in the first
19912 + * place.
19913 + *
19914 + * Otherwise it's safe to release "rcu-read-lock" and continue with
19915 + * jnode.
19916 + */
19917 + if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19918 + read_lock_tree(tree);
19919 + if (JF_ISSET(node, JNODE_RIP)) {
19920 + dec_x_ref(node);
19921 + node = NULL;
19922 + }
19923 + read_unlock_tree(tree);
19924 + }
19925 + return node;
19926 +}
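+
+/*
+ * Editorial sketch, not part of the original patch: the lookup pattern the
+ * comment above describes, as used by jlookup()-style code (hash_find() is
+ * a hypothetical stand-in for the hash-table lookup):
+ *
+ *	rcu_read_lock();
+ *	node = hash_find(...);		// may see an x_count == 0 jnode
+ *	if (node != NULL) {
+ *		add_x_ref(node);
+ *		node = jnode_rip_sync(tree, node); // NULL if being freed
+ *	}
+ *	rcu_read_unlock();
+ */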
19927 +
19928 +reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19929 +{
19930 + struct inode *inode;
19931 + item_plugin *iplug;
19932 + loff_t off;
19933 +
19934 + assert("nikita-3092", node != NULL);
19935 + assert("nikita-3093", key != NULL);
19936 + assert("nikita-3094", jnode_is_unformatted(node));
19937 +
19938 + off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19939 + inode = mapping_jnode(node)->host;
19940 +
19941 + if (node->parent_item_id != 0)
19942 + iplug = item_plugin_by_id(node->parent_item_id);
19943 + else
19944 + iplug = NULL;
19945 +
19946 + if (iplug != NULL && iplug->f.key_by_offset)
19947 + iplug->f.key_by_offset(inode, off, key);
19948 + else {
19949 + file_plugin *fplug;
19950 +
19951 + fplug = inode_file_plugin(inode);
19952 + assert("zam-1007", fplug != NULL);
19953 + assert("zam-1008", fplug->key_by_inode != NULL);
19954 +
19955 + fplug->key_by_inode(inode, off, key);
19956 + }
19957 +
19958 + return key;
19959 +}
19960 +
19961 +/* ->parse() method for formatted nodes */
19962 +static int parse_znode(jnode * node)
19963 +{
19964 + return zparse(JZNODE(node));
19965 +}
19966 +
19967 +/* ->delete() method for formatted nodes */
19968 +static void delete_znode(jnode * node, reiser4_tree * tree)
19969 +{
19970 + znode *z;
19971 +
19972 + assert_rw_write_locked(&(tree->tree_lock));
19973 + assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19974 +
19975 + z = JZNODE(node);
19976 + assert("vs-899", z->c_count == 0);
19977 +
19978 + /* delete znode from sibling list. */
19979 + sibling_list_remove(z);
19980 +
19981 + znode_remove(z, tree);
19982 +}
19983 +
19984 +/* ->remove() method for formatted nodes */
19985 +static int remove_znode(jnode * node, reiser4_tree * tree)
19986 +{
19987 + znode *z;
19988 +
19989 + assert_rw_write_locked(&(tree->tree_lock));
19990 + z = JZNODE(node);
19991 +
19992 + if (z->c_count == 0) {
19993 + /* detach znode from sibling list. */
19994 + sibling_list_drop(z);
19995 + /* this is called with tree spin-lock held, so call
19996 + znode_remove() directly (rather than znode_lock_remove()). */
19997 + znode_remove(z, tree);
19998 + return 0;
19999 + }
20000 + return RETERR(-EBUSY);
20001 +}
20002 +
20003 +/* ->init() method for formatted nodes */
20004 +static int init_znode(jnode * node)
20005 +{
20006 + znode *z;
20007 +
20008 + z = JZNODE(node);
20009 + /* call node plugin to do actual initialization */
20010 + return z->nplug->init(z);
20011 +}
20012 +
20013 +/* ->clone() method for formatted nodes */
20014 +static jnode *clone_formatted(jnode * node)
20015 +{
20016 + znode *clone;
20017 +
20018 + assert("vs-1430", jnode_is_znode(node));
20019 + clone = zalloc(reiser4_ctx_gfp_mask_get());
20020 + if (clone == NULL)
20021 + return ERR_PTR(RETERR(-ENOMEM));
20022 + zinit(clone, NULL, current_tree);
20023 + jnode_set_block(ZJNODE(clone), jnode_get_block(node));
20024 + /* ZJNODE(clone)->key.z is not initialized */
20025 + clone->level = JZNODE(node)->level;
20026 +
20027 + return ZJNODE(clone);
20028 +}
20029 +
20030 +/* jplug->clone for unformatted nodes */
20031 +static jnode *clone_unformatted(jnode * node)
20032 +{
20033 + jnode *clone;
20034 +
20035 + assert("vs-1431", jnode_is_unformatted(node));
20036 + clone = jalloc();
20037 + if (clone == NULL)
20038 + return ERR_PTR(RETERR(-ENOMEM));
20039 +
20040 + jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
20041 + jnode_set_block(clone, jnode_get_block(node));
20042 +
20043 + return clone;
20044 +
20045 +}
20046 +
20047 +/*
20048 + * Setup jnode plugin methods for various jnode types.
20049 + */
20050 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
20051 + [JNODE_UNFORMATTED_BLOCK] = {
20052 + .h = {
20053 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20054 + .id = JNODE_UNFORMATTED_BLOCK,
20055 + .pops = NULL,
20056 + .label = "unformatted",
20057 + .desc = "unformatted node",
20058 + .linkage = {NULL, NULL}
20059 + },
20060 + .init = init_noinit,
20061 + .parse = parse_noparse,
20062 + .mapping = mapping_jnode,
20063 + .index = index_jnode,
20064 + .clone = clone_unformatted
20065 + },
20066 + [JNODE_FORMATTED_BLOCK] = {
20067 + .h = {
20068 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20069 + .id = JNODE_FORMATTED_BLOCK,
20070 + .pops = NULL,
20071 + .label = "formatted",
20072 + .desc = "formatted tree node",
20073 + .linkage = {NULL, NULL}
20074 + },
20075 + .init = init_znode,
20076 + .parse = parse_znode,
20077 + .mapping = mapping_znode,
20078 + .index = index_znode,
20079 + .clone = clone_formatted
20080 + },
20081 + [JNODE_BITMAP] = {
20082 + .h = {
20083 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20084 + .id = JNODE_BITMAP,
20085 + .pops = NULL,
20086 + .label = "bitmap",
20087 + .desc = "bitmap node",
20088 + .linkage = {NULL, NULL}
20089 + },
20090 + .init = init_noinit,
20091 + .parse = parse_noparse,
20092 + .mapping = mapping_bitmap,
20093 + .index = index_is_address,
20094 + .clone = NULL
20095 + },
20096 + [JNODE_IO_HEAD] = {
20097 + .h = {
20098 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20099 + .id = JNODE_IO_HEAD,
20100 + .pops = NULL,
20101 + .label = "io head",
20102 + .desc = "io head",
20103 + .linkage = {NULL, NULL}
20104 + },
20105 + .init = init_noinit,
20106 + .parse = parse_noparse,
20107 + .mapping = mapping_bitmap,
20108 + .index = index_is_address,
20109 + .clone = NULL
20110 + },
20111 + [JNODE_INODE] = {
20112 + .h = {
20113 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20114 + .id = JNODE_INODE,
20115 + .pops = NULL,
20116 + .label = "inode",
20117 + .desc = "inode's builtin jnode",
20118 + .linkage = {NULL, NULL}
20119 + },
20120 + .init = NULL,
20121 + .parse = NULL,
20122 + .mapping = NULL,
20123 + .index = NULL,
20124 + .clone = NULL
20125 + }
20126 +};
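+
+/*
+ * Editorial sketch, not part of the original patch: callers do not switch on
+ * the jnode type directly; they dispatch through the plugin table above,
+ * e.g. (as jnode_get_mapping() below does):
+ *
+ *	struct address_space *map = jnode_ops(node)->mapping(node);
+ *	unsigned long index = jnode_ops(node)->index(node);
+ */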
20127 +
20128 +/*
20129 + * jnode destruction.
20130 + *
20131 + * A thread may use a jnode after it has acquired a reference to it.
20132 + * References are counted in the ->x_count field. A reference protects a
20133 + * jnode from being recycled. This is different from protecting jnode data
20134 + * (stored in the jnode's page) from being evicted from memory. Data are
20135 + * protected by jload() and released by jrelse().
20136 + *
20137 + * If a thread already possesses a reference to the jnode, it can acquire another
20138 + * one through jref(). Initial reference is obtained (usually) by locating
20139 + * jnode in some indexing structure that depends on jnode type: formatted
20140 + * nodes are kept in global hash table, where they are indexed by block
20141 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
20142 + * table, which is indexed by oid and offset within file, and in per-inode
20143 + * radix tree.
20144 + *
20145 + * Reference to jnode is released by jput(). If last reference is released,
20146 + * jput_final() is called. This function determines whether jnode has to be
20147 + * deleted (this happens when corresponding node is removed from the file
20148 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
20149 + * should be just "removed" (deleted from memory).
20150 + *
20151 + * Jnode destruction is a singularly delicate dance because of locking and RCU.
20152 + */
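+
+/*
+ * Editorial sketch, not part of the original patch: the two reference
+ * counters at a glance --
+ *
+ *	jref(node);	// ->x_count++: jnode may not be recycled
+ *	jload(node);	// ->d_count++ (implies an x-reference); data
+ *			// pinned in memory and kmapped
+ *	... use jdata(node) ...
+ *	jrelse(node);	// ->d_count--; drop the implied x-reference
+ *	jput(node);	// ->x_count--; the last put ends in jput_final()
+ */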
20153 +
20154 +/*
20155 + * Returns true if jnode cannot be removed right now. This check is called
20156 + * under tree lock. If it returns false, the jnode is irrevocably committed
20157 + * to being deleted/removed.
20158 + */
20159 +static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
20160 +{
20161 + /* if other thread managed to acquire a reference to this jnode, don't
20162 + * free it. */
20163 + if (atomic_read(&node->x_count) > 0)
20164 + return 1;
20165 + /* also, don't free znode that has children in memory */
20166 + if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
20167 + return 1;
20168 + return 0;
20169 +}
20170 +
20171 +/*
20172 + * this is called as part of removing jnode. Based on jnode type, call
20173 + * corresponding function that removes jnode from indices and returns it back
20174 + * to the appropriate slab (through RCU).
20175 + */
20176 +static inline void
20177 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
20178 +{
20179 + switch (jtype) {
20180 + case JNODE_UNFORMATTED_BLOCK:
20181 + remove_jnode(node, tree);
20182 + break;
20183 + case JNODE_IO_HEAD:
20184 + case JNODE_BITMAP:
20185 + break;
20186 + case JNODE_INODE:
20187 + break;
20188 + case JNODE_FORMATTED_BLOCK:
20189 + remove_znode(node, tree);
20190 + break;
20191 + default:
20192 + wrong_return_value("nikita-3196", "Wrong jnode type");
20193 + }
20194 +}
20195 +
20196 +/*
20197 + * this is called as part of deleting jnode. Based on jnode type, call
20198 + * corresponding function that removes jnode from indices and returns it back
20199 + * to the appropriate slab (through RCU).
20200 + *
20201 + * This differs from jnode_remove() only for formatted nodes---for them
20202 + * sibling list handling is different for removal and deletion.
20203 + */
20204 +static inline void
20205 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
20206 +{
20207 + switch (jtype) {
20208 + case JNODE_UNFORMATTED_BLOCK:
20209 + remove_jnode(node, tree);
20210 + break;
20211 + case JNODE_IO_HEAD:
20212 + case JNODE_BITMAP:
20213 + break;
20214 + case JNODE_FORMATTED_BLOCK:
20215 + delete_znode(node, tree);
20216 + break;
20217 + case JNODE_INODE:
20218 + default:
20219 + wrong_return_value("nikita-3195", "Wrong jnode type");
20220 + }
20221 +}
20222 +
20223 +#if REISER4_DEBUG
20224 +/*
20225 + * remove jnode from the debugging list of all jnodes hanging off super-block.
20226 + */
20227 +void jnode_list_remove(jnode * node)
20228 +{
20229 + reiser4_super_info_data *sbinfo;
20230 +
20231 + sbinfo = get_super_private(jnode_get_tree(node)->super);
20232 +
20233 + spin_lock_irq(&sbinfo->all_guard);
20234 + assert("nikita-2422", !list_empty(&node->jnodes));
20235 + list_del_init(&node->jnodes);
20236 + spin_unlock_irq(&sbinfo->all_guard);
20237 +}
20238 +#endif
20239 +
20240 +/*
20241 + * this is called by jput_final() to remove jnode when last reference to it is
20242 + * released.
20243 + */
20244 +static int jnode_try_drop(jnode * node)
20245 +{
20246 + int result;
20247 + reiser4_tree *tree;
20248 + jnode_type jtype;
20249 +
20250 + assert("nikita-2491", node != NULL);
20251 + assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
20252 +
20253 + tree = jnode_get_tree(node);
20254 + jtype = jnode_get_type(node);
20255 +
20256 + spin_lock_jnode(node);
20257 + write_lock_tree(tree);
20258 + /*
20259 + * if jnode has a page---leave it alone. Memory pressure will
20260 + * eventually kill page and jnode.
20261 + */
20262 + if (jnode_page(node) != NULL) {
20263 + write_unlock_tree(tree);
20264 + spin_unlock_jnode(node);
20265 + JF_CLR(node, JNODE_RIP);
20266 + return RETERR(-EBUSY);
20267 + }
20268 +
20269 + /* re-check ->x_count under tree lock. */
20270 + result = jnode_is_busy(node, jtype);
20271 + if (result == 0) {
20272 + assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20273 + assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
20274 +
20275 + spin_unlock_jnode(node);
20276 +		/* no page and no references---dispatch it. */
20277 + jnode_remove(node, jtype, tree);
20278 + write_unlock_tree(tree);
20279 + jnode_free(node, jtype);
20280 + } else {
20281 + /* busy check failed: reference was acquired by concurrent
20282 + * thread. */
20283 + write_unlock_tree(tree);
20284 + spin_unlock_jnode(node);
20285 + JF_CLR(node, JNODE_RIP);
20286 + }
20287 + return result;
20288 +}
20289 +
20290 +/* jdelete() -- Delete jnode from the tree and file system */
20291 +static int jdelete(jnode * node/* jnode to finish with */)
20292 +{
20293 + struct page *page;
20294 + int result;
20295 + reiser4_tree *tree;
20296 + jnode_type jtype;
20297 +
20298 + assert("nikita-467", node != NULL);
20299 + assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
20300 +
20301 + jtype = jnode_get_type(node);
20302 +
20303 + page = jnode_lock_page(node);
20304 + assert_spin_locked(&(node->guard));
20305 +
20306 + tree = jnode_get_tree(node);
20307 +
20308 + write_lock_tree(tree);
20309 + /* re-check ->x_count under tree lock. */
20310 + result = jnode_is_busy(node, jtype);
20311 + if (likely(!result)) {
20312 + assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20313 + assert("jmacd-511", atomic_read(&node->d_count) == 0);
20314 +
20315 + /* detach page */
20316 + if (page != NULL) {
20317 + /*
20318 + * FIXME this is racy against jnode_extent_write().
20319 + */
20320 + page_clear_jnode(page, node);
20321 + }
20322 + spin_unlock_jnode(node);
20323 + /* goodbye */
20324 + jnode_delete(node, jtype, tree);
20325 + write_unlock_tree(tree);
20326 + jnode_free(node, jtype);
20327 + /* @node is no longer valid pointer */
20328 + if (page != NULL)
20329 + reiser4_drop_page(page);
20330 + } else {
20331 + /* busy check failed: reference was acquired by concurrent
20332 + * thread. */
20333 + JF_CLR(node, JNODE_RIP);
20334 + write_unlock_tree(tree);
20335 + spin_unlock_jnode(node);
20336 + if (page != NULL)
20337 + unlock_page(page);
20338 + }
20339 + return result;
20340 +}
20341 +
20342 +/* drop jnode on the floor.
20343 +
20344 + Return value:
20345 +
20346 + -EBUSY: failed to drop jnode, because there are still references to it
20347 +
20348 + 0: successfully dropped jnode
20349 +
20350 +*/
20351 +static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
20352 +{
20353 + struct page *page;
20354 + jnode_type jtype;
20355 + int result;
20356 +
20357 + assert("zam-602", node != NULL);
20358 + assert_rw_not_read_locked(&(tree->tree_lock));
20359 + assert_rw_not_write_locked(&(tree->tree_lock));
20360 + assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20361 +
20362 + jtype = jnode_get_type(node);
20363 +
20364 + page = jnode_lock_page(node);
20365 + assert_spin_locked(&(node->guard));
20366 +
20367 + write_lock_tree(tree);
20368 +
20369 + /* re-check ->x_count under tree lock. */
20370 + result = jnode_is_busy(node, jtype);
20371 + if (!result) {
20372 + assert("nikita-2488", page == jnode_page(node));
20373 + assert("nikita-2533", atomic_read(&node->d_count) == 0);
20374 + if (page != NULL) {
20375 + assert("nikita-2126", !PageDirty(page));
20376 + assert("nikita-2127", PageUptodate(page));
20377 + assert("nikita-2181", PageLocked(page));
20378 + page_clear_jnode(page, node);
20379 + }
20380 + spin_unlock_jnode(node);
20381 + jnode_remove(node, jtype, tree);
20382 + write_unlock_tree(tree);
20383 + jnode_free(node, jtype);
20384 + if (page != NULL)
20385 + reiser4_drop_page(page);
20386 + } else {
20387 + /* busy check failed: reference was acquired by concurrent
20388 + * thread. */
20389 + JF_CLR(node, JNODE_RIP);
20390 + write_unlock_tree(tree);
20391 + spin_unlock_jnode(node);
20392 + if (page != NULL)
20393 + unlock_page(page);
20394 + }
20395 + return result;
20396 +}
20397 +
20398 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
20399 + be 0 (where applicable). */
20400 +void jdrop(jnode * node)
20401 +{
20402 + jdrop_in_tree(node, jnode_get_tree(node));
20403 +}
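+
+/*
+ * Editorial note, not part of the original patch: jnode_try_drop(), jdelete()
+ * and jdrop_in_tree() share the same shape -- lock page and jnode, re-check
+ * jnode_is_busy() under the tree lock, then either free the jnode or clear
+ * JNODE_RIP and back off. They differ in whether the node was marked
+ * JNODE_HEARD_BANSHEE (jdelete) or not (jdrop), and in page handling.
+ */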
20404 +
20405 +/* IO head jnode implementation. The io heads are simple jnodes with limited
20406 +   functionality (these jnodes are not in any hash table), used just for
20407 +   reading from and writing to disk. */
20408 +
20409 +jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20410 +{
20411 + jnode *jal = jalloc();
20412 +
20413 +	if (jal != NULL) {
20414 +		jnode_init(jal, current_tree, JNODE_IO_HEAD);
20415 +		jnode_set_block(jal, block);
20416 +		/* jref(NULL) would trip an assertion; take the initial
20417 +		 * reference only on successful allocation */
20418 +		jref(jal);
20419 +	}
20419 +
20420 + return jal;
20421 +}
20422 +
20423 +void reiser4_drop_io_head(jnode * node)
20424 +{
20425 + assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20426 +
20427 + jput(node);
20428 + jdrop(node);
20429 +}
20430 +
20431 +/* protect jnode data from being freed by reiser4_releasepage() */
20432 +void pin_jnode_data(jnode * node)
20433 +{
20434 + assert("zam-671", jnode_page(node) != NULL);
20435 + page_cache_get(jnode_page(node));
20436 +}
20437 +
20438 +/* make jnode data free-able again */
20439 +void unpin_jnode_data(jnode * node)
20440 +{
20441 + assert("zam-672", jnode_page(node) != NULL);
20442 + page_cache_release(jnode_page(node));
20443 +}
20444 +
20445 +struct address_space *jnode_get_mapping(const jnode * node)
20446 +{
20447 + assert("nikita-3162", node != NULL);
20448 + return jnode_ops(node)->mapping(node);
20449 +}
20450 +
20451 +#if REISER4_DEBUG
20452 +/* debugging aid: jnode invariant */
20453 +int jnode_invariant_f(const jnode * node, char const **msg)
20454 +{
20455 +#define _ergo(ant, con) \
20456 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20457 +#define _check(exp) ((*msg) = #exp, (exp))
20458 +
20459 + return _check(node != NULL) &&
20460 + /* [jnode-queued] */
20461 + /* only relocated node can be queued, except that when znode
20462 + * is being deleted, its JNODE_RELOC bit is cleared */
20463 + _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20464 + JF_ISSET(node, JNODE_RELOC) ||
20465 + JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20466 + _check(node->jnodes.prev != NULL) &&
20467 + _check(node->jnodes.next != NULL) &&
20468 + /* [jnode-dirty] invariant */
20469 +	    /* dirty jnode is part of an atom */
20470 + _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20471 + /* [jnode-oid] invariant */
20472 + /* for unformatted node ->objectid and ->mapping fields are
20473 + * consistent */
20474 + _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20475 + node->key.j.objectid ==
20476 + get_inode_oid(node->key.j.mapping->host)) &&
20477 + /* [jnode-atom-valid] invariant */
20478 + /* node atom has valid state */
20479 + _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20480 + /* [jnode-page-binding] invariant */
20481 + /* if node points to page, it points back to node */
20482 + _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20483 + /* [jnode-refs] invariant */
20484 + /* only referenced jnode can be loaded */
20485 + _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20486 +
20487 +}
20488 +
20489 +static const char *jnode_type_name(jnode_type type)
20490 +{
20491 + switch (type) {
20492 + case JNODE_UNFORMATTED_BLOCK:
20493 + return "unformatted";
20494 + case JNODE_FORMATTED_BLOCK:
20495 + return "formatted";
20496 + case JNODE_BITMAP:
20497 + return "bitmap";
20498 + case JNODE_IO_HEAD:
20499 + return "io head";
20500 + case JNODE_INODE:
20501 + return "inode";
20502 + case LAST_JNODE_TYPE:
20503 + return "last";
20504 + default:{
20505 + static char unknown[30];
20506 +
20507 + sprintf(unknown, "unknown %i", type);
20508 + return unknown;
20509 + }
20510 + }
20511 +}
20512 +
20513 +#define jnode_state_name(node, flag) \
20514 + (JF_ISSET((node), (flag)) ? ((#flag "|")+6) : "")
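+/* the "+ 6" above skips the "JNODE_" prefix of the stringified flag name */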
20515 +
20516 +/* debugging aid: output human readable information about @node */
20517 +static void info_jnode(const char *prefix /* prefix to print */ ,
20518 + const jnode * node/* node to print */)
20519 +{
20520 + assert("umka-068", prefix != NULL);
20521 +
20522 + if (node == NULL) {
20523 + printk("%s: null\n", prefix);
20524 + return;
20525 + }
20526 +
20527 + printk
20528 + ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20529 + " block: %s, d_count: %d, x_count: %d, "
20530 + "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20531 + node->state,
20532 + jnode_state_name(node, JNODE_PARSED),
20533 + jnode_state_name(node, JNODE_HEARD_BANSHEE),
20534 + jnode_state_name(node, JNODE_LEFT_CONNECTED),
20535 + jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20536 + jnode_state_name(node, JNODE_ORPHAN),
20537 + jnode_state_name(node, JNODE_CREATED),
20538 + jnode_state_name(node, JNODE_RELOC),
20539 + jnode_state_name(node, JNODE_OVRWR),
20540 + jnode_state_name(node, JNODE_DIRTY),
20541 + jnode_state_name(node, JNODE_IS_DYING),
20542 + jnode_state_name(node, JNODE_RIP),
20543 + jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20544 + jnode_state_name(node, JNODE_WRITEBACK),
20545 + jnode_state_name(node, JNODE_NEW),
20546 + jnode_state_name(node, JNODE_DKSET),
20547 + jnode_state_name(node, JNODE_REPACK),
20548 + jnode_state_name(node, JNODE_CLUSTER_PAGE),
20549 + jnode_get_level(node), sprint_address(jnode_get_block(node)),
20550 + atomic_read(&node->d_count), atomic_read(&node->x_count),
20551 + jnode_page(node), node->atom, 0, 0,
20552 + jnode_type_name(jnode_get_type(node)));
20553 + if (jnode_is_unformatted(node)) {
20554 + printk("inode: %llu, index: %lu, ",
20555 + node->key.j.objectid, node->key.j.index);
20556 + }
20557 +}
20558 +
20559 +/* debugging aid: check znode invariant and panic if it doesn't hold */
20560 +static int jnode_invariant(jnode * node, int tlocked, int jlocked)
20561 +{
20562 + char const *failed_msg;
20563 + int result;
20564 + reiser4_tree *tree;
20565 +
20566 + tree = jnode_get_tree(node);
20567 +
20568 + assert("umka-063312", node != NULL);
20569 + assert("umka-064321", tree != NULL);
20570 +
20571 + if (!jlocked && !tlocked)
20572 + spin_lock_jnode((jnode *) node);
20573 + if (!tlocked)
20574 + read_lock_tree(jnode_get_tree(node));
20575 + result = jnode_invariant_f(node, &failed_msg);
20576 + if (!result) {
20577 + info_jnode("corrupted node", node);
20578 + warning("jmacd-555", "Condition %s failed", failed_msg);
20579 + }
20580 + if (!tlocked)
20581 + read_unlock_tree(jnode_get_tree(node));
20582 + if (!jlocked && !tlocked)
20583 + spin_unlock_jnode((jnode *) node);
20584 + return result;
20585 +}
20586 +
20587 +#endif /* REISER4_DEBUG */
20588 +
20589 +/* Make Linus happy.
20590 + Local variables:
20591 + c-indentation-style: "K&R"
20592 + mode-name: "LC"
20593 + c-basic-offset: 8
20594 + tab-width: 8
20595 + fill-column: 80
20596 + End:
20597 +*/
20598 diff -urN linux-2.6.33.orig/fs/reiser4/jnode.h linux-2.6.33/fs/reiser4/jnode.h
20599 --- linux-2.6.33.orig/fs/reiser4/jnode.h 1970-01-01 01:00:00.000000000 +0100
20600 +++ linux-2.6.33/fs/reiser4/jnode.h 2010-03-04 19:33:22.000000000 +0100
20601 @@ -0,0 +1,704 @@
20602 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20603 + * reiser4/README */
20604 +
20605 +/* Declaration of jnode. See jnode.c for details. */
20606 +
20607 +#ifndef __JNODE_H__
20608 +#define __JNODE_H__
20609 +
20610 +#include "forward.h"
20611 +#include "type_safe_hash.h"
20612 +#include "txnmgr.h"
20613 +#include "key.h"
20614 +#include "debug.h"
20615 +#include "dformat.h"
20616 +#include "page_cache.h"
20617 +#include "context.h"
20618 +
20619 +#include "plugin/plugin.h"
20620 +
20621 +#include <linux/fs.h>
20622 +#include <linux/mm.h>
20623 +#include <linux/spinlock.h>
20624 +#include <asm/atomic.h>
20625 +#include <linux/bitops.h>
20626 +#include <linux/list.h>
20627 +#include <linux/rcupdate.h>
20628 +
20629 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
20630 + nodes) */
20631 +TYPE_SAFE_HASH_DECLARE(j, jnode);
20632 +
20633 +/* declare hash table of znodes */
20634 +TYPE_SAFE_HASH_DECLARE(z, znode);
20635 +
20636 +struct jnode_key {
20637 + __u64 objectid;
20638 + unsigned long index;
20639 + struct address_space *mapping;
20640 +};
20641 +
20642 +/*
20643 +   Jnode is the "base class" of other nodes in reiser4. It also happens to
20644 +   be exactly the node we use for unformatted tree nodes.
20645 +
20646 +   Jnode provides the following basic functionality:
20647 +
20648 + . reference counting and indexing.
20649 +
20650 +   . integration with the page cache. Jnode has a ->pg field to which a page
20651 +   can be attached.
20652 +
20653 + . interface to transaction manager. It is jnode that is kept in transaction
20654 + manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20655 + means, there should be special type of jnode for inode.)
20656 +
20657 + Locking:
20658 +
20659 + Spin lock: the following fields are protected by the per-jnode spin lock:
20660 +
20661 + ->state
20662 + ->atom
20663 + ->capture_link
20664 +
20665 + Following fields are protected by the global tree lock:
20666 +
20667 + ->link
20668 + ->key.z (content of ->key.z is only changed in znode_rehash())
20669 + ->key.j
20670 +
20671 + Atomic counters
20672 +
20673 + ->x_count
20674 + ->d_count
20675 +
20676 + ->pg, and ->data are protected by spin lock for unused jnode and are
20677 + immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20678 + is false).
20679 +
20680 + ->tree is immutable after creation
20681 +
20682 + Unclear
20683 +
20684 + ->blocknr: should be under jnode spin-lock, but current interface is based
20685 + on passing of block address.
20686 +
20687 + If you ever need to spin lock two nodes at once, do this in "natural"
20688 + memory order: lock znode with lower address first. (See lock_two_nodes().)
20689 +
20690 + Invariants involving this data-type:
20691 +
20692 + [jnode-dirty]
20693 + [jnode-refs]
20694 + [jnode-oid]
20695 + [jnode-queued]
20696 + [jnode-atom-valid]
20697 + [jnode-page-binding]
20698 +*/
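+
+/*
+ * Editorial sketch, not part of the original patch: taking two jnode spin
+ * locks in the "natural" memory order prescribed above (lock_two_nodes() is
+ * referenced but not shown in this hunk; this is a minimal equivalent,
+ * assuming the kernel's swap() helper):
+ *
+ *	static void lock_pair_sketch(jnode *a, jnode *b)
+ *	{
+ *		if (a > b)
+ *			swap(a, b);	// lower address first
+ *		spin_lock_jnode(a);
+ *		spin_lock_jnode(b);
+ *	}
+ */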
20699 +
20700 +struct jnode {
20701 +#if REISER4_DEBUG
20702 +#define JMAGIC 0x52654973 /* "ReIs" */
20703 + int magic;
20704 +#endif
20705 + /* FIRST CACHE LINE (16 bytes): data used by jload */
20706 +
20707 + /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20708 + /* 0 */ unsigned long state;
20709 +
20710 + /* lock, protecting jnode's fields. */
20711 + /* 4 */ spinlock_t load;
20712 +
20713 + /* counter of references to jnode itself. Increased on jref().
20714 + Decreased on jput().
20715 + */
20716 + /* 8 */ atomic_t x_count;
20717 +
20718 + /* counter of references to jnode's data. Pin data page(s) in
20719 + memory while this is greater than 0. Increased on jload().
20720 + Decreased on jrelse().
20721 + */
20722 + /* 12 */ atomic_t d_count;
20723 +
20724 + /* SECOND CACHE LINE: data used by hash table lookups */
20725 +
20726 + /* 16 */ union {
20727 + /* znodes are hashed by block number */
20728 + reiser4_block_nr z;
20729 + /* unformatted nodes are hashed by mapping plus offset */
20730 + struct jnode_key j;
20731 + } key;
20732 +
20733 + /* THIRD CACHE LINE */
20734 +
20735 + /* 32 */ union {
20736 + /* pointers to maintain hash-table */
20737 + z_hash_link z;
20738 + j_hash_link j;
20739 + } link;
20740 +
20741 + /* pointer to jnode page. */
20742 + /* 36 */ struct page *pg;
20743 + /* pointer to node itself. This is page_address(node->pg) when page is
20744 + attached to the jnode
20745 + */
20746 + /* 40 */ void *data;
20747 +
20748 + /* 44 */ reiser4_tree *tree;
20749 +
20750 + /* FOURTH CACHE LINE: atom related fields */
20751 +
20752 + /* 48 */ spinlock_t guard;
20753 +
20754 + /* atom the block is in, if any */
20755 + /* 52 */ txn_atom *atom;
20756 +
20757 + /* capture list */
20758 + /* 56 */ struct list_head capture_link;
20759 +
20760 + /* FIFTH CACHE LINE */
20761 +
20762 + /* 64 */ struct rcu_head rcu;
20763 + /* crosses cache line */
20764 +
20765 + /* SIXTH CACHE LINE */
20766 +
20767 + /* the real blocknr (where io is going to/from) */
20768 + /* 80 */ reiser4_block_nr blocknr;
20769 +	/* Parent item type; unformatted and CRC nodes need it for
20770 +	 * offset => key conversion. */
20771 + /* NOTE: this parent_item_id looks like jnode type. */
20772 + /* 88 */ reiser4_plugin_id parent_item_id;
20773 + /* 92 */
20774 +#if REISER4_DEBUG
20775 + /* list of all jnodes for debugging purposes. */
20776 + struct list_head jnodes;
20777 + /* how many times this jnode was written in one transaction */
20778 + int written;
20779 + /* this indicates which atom's list the jnode is on */
20780 + atom_list list;
20781 +#endif
20782 +} __attribute__ ((aligned(16)));
20783 +
20784 +/*
20785 + * jnode types. Enumeration of existing jnode types.
20786 + */
20787 +typedef enum {
20788 + JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20789 + JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20790 + JNODE_BITMAP, /* bitmap */
20791 + JNODE_IO_HEAD, /* jnode representing a block in the
20792 + * wandering log */
20793 + JNODE_INODE, /* jnode embedded into inode */
20794 + LAST_JNODE_TYPE
20795 +} jnode_type;
20796 +
20797 +/* jnode states */
20798 +typedef enum {
20799 + /* jnode's page is loaded and data checked */
20800 + JNODE_PARSED = 0,
20801 +	/* node was deleted, but not all locks on it have been released.
20802 +	   This node is empty and is going to be removed from the tree
20803 +	   shortly. */
20804 + JNODE_HEARD_BANSHEE = 1,
20805 + /* left sibling pointer is valid */
20806 + JNODE_LEFT_CONNECTED = 2,
20807 + /* right sibling pointer is valid */
20808 + JNODE_RIGHT_CONNECTED = 3,
20809 +
20810 + /* znode was just created and doesn't yet have a pointer from
20811 + its parent */
20812 + JNODE_ORPHAN = 4,
20813 +
20814 + /* this node was created by its transaction and has not been assigned
20815 + a block address. */
20816 + JNODE_CREATED = 5,
20817 +
20818 + /* this node is currently relocated */
20819 + JNODE_RELOC = 6,
20820 + /* this node is currently wandered */
20821 + JNODE_OVRWR = 7,
20822 +
20823 + /* this znode has been modified */
20824 + JNODE_DIRTY = 8,
20825 +
20826 + /* znode lock is being invalidated */
20827 + JNODE_IS_DYING = 9,
20828 +
20829 + /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20830 +
20831 + /* jnode is queued for flushing. */
20832 + JNODE_FLUSH_QUEUED = 12,
20833 +
20834 + /* In the following bits jnode type is encoded. */
20835 + JNODE_TYPE_1 = 13,
20836 + JNODE_TYPE_2 = 14,
20837 + JNODE_TYPE_3 = 15,
20838 +
20839 + /* jnode is being destroyed */
20840 + JNODE_RIP = 16,
20841 +
20842 +	/* znode was not captured during locking (this can happen because
20843 +	   ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20844 + JNODE_MISSED_IN_CAPTURE = 17,
20845 +
20846 + /* write is in progress */
20847 + JNODE_WRITEBACK = 18,
20848 +
20849 + /* FIXME: now it is used by crypto-compress plugin only */
20850 + JNODE_NEW = 19,
20851 +
20852 + /* delimiting keys are already set for this znode. */
20853 + JNODE_DKSET = 20,
20854 +
20855 +	/* when this bit is set, the page and jnode cannot be disconnected */
20856 + JNODE_WRITE_PREPARED = 21,
20857 +
20858 + JNODE_CLUSTER_PAGE = 22,
20859 +	/* Jnode is marked for repacking, which means the reiser4 flush and the
20860 +	 * block allocator should process this node in a special way */
20861 + JNODE_REPACK = 23,
20862 + /* node should be converted by flush in squalloc phase */
20863 + JNODE_CONVERTIBLE = 24,
20864 + /*
20865 +	 * When a jnode is dirtied for the first time in a given transaction,
20866 +	 * do_jnode_make_dirty() checks whether this jnode can possibly become
20867 +	 * a member of the overwrite set. If so, this bit is set, and one block
20868 +	 * is reserved in the ->flush_reserved space of the atom.
20869 + *
20870 + * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20871 + *
20872 + * (1) flush decides that we want this block to go into relocate
20873 + * set after all.
20874 + *
20875 + * (2) wandering log is allocated (by log writer)
20876 + *
20877 + * (3) extent is allocated
20878 + *
20879 + */
20880 + JNODE_FLUSH_RESERVED = 29
20881 +} reiser4_jnode_state;
20882 +
20883 +/* Inline helpers for accessing the jnode state. */
20884 +
20885 +static inline void JF_CLR(jnode * j, int f)
20886 +{
20887 + assert("unknown-1", j->magic == JMAGIC);
20888 + clear_bit(f, &j->state);
20889 +}
20890 +static inline int JF_ISSET(const jnode * j, int f)
20891 +{
20892 + assert("unknown-2", j->magic == JMAGIC);
20893 + return test_bit(f, &((jnode *) j)->state);
20894 +}
20895 +static inline void JF_SET(jnode * j, int f)
20896 +{
20897 + assert("unknown-3", j->magic == JMAGIC);
20898 + set_bit(f, &j->state);
20899 +}
20900 +
20901 +static inline int JF_TEST_AND_SET(jnode * j, int f)
20902 +{
20903 + assert("unknown-4", j->magic == JMAGIC);
20904 + return test_and_set_bit(f, &j->state);
20905 +}
20906 +
20907 +static inline void spin_lock_jnode(jnode *node)
20908 +{
20909 + /* check that spinlocks of lower priorities are not held */
20910 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20911 + LOCK_CNT_NIL(spin_locked_txnh) &&
20912 + LOCK_CNT_NIL(spin_locked_zlock) &&
20913 + LOCK_CNT_NIL(rw_locked_dk) &&
20914 + LOCK_CNT_LT(spin_locked_jnode, 2)));
20915 +
20916 + spin_lock(&(node->guard));
20917 +
20918 + LOCK_CNT_INC(spin_locked_jnode);
20919 + LOCK_CNT_INC(spin_locked);
20920 +}
20921 +
20922 +static inline void spin_unlock_jnode(jnode *node)
20923 +{
20924 + assert_spin_locked(&(node->guard));
20925 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20926 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20927 +
20928 + LOCK_CNT_DEC(spin_locked_jnode);
20929 + LOCK_CNT_DEC(spin_locked);
20930 +
20931 + spin_unlock(&(node->guard));
20932 +}
20933 +
20934 +static inline int jnode_is_in_deleteset(const jnode * node)
20935 +{
20936 + return JF_ISSET(node, JNODE_RELOC);
20937 +}
20938 +
20939 +extern int init_jnodes(void);
20940 +extern void done_jnodes(void);
20941 +
20942 +/* Jnode routines */
20943 +extern jnode *jalloc(void);
20944 +extern void jfree(jnode * node) NONNULL;
20945 +extern jnode *jclone(jnode *);
20946 +extern jnode *jlookup(reiser4_tree * tree,
20947 + oid_t objectid, unsigned long ind) NONNULL;
20948 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20949 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
20950 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
20951 +void jnode_attach_page(jnode * node, struct page *pg);
20952 +
20953 +void unhash_unformatted_jnode(jnode *);
20954 +extern jnode *page_next_jnode(jnode * node) NONNULL;
20955 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20956 +extern void jnode_make_dirty(jnode * node) NONNULL;
20957 +extern void jnode_make_clean(jnode * node) NONNULL;
20958 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20959 +extern void jnode_make_wander(jnode *) NONNULL;
20960 +extern void znode_make_reloc(znode * , flush_queue_t *) NONNULL;
20961 +extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20962 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20963 +
20964 +/**
20965 + * jnode_get_block
20966 + * @node: jnode to query
20967 + *
20968 + */
20969 +static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20970 +{
20971 + assert("nikita-528", node != NULL);
20972 +
20973 + return &node->blocknr;
20974 +}
20975 +
20976 +/**
20977 + * jnode_set_block
20978 + * @node: jnode to update
20979 + * @blocknr: new block nr
20980 + */
20981 +static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20982 +{
20983 + assert("nikita-2020", node != NULL);
20984 + assert("umka-055", blocknr != NULL);
20985 + node->blocknr = *blocknr;
20986 +}
20987 +
20988 +
20989 +/* block number for IO. Usually this is the same as jnode_get_block(), unless
20990 + * jnode was emergency flushed---then block number chosen by eflush is
20991 + * used. */
20992 +static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20993 +{
20994 + assert("nikita-2768", node != NULL);
20995 + assert_spin_locked(&(node->guard));
20996 +
20997 + return jnode_get_block(node);
20998 +}
20999 +
21000 +/* Jnode flush interface. */
21001 +extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos);
21002 +extern flush_queue_t *reiser4_pos_fq(flush_pos_t *pos);
21003 +
21004 +/* FIXME-VS: these are used in plugin/item/extent.c */
21005 +
21006 +/* does extent_get_block have to be called */
21007 +#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
21008 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
21009 +
21010 +/* the node should be converted during flush squalloc phase */
21011 +#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
21012 +#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
21013 +
21014 +/* Macros to convert from jnode to znode, znode to jnode. These are macros
21015 + because C doesn't allow overloading of const prototypes. */
21016 +#define ZJNODE(x) (&(x)->zjnode)
21017 +#define JZNODE(x) \
21018 +({ \
21019 + typeof(x) __tmp_x; \
21020 + \
21021 + __tmp_x = (x); \
21022 + assert("jmacd-1300", jnode_is_znode(__tmp_x)); \
21023 + (znode*) __tmp_x; \
21024 +})
21025 +
21026 +extern int jnodes_tree_init(reiser4_tree * tree);
21027 +extern int jnodes_tree_done(reiser4_tree * tree);
21028 +
21029 +#if REISER4_DEBUG
21030 +
21031 +extern int znode_is_any_locked(const znode * node);
21032 +extern void jnode_list_remove(jnode * node);
21033 +
21034 +#else
21035 +
21036 +#define jnode_list_remove(node) noop
21037 +
21038 +#endif
21039 +
21040 +int znode_is_root(const znode * node) NONNULL;
21041 +
21042 +/* bump reference counter on @node */
21043 +static inline void add_x_ref(jnode * node/* node to increase x_count of */)
21044 +{
21045 + assert("nikita-1911", node != NULL);
21046 +
21047 + atomic_inc(&node->x_count);
21048 + LOCK_CNT_INC(x_refs);
21049 +}
21050 +
21051 +static inline void dec_x_ref(jnode * node)
21052 +{
21053 + assert("nikita-3215", node != NULL);
21054 + assert("nikita-3216", atomic_read(&node->x_count) > 0);
21055 +
21056 + atomic_dec(&node->x_count);
21057 + assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
21058 + LOCK_CNT_DEC(x_refs);
21059 +}
21060 +
21061 +/* jref() - increase counter of references to jnode/znode (x_count) */
21062 +static inline jnode *jref(jnode * node)
21063 +{
21064 + assert("jmacd-508", (node != NULL) && !IS_ERR(node));
21065 + add_x_ref(node);
21066 + return node;
21067 +}
21068 +
21069 +/* get the page of jnode */
21070 +static inline struct page *jnode_page(const jnode * node)
21071 +{
21072 + return node->pg;
21073 +}
21074 +
21075 +/* return pointer to jnode data */
21076 +static inline char *jdata(const jnode * node)
21077 +{
21078 + assert("nikita-1415", node != NULL);
21079 + assert("nikita-3198", jnode_page(node) != NULL);
21080 + return node->data;
21081 +}
21082 +
21083 +static inline int jnode_is_loaded(const jnode * node)
21084 +{
21085 + assert("zam-506", node != NULL);
21086 + return atomic_read(&node->d_count) > 0;
21087 +}
21088 +
21089 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
21090 +
21091 +static inline void jnode_set_reloc(jnode * node)
21092 +{
21093 + assert("nikita-2431", node != NULL);
21094 + assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
21095 + JF_SET(node, JNODE_RELOC);
21096 +}
21097 +
21098 +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
21099 +
21100 +extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
21101 +
21102 +static inline int jload(jnode *node)
21103 +{
21104 + return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
21105 +}
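
The comment above likens jload/jwrite/junload to bread/bwrite/brelse. A minimal hedged sketch of the resulting load/use/release discipline, using only functions declared in this header; peek_first_byte is a hypothetical caller, and the 0-on-success return convention is an assumption:

static int peek_first_byte(jnode *node, char *out)
{
        int ret;

        ret = jload(node);      /* pin the page and map the data */
        if (ret != 0)
                return ret;     /* assumed: 0 on success, negative on error */
        *out = jdata(node)[0];  /* jdata() is valid only while loaded */
        jrelse(node);           /* release what jload() acquired */
        return 0;
}
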
21106 +
21107 +extern int jinit_new(jnode *, gfp_t) NONNULL;
21108 +extern int jstartio(jnode *) NONNULL;
21109 +
21110 +extern void jdrop(jnode *) NONNULL;
21111 +extern int jwait_io(jnode *, int rw) NONNULL;
21112 +
21113 +void jload_prefetch(jnode *);
21114 +
21115 +extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
21116 +extern void reiser4_drop_io_head(jnode * node) NONNULL;
21117 +
21118 +static inline reiser4_tree *jnode_get_tree(const jnode * node)
21119 +{
21120 + assert("nikita-2691", node != NULL);
21121 + return node->tree;
21122 +}
21123 +
21124 +extern void pin_jnode_data(jnode *);
21125 +extern void unpin_jnode_data(jnode *);
21126 +
21127 +static inline jnode_type jnode_get_type(const jnode * node)
21128 +{
21129 + static const unsigned long state_mask =
21130 + (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
21131 +
21132 + static jnode_type mask_to_type[] = {
21133 + /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
21134 +
21135 + /* 000 */
21136 + [0] = JNODE_FORMATTED_BLOCK,
21137 + /* 001 */
21138 + [1] = JNODE_UNFORMATTED_BLOCK,
21139 + /* 010 */
21140 + [2] = JNODE_BITMAP,
21141 + /* 011 */
21142 + [3] = LAST_JNODE_TYPE, /*invalid */
21143 + /* 100 */
21144 + [4] = JNODE_INODE,
21145 + /* 101 */
21146 + [5] = LAST_JNODE_TYPE,
21147 + /* 110 */
21148 + [6] = JNODE_IO_HEAD,
21149 + /* 111 */
21150 + [7] = LAST_JNODE_TYPE, /* invalid */
21151 + };
21152 +
21153 + return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
21154 +}
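
jnode_get_type() decodes three adjacent state bits into a table index. A standalone sketch of the same decode; the bit positions (4, 5, 6) are hypothetical stand-ins for the real JNODE_TYPE_* flags, which are defined elsewhere in the tree:

#include <stdio.h>

/* hypothetical bit positions for the three type bits */
enum { TYPE_1 = 4, TYPE_2 = 5, TYPE_3 = 6 };

static const char *names[8] = {
        "formatted", "unformatted", "bitmap", "invalid",
        "inode", "invalid", "io-head", "invalid",
};

int main(void)
{
        unsigned long mask = (1UL << TYPE_1) | (1UL << TYPE_2) | (1UL << TYPE_3);
        unsigned long state = (1UL << TYPE_3) | (1UL << TYPE_2);  /* bits 110 */

        printf("%s\n", names[(state & mask) >> TYPE_1]);  /* prints "io-head" */
        return 0;
}
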
21155 +
21156 +/* returns true if node is a znode */
21157 +static inline int jnode_is_znode(const jnode * node)
21158 +{
21159 + return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
21160 +}
21161 +
21162 +static inline int jnode_is_flushprepped(jnode * node)
21163 +{
21164 + assert("jmacd-78212", node != NULL);
21165 + assert_spin_locked(&(node->guard));
21166 + return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
21167 + JF_ISSET(node, JNODE_OVRWR);
21168 +}
21169 +
21170 +/* Return true if @node has already been processed by the squeeze and allocate
21171 + process. This implies the block address has been finalized for the
21172 + duration of this atom (or it is clean and will remain in place). If this
21173 + returns true you may use the block number as a hint. */
21174 +static inline int jnode_check_flushprepped(jnode * node)
21175 +{
21176 + int result;
21177 +
21178 + /* It must be clean or relocated or wandered. New allocations are set
21179 + * to relocate. */
21180 + spin_lock_jnode(node);
21181 + result = jnode_is_flushprepped(node);
21182 + spin_unlock_jnode(node);
21183 + return result;
21184 +}
21185 +
21186 +/* returns true if node is unformatted */
21187 +static inline int jnode_is_unformatted(const jnode * node)
21188 +{
21189 + assert("jmacd-0123", node != NULL);
21190 + return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
21191 +}
21192 +
21193 +/* returns true if node represents a cluster cache page */
21194 +static inline int jnode_is_cluster_page(const jnode * node)
21195 +{
21196 + assert("edward-50", node != NULL);
21197 + return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
21198 +}
21199 +
21200 +/* returns true if node is a built-in inode's jnode */
21201 +static inline int jnode_is_inode(const jnode * node)
21202 +{
21203 + assert("vs-1240", node != NULL);
21204 + return jnode_get_type(node) == JNODE_INODE;
21205 +}
21206 +
21207 +static inline jnode_plugin *jnode_ops_of(const jnode_type type)
21208 +{
21209 + assert("nikita-2367", type < LAST_JNODE_TYPE);
21210 + return jnode_plugin_by_id((reiser4_plugin_id) type);
21211 +}
21212 +
21213 +static inline jnode_plugin *jnode_ops(const jnode * node)
21214 +{
21215 + assert("nikita-2366", node != NULL);
21216 +
21217 + return jnode_ops_of(jnode_get_type(node));
21218 +}
21219 +
21220 +/* Get the index of a block. */
21221 +static inline unsigned long jnode_get_index(jnode * node)
21222 +{
21223 + return jnode_ops(node)->index(node);
21224 +}
21225 +
21226 +/* return true if "node" is the root */
21227 +static inline int jnode_is_root(const jnode * node)
21228 +{
21229 + return jnode_is_znode(node) && znode_is_root(JZNODE(node));
21230 +}
21231 +
21232 +extern struct address_space *mapping_jnode(const jnode * node);
21233 +extern unsigned long index_jnode(const jnode * node);
21234 +
21235 +static inline void jput(jnode * node);
21236 +extern void jput_final(jnode * node);
21237 +
21238 +/* bump data counter on @node */
21239 +static inline void add_d_ref(jnode * node/* node to increase d_count of */)
21240 +{
21241 + assert("nikita-1962", node != NULL);
21242 +
21243 + atomic_inc(&node->d_count);
21244 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
21245 + LOCK_CNT_INC(d_refs);
21246 +}
21247 +
21248 +/* jput() - decrement x_count reference counter on jnode/znode.
21249 +
21250 + Count may drop to 0, jnode stays in cache until memory pressure causes the
21251 + eviction of its page. The c_count variable also ensures that children are
21252 + pressured out of memory before the parent. The jnode remains hashed as
21253 + long as the VM allows its page to stay in memory.
21254 +*/
21255 +static inline void jput(jnode * node)
21256 +{
21257 + assert("jmacd-509", node != NULL);
21258 + assert("jmacd-510", atomic_read(&node->x_count) > 0);
21259 + assert("zam-926", reiser4_schedulable());
21260 + LOCK_CNT_DEC(x_refs);
21261 +
21262 + rcu_read_lock();
21263 + /*
21264 + * we don't need any kind of lock here--jput_final() uses RCU.
21265 + */
21266 + if (unlikely(atomic_dec_and_test(&node->x_count)))
21267 + jput_final(node);
21268 + else
21269 + rcu_read_unlock();
21270 + assert("nikita-3473", reiser4_schedulable());
21271 +}
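
jref() and jput() bracket the x_count lifetime of a jnode. A hedged sketch of the pairing; work_on is a hypothetical caller, and the body between the calls stands for any use of the node:

static void work_on(jnode *node)
{
        jnode *ref = jref(node);  /* take a reference; returns @node */

        /* ... the jnode cannot be freed while we hold the reference ... */

        jput(ref);                /* drop it; may end up in jput_final() */
}
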
21272 +
21273 +extern void jrelse(jnode * node);
21274 +extern void jrelse_tail(jnode * node);
21275 +
21276 +extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
21277 +
21278 +/* resolve race with jput */
21279 +static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
21280 +{
21281 + if (unlikely(JF_ISSET(node, JNODE_RIP)))
21282 + node = jnode_rip_sync(tree, node);
21283 + return node;
21284 +}
21285 +
21286 +extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
21287 +
21288 +#if REISER4_DEBUG
21289 +extern int jnode_invariant_f(const jnode *node, char const **msg);
21290 +#endif
21291 +
21292 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
21293 +
21294 +/* __JNODE_H__ */
21295 +#endif
21296 +
21297 +/* Make Linus happy.
21298 + Local variables:
21299 + c-indentation-style: "K&R"
21300 + mode-name: "LC"
21301 + c-basic-offset: 8
21302 + tab-width: 8
21303 + fill-column: 120
21304 + End:
21305 +*/
21306 diff -urN linux-2.6.33.orig/fs/reiser4/kassign.c linux-2.6.33/fs/reiser4/kassign.c
21307 --- linux-2.6.33.orig/fs/reiser4/kassign.c 1970-01-01 01:00:00.000000000 +0100
21308 +++ linux-2.6.33/fs/reiser4/kassign.c 2010-03-04 19:33:22.000000000 +0100
21309 @@ -0,0 +1,677 @@
21310 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21311 + * reiser4/README */
21312 +
21313 +/* Key assignment policy implementation */
21314 +
21315 +/*
21316 + * In reiser4 every piece of file system data and meta-data has a key. Keys
21317 + * are used to store information in and retrieve it from reiser4 internal
21318 + * tree. In addition to this, keys define _ordering_ of all file system
21319 + * information: things having close keys are placed into the same or
21320 + * neighboring (in the tree order) nodes of the tree. As our block allocator
21321 + * tries to respect tree order (see flush.c), keys also define order in which
21322 + * things are laid out on the disk, and hence, affect performance directly.
21323 + *
21324 + * Obviously, assignment of keys to data and meta-data should be consistent
21325 + * across whole file system. Algorithm that calculates a key for a given piece
21326 + * of data or meta-data is referred to as "key assignment".
21327 + *
21328 + * Key assignment is too expensive to be implemented as a plugin (that is,
21329 + * with an ability to support different key assignment schemas in the same
21330 + * compiled kernel image). As a compromise, all key-assignment functions and
21331 + * data-structures are collected in this single file, so that modifications to
21332 + * key assignment algorithm can be localized. Additional changes may be
21333 + * required in key.[ch].
21334 + *
21335 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
21336 + * may guess, there is "Plan B" too.
21337 + *
21338 + */
21339 +
21340 +/*
21341 + * Additional complication with key assignment implementation is a requirement
21342 + * to support different key length.
21343 + */
21344 +
21345 +/*
21346 + * KEY ASSIGNMENT: PLAN A, LONG KEYS.
21347 + *
21348 + * DIRECTORY ITEMS
21349 + *
21350 + * | 60 | 4 | 7 |1| 56 | 64 | 64 |
21351 + * +--------------+---+---+-+-------------+------------------+-----------------+
21352 + * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
21353 + * +--------------+---+---+-+-------------+------------------+-----------------+
21354 + * | | | | |
21355 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21356 + *
21357 + * dirid objectid of directory this item is for
21358 + *
21359 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21360 + *
21361 + * H 1 if last 8 bytes of the key contain hash,
21362 + * 0 if last 8 bytes of the key contain prefix-3
21363 + *
21364 + * prefix-1 first 7 characters of file name.
21365 + * Padded by zeroes if name is not long enough.
21366 + *
21367 + * prefix-2 next 8 characters of the file name.
21368 + *
21369 + * prefix-3 next 8 characters of the file name.
21370 + *
21371 + * hash hash of the rest of file name (i.e., portion of file
21372 + * name not included into prefix-1 and prefix-2).
21373 + *
21374 + * File names no longer than 23 (== 7 + 8 + 8) characters are completely
21375 + * encoded in the key. Such file names are called "short". They are
21376 + * distinguished by the H bit being 0 in the key.
21377 + *
21378 + * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
21379 + * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
21380 + * key. Last 8 bytes of the key are occupied by hash of the remaining
21381 + * characters of the name.
21382 + *
21383 + * This key assignment reaches following important goals:
21384 + *
21385 + * (1) directory entries are sorted in approximately lexicographical
21386 + * order.
21387 + *
21388 + * (2) collisions (when multiple directory items have the same key), while
21389 + * principally unavoidable in a tree with fixed length keys, are rare.
21390 + *
21391 + * STAT DATA
21392 + *
21393 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21394 + * +--------------+---+-----------------+---+--------------+-----------------+
21395 + * | locality id | 1 | ordering | 0 | objectid | 0 |
21396 + * +--------------+---+-----------------+---+--------------+-----------------+
21397 + * | | | | |
21398 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21399 + *
21400 + * locality id object id of a directory where first name was created for
21401 + * the object
21402 + *
21403 + * ordering copy of second 8-byte portion of the key of directory
21404 + * entry for the first name of this object. Ordering has a form
21405 + * {
21406 + * fibration :7;
21407 + * h :1;
21408 + * prefix1 :56;
21409 + * }
21410 + * see description of key for directory entry above.
21411 + *
21412 + * objectid object id for this object
21413 + *
21414 + * This key assignment policy is designed to keep stat-data in the same order
21415 + * as corresponding directory items, thus speeding up readdir/stat types of
21416 + * workload.
21417 + *
21418 + * FILE BODY
21419 + *
21420 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21421 + * +--------------+---+-----------------+---+--------------+-----------------+
21422 + * | locality id | 4 | ordering | 0 | objectid | offset |
21423 + * +--------------+---+-----------------+---+--------------+-----------------+
21424 + * | | | | |
21425 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21426 + *
21427 + * locality id object id of a directory where first name was created for
21428 + * the object
21429 + *
21430 + * ordering the same as in the key of stat-data for this object
21431 + *
21432 + * objectid object id for this object
21433 + *
21434 + * offset logical offset from the beginning of this file.
21435 + * Measured in bytes.
21436 + *
21437 + *
21438 + * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21439 + *
21440 + * DIRECTORY ITEMS
21441 + *
21442 + * | 60 | 4 | 7 |1| 56 | 64 |
21443 + * +--------------+---+---+-+-------------+-----------------+
21444 + * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21445 + * +--------------+---+---+-+-------------+-----------------+
21446 + * | | | |
21447 + * | 8 bytes | 8 bytes | 8 bytes |
21448 + *
21449 + * dirid objectid of directory this item is for
21450 + *
21451 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21452 + *
21453 + * H 1 if last 8 bytes of the key contain hash,
21454 + * 0 if last 8 bytes of the key contain prefix-2
21455 + *
21456 + * prefix-1 first 7 characters of file name.
21457 + * Padded by zeroes if name is not long enough.
21458 + *
21459 + * prefix-2 next 8 characters of the file name.
21460 + *
21461 + * hash hash of the rest of file name (i.e., portion of file
21462 + * name not included into prefix-1).
21463 + *
21464 + * File names no longer than 15 (== 7 + 8) characters are completely encoded
21465 + * in the key. Such file names are called "short". They are distinguished by
21466 + * the H bit being 0 in the key.
21467 + *
21468 + * Other file names are "long". For a long name, the H bit is 1, and the first 7
21469 + * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21470 + * key are occupied by hash of the remaining characters of the name.
21471 + *
21472 + * STAT DATA
21473 + *
21474 + * | 60 | 4 | 4 | 60 | 64 |
21475 + * +--------------+---+---+--------------+-----------------+
21476 + * | locality id | 1 | 0 | objectid | 0 |
21477 + * +--------------+---+---+--------------+-----------------+
21478 + * | | | |
21479 + * | 8 bytes | 8 bytes | 8 bytes |
21480 + *
21481 + * locality id object id of a directory where first name was created for
21482 + * the object
21483 + *
21484 + * objectid object id for this object
21485 + *
21486 + * FILE BODY
21487 + *
21488 + * | 60 | 4 | 4 | 60 | 64 |
21489 + * +--------------+---+---+--------------+-----------------+
21490 + * | locality id | 4 | 0 | objectid | offset |
21491 + * +--------------+---+---+--------------+-----------------+
21492 + * | | | |
21493 + * | 8 bytes | 8 bytes | 8 bytes |
21494 + *
21495 + * locality id object id of a directory where first name was created for
21496 + * the object
21497 + *
21498 + * objectid object id for this object
21499 + *
21500 + * offset logical offset from the beginning of this file.
21501 + * Measured in bytes.
21502 + *
21503 + *
21504 + */
21505 +
21506 +#include "debug.h"
21507 +#include "key.h"
21508 +#include "kassign.h"
21509 +#include "vfs_ops.h"
21510 +#include "inode.h"
21511 +#include "super.h"
21512 +#include "dscale.h"
21513 +
21514 +#include <linux/types.h> /* for __u?? */
21515 +#include <linux/fs.h> /* for struct super_block, etc */
21516 +
21517 +/* bitmask for H bit (see comment at the beginning of this file) */
21518 +static const __u64 longname_mark = 0x0100000000000000ull;
21519 +/* bitmask for F and H portions of the key. */
21520 +static const __u64 fibration_mask = 0xff00000000000000ull;
21521 +
21522 +/* return true if name is not completely encoded in @key */
21523 +int is_longname_key(const reiser4_key * key)
21524 +{
21525 + __u64 highpart;
21526 +
21527 + assert("nikita-2863", key != NULL);
21528 + if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21529 + reiser4_print_key("oops", key);
21530 + assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21531 +
21532 + if (REISER4_LARGE_KEY)
21533 + highpart = get_key_ordering(key);
21534 + else
21535 + highpart = get_key_objectid(key);
21536 +
21537 + return (highpart & longname_mark) ? 1 : 0;
21538 +}
21539 +
21540 +/* return true if @name is too long to be completely encoded in the key */
21541 +int is_longname(const char *name UNUSED_ARG, int len)
21542 +{
21543 + if (REISER4_LARGE_KEY)
21544 + return len > 23;
21545 + else
21546 + return len > 15;
21547 +}
21548 +
21549 +/* encode ascii string into __u64.
21550 +
21551 + Put characters of @name into result (@str) one after another starting
21552 + from @start_idx-th highest (arithmetically) byte. This produces
21553 + endian-safe encoding. memcpy(3) will not do.
21554 +
21555 +*/
21556 +static __u64 pack_string(const char *name /* string to encode */ ,
21557 + int start_idx /* highest byte in result from
21558 + * which to start encoding */ )
21559 +{
21560 + unsigned i;
21561 + __u64 str;
21562 +
21563 + str = 0;
21564 + for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21565 + str <<= 8;
21566 + str |= (unsigned char)name[i];
21567 + }
21568 + str <<= (sizeof str - i - start_idx) << 3;
21569 + return str;
21570 +}
21571 +
21572 +/* opposite to pack_string(). Takes value produced by pack_string(), restores
21573 + * string encoded in it and stores result in @buf */
21574 +char *reiser4_unpack_string(__u64 value, char *buf)
21575 +{
21576 + do {
21577 + *buf = value >> (64 - 8);
21578 + if (*buf)
21579 + ++buf;
21580 + value <<= 8;
21581 + } while (value != 0);
21582 + *buf = 0;
21583 + return buf;
21584 +}
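
pack_string() and reiser4_unpack_string() are pure functions, so their behaviour can be checked in userspace. A standalone round-trip demo with the same logic, substituting uint64_t for __u64; pack/unpack are local copies, not the kernel symbols:

#include <stdint.h>
#include <stdio.h>

static uint64_t pack(const char *name, int start_idx)
{
        unsigned i;
        uint64_t str = 0;

        for (i = 0; i < sizeof str - start_idx && name[i]; ++i) {
                str <<= 8;
                str |= (unsigned char)name[i];
        }
        str <<= (sizeof str - i - start_idx) << 3;
        return str;
}

static void unpack(uint64_t value, char *buf)
{
        do {
                *buf = value >> (64 - 8);
                if (*buf)
                        ++buf;
                value <<= 8;
        } while (value != 0);
        *buf = 0;
}

int main(void)
{
        /* start_idx 1 leaves the top byte free for the fibration/H bits */
        uint64_t packed = pack("lost+fo", 1);
        char buf[9];

        unpack(packed, buf);
        printf("%#llx -> \"%s\"\n", (unsigned long long)packed, buf);
        return 0;
}
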
21585 +
21586 +/* obtain name encoded in @key and store it in @buf */
21587 +char *extract_name_from_key(const reiser4_key * key, char *buf)
21588 +{
21589 + char *c;
21590 +
21591 + assert("nikita-2868", !is_longname_key(key));
21592 +
21593 + c = buf;
21594 + if (REISER4_LARGE_KEY) {
21595 + c = reiser4_unpack_string(get_key_ordering(key) &
21596 + ~fibration_mask, c);
21597 + c = reiser4_unpack_string(get_key_fulloid(key), c);
21598 + } else
21599 + c = reiser4_unpack_string(get_key_fulloid(key) &
21600 + ~fibration_mask, c);
21601 + reiser4_unpack_string(get_key_offset(key), c);
21602 + return buf;
21603 +}
21604 +
21605 +/**
21606 + * complete_entry_key - calculate entry key by name
21607 + * @dir: directory the entry is (or will be) in
21608 + * @name: name to calculate key of
21609 + * @len: length of name
21610 + * @result: place to store result in
21611 + *
21612 + * Sets fields of entry key @result which depend on file name.
21613 + * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21614 + * objectid and offset. Otherwise, objectid and offset are set.
21615 + */
21616 +void complete_entry_key(const struct inode *dir, const char *name,
21617 + int len, reiser4_key *result)
21618 +{
21619 +#if REISER4_LARGE_KEY
21620 + __u64 ordering;
21621 + __u64 objectid;
21622 + __u64 offset;
21623 +
21624 + assert("nikita-1139", dir != NULL);
21625 + assert("nikita-1142", result != NULL);
21626 + assert("nikita-2867", strlen(name) == len);
21627 +
21628 + /*
21629 + * key allocation algorithm for directory entries in case of large
21630 + * keys:
21631 + *
21632 + * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
21633 + * characters into the ordering field of the key, the next 8 characters
21634 + * (if any) into the objectid field, and the next 8 (if any) into the
21635 + * offset field.
21636 + *
21637 + * If file name is longer than 23 characters, put first 7 characters
21638 + * into key's ordering, next 8 to objectid and hash of remaining
21639 + * characters into offset field.
21640 + *
21641 + * To distinguish the above cases, in the latter case the otherwise
21642 + * unused high bit of the ordering field is set.
21643 + */
21644 +
21645 + /* [0-6] characters to ordering */
21646 + ordering = pack_string(name, 1);
21647 + if (len > 7) {
21648 + /* [7-14] characters to objectid */
21649 + objectid = pack_string(name + 7, 0);
21650 + if (len > 15) {
21651 + if (len <= 23) {
21652 + /* [15-22] characters to offset */
21653 + offset = pack_string(name + 15, 0);
21654 + } else {
21655 + /* note in a key the fact that offset contains
21656 + * hash */
21657 + ordering |= longname_mark;
21658 +
21659 + /* offset is the hash of the file name's tail */
21660 + offset = inode_hash_plugin(dir)->hash(name + 15,
21661 + len - 15);
21662 + }
21663 + } else {
21664 + offset = 0ull;
21665 + }
21666 + } else {
21667 + objectid = 0ull;
21668 + offset = 0ull;
21669 + }
21670 +
21671 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21672 + ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21673 +
21674 + set_key_ordering(result, ordering);
21675 + set_key_fulloid(result, objectid);
21676 + set_key_offset(result, offset);
21677 + return;
21678 +
21679 +#else
21680 + __u64 objectid;
21681 + __u64 offset;
21682 +
21683 + assert("nikita-1139", dir != NULL);
21684 + assert("nikita-1142", result != NULL);
21685 + assert("nikita-2867", strlen(name) == len);
21686 +
21687 + /*
21688 + * key allocation algorithm for directory entries in case of not large
21689 + * keys:
21690 + *
21691 + * If name is not longer than 7 + 8 = 15 characters, put first 7
21692 + * characters into the objectid field of the key, and the next 8
21693 + * characters (if any) into the offset field.
21694 + *
21695 + * If file name is longer than 15 characters, put first 7 characters
21696 + * into key's objectid, and hash of remaining characters into offset
21697 + * field.
21698 + *
21699 + * To distinguish the above cases, in the latter case the otherwise
21700 + * unused high bit of the objectid field is set.
21701 + */
21702 +
21703 + /* [0-6] characters to objectid */
21704 + objectid = pack_string(name, 1);
21705 + if (len > 7) {
21706 + if (len <= 15) {
21707 + /* [7-14] characters to offset */
21708 + offset = pack_string(name + 7, 0);
21709 + } else {
21710 + /* note in a key the fact that offset contains hash. */
21711 + objectid |= longname_mark;
21712 +
21713 + /* offset is the hash of the file name. */
21714 + offset = inode_hash_plugin(dir)->hash(name + 7,
21715 + len - 7);
21716 + }
21717 + } else
21718 + offset = 0ull;
21719 +
21720 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21721 + objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21722 +
21723 + set_key_fulloid(result, objectid);
21724 + set_key_offset(result, offset);
21725 + return;
21726 +#endif /* ! REISER4_LARGE_KEY */
21727 +}
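
A worked example of the large-key branch above, in the file's own comment style; the fibre bits come from the directory's fibration plugin and are left symbolic:

/* worked example (large keys), name = "lost+found", len = 10:
 *
 *   ordering = pack_string("lost+fo", 1) | fibre bits    (chars 0-6)
 *   objectid = pack_string("und", 0)                     (chars 7-9)
 *   offset   = 0                                         (len <= 15)
 *
 * a 30-character name would instead set longname_mark in ordering and
 * store hash(name + 15, 15) in offset.
 */
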
21728 +
21729 +/* true, if @key is the key of "." */
21730 +int is_dot_key(const reiser4_key * key/* key to check */)
21731 +{
21732 + assert("nikita-1717", key != NULL);
21733 + assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21734 + return
21735 + (get_key_ordering(key) == 0ull) &&
21736 + (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21737 +}
21738 +
21739 +/* build key for stat-data.
21740 +
21741 + return key of stat-data of this object. This should become an sd plugin
21742 + method in the future. For now, let it be here.
21743 +
21744 +*/
21745 +reiser4_key *build_sd_key(const struct inode *target /* inode of an object */ ,
21746 + reiser4_key * result /* resulting key of @target
21747 + stat-data */ )
21748 +{
21749 + assert("nikita-261", result != NULL);
21750 +
21751 + reiser4_key_init(result);
21752 + set_key_locality(result, reiser4_inode_data(target)->locality_id);
21753 + set_key_ordering(result, get_inode_ordering(target));
21754 + set_key_objectid(result, get_inode_oid(target));
21755 + set_key_type(result, KEY_SD_MINOR);
21756 + set_key_offset(result, (__u64) 0);
21757 + return result;
21758 +}
21759 +
21760 +/* encode part of key into &obj_key_id
21761 +
21762 + This encodes into @id part of @key sufficient to restore @key later,
21763 + given that the latter is the key of an object (key of stat-data).
21764 +
21765 + See &obj_key_id
21766 +*/
21767 +int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21768 + obj_key_id * id/* id where key is encoded in */)
21769 +{
21770 + assert("nikita-1151", key != NULL);
21771 + assert("nikita-1152", id != NULL);
21772 +
21773 + memcpy(id, key, sizeof *id);
21774 + return 0;
21775 +}
21776 +
21777 +/* encode reference to @obj in @id.
21778 +
21779 + This is like build_obj_key_id() above, but takes inode as parameter. */
21780 +int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21781 + obj_key_id * id/* result */)
21782 +{
21783 + reiser4_key sdkey;
21784 +
21785 + assert("nikita-1166", obj != NULL);
21786 + assert("nikita-1167", id != NULL);
21787 +
21788 + build_sd_key(obj, &sdkey);
21789 + build_obj_key_id(&sdkey, id);
21790 + return 0;
21791 +}
21792 +
21793 +/* decode @id back into @key
21794 +
21795 + Restore key of object stat-data from @id. This is dual to
21796 + build_obj_key_id() above.
21797 +*/
21798 +int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21799 + * from */ ,
21800 + reiser4_key * key/* result */)
21801 +{
21802 + assert("nikita-1153", id != NULL);
21803 + assert("nikita-1154", key != NULL);
21804 +
21805 + reiser4_key_init(key);
21806 + memcpy(key, id, sizeof *id);
21807 + return 0;
21808 +}
21809 +
21810 +/* extract objectid of directory from key of directory entry within said
21811 + directory.
21812 + */
21813 +oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21814 + * directory
21815 + * entry */ )
21816 +{
21817 + assert("nikita-1314", de_key != NULL);
21818 + return get_key_locality(de_key);
21819 +}
21820 +
21821 +/* encode into @id key of directory entry.
21822 +
21823 + Encode into @id information sufficient to later distinguish directory
21824 + entries within the same directory. This is not whole key, because all
21825 + directory entries within directory item share locality which is equal
21826 + to objectid of their directory.
21827 +
21828 +*/
21829 +int build_de_id(const struct inode *dir /* inode of directory */ ,
21830 + const struct qstr *name /* name to be given to @obj by
21831 + * directory entry being
21832 + * constructed */ ,
21833 + de_id * id/* short key of directory entry */)
21834 +{
21835 + reiser4_key key;
21836 +
21837 + assert("nikita-1290", dir != NULL);
21838 + assert("nikita-1292", id != NULL);
21839 +
21840 + /* NOTE-NIKITA this is suboptimal. */
21841 + inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21842 + return build_de_id_by_key(&key, id);
21843 +}
21844 +
21845 +/* encode into @id key of directory entry.
21846 +
21847 + Encode into @id information sufficient to later distinguish directory
21848 + entries within the same directory. This is not whole key, because all
21849 + directory entries within directory item share locality which is equal
21850 + to objectid of their directory.
21851 +
21852 +*/
21853 +int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21854 + * entry */ ,
21855 + de_id * id/* short key of directory entry */)
21856 +{
21857 + memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21858 + return 0;
21859 +}
21860 +
21861 +/* restore from @id key of directory entry.
21862 +
21863 + Function dual to build_de_id(): given @id and locality, build full
21864 + key of directory entry within directory item.
21865 +
21866 +*/
21867 +int extract_key_from_de_id(const oid_t locality /* locality of directory
21868 + * entry */ ,
21869 + const de_id * id /* directory entry id */ ,
21870 + reiser4_key * key/* result */)
21871 +{
21872 + /* no need to initialise key here: all fields are overwritten */
21873 + memcpy(((__u64 *) key) + 1, id, sizeof *id);
21874 + set_key_locality(key, locality);
21875 + set_key_type(key, KEY_FILE_NAME_MINOR);
21876 + return 0;
21877 +}
21878 +
21879 +/* compare two &de_id's */
21880 +cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21881 + const de_id * id2/* second &de_id to compare */)
21882 +{
21883 + /* NOTE-NIKITA ugly implementation */
21884 + reiser4_key k1;
21885 + reiser4_key k2;
21886 +
21887 + extract_key_from_de_id((oid_t) 0, id1, &k1);
21888 + extract_key_from_de_id((oid_t) 0, id2, &k2);
21889 + return keycmp(&k1, &k2);
21890 +}
21891 +
21892 +/* compare &de_id with key */
21893 +cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21894 + const reiser4_key * key/* key to compare */)
21895 +{
21896 + cmp_t result;
21897 + reiser4_key *k1;
21898 +
21899 + k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21900 + result = KEY_DIFF_EL(k1, key, 1);
21901 + if (result == EQUAL_TO) {
21902 + result = KEY_DIFF_EL(k1, key, 2);
21903 + if (REISER4_LARGE_KEY && result == EQUAL_TO)
21904 + result = KEY_DIFF_EL(k1, key, 3);
21905 + }
21906 + return result;
21907 +}
21908 +
21909 +/*
21910 + * return number of bytes necessary to encode @inode identity.
21911 + */
21912 +int inode_onwire_size(const struct inode *inode)
21913 +{
21914 + int result;
21915 +
21916 + result = dscale_bytes_to_write(get_inode_oid(inode));
21917 + result += dscale_bytes_to_write(get_inode_locality(inode));
21918 +
21919 + /*
21920 + * ordering is large (it usually has highest bits set), so it makes
21921 + * little sense to dscale it.
21922 + */
21923 + if (REISER4_LARGE_KEY)
21924 + result += sizeof(get_inode_ordering(inode));
21925 + return result;
21926 +}
21927 +
21928 +/*
21929 + * encode @inode identity at @start
21930 + */
21931 +char *build_inode_onwire(const struct inode *inode, char *start)
21932 +{
21933 + start += dscale_write(start, get_inode_locality(inode));
21934 + start += dscale_write(start, get_inode_oid(inode));
21935 +
21936 + if (REISER4_LARGE_KEY) {
21937 + put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21938 + start += sizeof(get_inode_ordering(inode));
21939 + }
21940 + return start;
21941 +}
21942 +
21943 +/*
21944 + * extract key that was previously encoded by build_inode_onwire() at @addr
21945 + */
21946 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21947 +{
21948 + __u64 val;
21949 +
21950 + addr += dscale_read(addr, &val);
21951 + val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21952 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21953 + addr += dscale_read(addr, &val);
21954 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21955 +#if REISER4_LARGE_KEY
21956 + memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21957 + addr += sizeof key_id->ordering;
21958 +#endif
21959 + return addr;
21960 +}
21961 +
21962 +/*
21963 + * skip a key that was previously encoded by build_inode_onwire() at @addr
21964 + * FIXME: handle IO errors.
21965 + */
21966 +char * locate_obj_key_id_onwire(char * addr)
21967 +{
21968 + /* locality */
21969 + addr += dscale_bytes_to_read(addr);
21970 + /* objectid */
21971 + addr += dscale_bytes_to_read(addr);
21972 +#if REISER4_LARGE_KEY
21973 + addr += sizeof ((obj_key_id *)0)->ordering;
21974 +#endif
21975 + return addr;
21976 +}
21977 +
21978 +/* Make Linus happy.
21979 + Local variables:
21980 + c-indentation-style: "K&R"
21981 + mode-name: "LC"
21982 + c-basic-offset: 8
21983 + tab-width: 8
21984 + fill-column: 120
21985 + End:
21986 +*/
21987 diff -urN linux-2.6.33.orig/fs/reiser4/kassign.h linux-2.6.33/fs/reiser4/kassign.h
21988 --- linux-2.6.33.orig/fs/reiser4/kassign.h 1970-01-01 01:00:00.000000000 +0100
21989 +++ linux-2.6.33/fs/reiser4/kassign.h 2010-03-04 19:33:22.000000000 +0100
21990 @@ -0,0 +1,111 @@
21991 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21992 + * reiser4/README */
21993 +
21994 +/* Key assignment policy interface. See kassign.c for details. */
21995 +
21996 +#if !defined(__KASSIGN_H__)
21997 +#define __KASSIGN_H__
21998 +
21999 +#include "forward.h"
22000 +#include "key.h"
22001 +#include "dformat.h"
22002 +
22003 +#include <linux/types.h> /* for __u?? */
22004 +#include <linux/fs.h> /* for struct super_block, etc */
22005 +#include <linux/dcache.h> /* for struct qstr */
22006 +
22007 +/* key assignment functions */
22008 +
22009 +/* Information from which key of file stat-data can be uniquely
22010 + restored. This depends on key assignment policy for
22011 + stat-data. Currently it's enough to store object id and locality id
22012 + (60 + 60 == 120 bits), because minor packing locality and offset of
22013 + stat-data key are always known constants: KEY_SD_MINOR and 0
22014 + respectively. For simplicity 4 bits are wasted in each id, and just
22015 + two 64 bit integers are stored.
22016 +
22017 + This field has to be byte-aligned, because we don't want to waste
22018 + space in directory entries. There is another side to this coin, of
22019 + course: we waste CPU and bus bandwidth instead, by copying data back
22020 + and forth.
22021 +
22022 + Next optimization: &obj_key_id is mainly used to address stat data from
22023 + directory entries. Under the assumption that the majority of files have
22024 + only one name (one hard link) from *the* parent directory, it seems
22025 + reasonable to store only the objectid of the stat data and take its
22026 + locality from the key of the directory item.
22027 +
22028 + This requires some flag to be added to the &obj_key_id to distinguish
22029 + between these two cases. The remaining bits in the flag byte could then be
22030 + used to store the file type.
22031 +
22032 + This optimization requires changes in directory item handling code.
22033 +
22034 +*/
22035 +typedef struct obj_key_id {
22036 + d8 locality[sizeof(__u64)];
22037 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
22038 + )
22039 + d8 objectid[sizeof(__u64)];
22040 +}
22041 +obj_key_id;
22042 +
22043 +/* Information sufficient to uniquely identify directory entry within
22044 + compressed directory item.
22045 +
22046 + For alignment issues see &obj_key_id above.
22047 +*/
22048 +typedef struct de_id {
22049 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
22050 + d8 objectid[sizeof(__u64)];
22051 + d8 offset[sizeof(__u64)];
22052 +}
22053 +de_id;
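
Both structures are byte arrays precisely so that they stay packed and byte-aligned, as the comment above explains. A standalone sketch mirroring the layouts in userspace to show the resulting sizes; d8 is taken to be a plain byte and REISER4_LARGE_KEY is assumed on:

#include <stdint.h>
#include <stdio.h>

typedef uint8_t d8;

/* userspace mirrors of the on-disk layouts, large keys assumed */
typedef struct { d8 locality[8]; d8 ordering[8]; d8 objectid[8]; } obj_key_id;
typedef struct { d8 ordering[8]; d8 objectid[8]; d8 offset[8]; } de_id;

int main(void)
{
        /* byte arrays carry no alignment requirement, so no padding */
        printf("obj_key_id: %zu bytes, de_id: %zu bytes\n",
               sizeof(obj_key_id), sizeof(de_id));
        return 0;
}
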
22054 +
22055 +extern int inode_onwire_size(const struct inode *obj);
22056 +extern char *build_inode_onwire(const struct inode *obj, char *area);
22057 +extern char *locate_obj_key_id_onwire(char *area);
22058 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
22059 +
22060 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
22061 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
22062 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
22063 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
22064 +extern int build_de_id(const struct inode *dir, const struct qstr *name,
22065 + de_id * id);
22066 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
22067 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
22068 + reiser4_key * key);
22069 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
22070 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
22071 +
22072 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
22073 +extern void build_entry_key_common(const struct inode *dir,
22074 + const struct qstr *name,
22075 + reiser4_key * result);
22076 +extern void build_entry_key_stable_entry(const struct inode *dir,
22077 + const struct qstr *name,
22078 + reiser4_key * result);
22079 +extern int is_dot_key(const reiser4_key * key);
22080 +extern reiser4_key *build_sd_key(const struct inode *target,
22081 + reiser4_key * result);
22082 +
22083 +extern int is_longname_key(const reiser4_key * key);
22084 +extern int is_longname(const char *name, int len);
22085 +extern char *extract_name_from_key(const reiser4_key * key, char *buf);
22086 +extern char *reiser4_unpack_string(__u64 value, char *buf);
22087 +extern void complete_entry_key(const struct inode *dir, const char *name,
22088 + int len, reiser4_key *result);
22089 +
22090 +/* __KASSIGN_H__ */
22091 +#endif
22092 +
22093 +/* Make Linus happy.
22094 + Local variables:
22095 + c-indentation-style: "K&R"
22096 + mode-name: "LC"
22097 + c-basic-offset: 8
22098 + tab-width: 8
22099 + fill-column: 120
22100 + End:
22101 +*/
22102 diff -urN linux-2.6.33.orig/fs/reiser4/Kconfig linux-2.6.33/fs/reiser4/Kconfig
22103 --- linux-2.6.33.orig/fs/reiser4/Kconfig 1970-01-01 01:00:00.000000000 +0100
22104 +++ linux-2.6.33/fs/reiser4/Kconfig 2010-03-04 19:33:22.000000000 +0100
22105 @@ -0,0 +1,34 @@
22106 +config REISER4_FS
22107 + tristate "Reiser4 (EXPERIMENTAL)"
22108 + depends on EXPERIMENTAL
22109 + select ZLIB_INFLATE
22110 + select ZLIB_DEFLATE
22111 + select LZO_COMPRESS
22112 + select LZO_DECOMPRESS
22113 + select CRYPTO
22114 + help
22115 + Reiser4 is a filesystem that performs all filesystem operations
22116 + as atomic transactions: a write either completes in full or not
22117 + at all, so a crash cannot leave the filesystem partially updated
22118 + or corrupted.
22119 +
22120 + It stores files in dancing trees, which are like balanced trees but
22121 + faster. It packs small files together so that they share blocks
22122 + without wasting space, which makes storing very small files
22123 + efficient and saves disk space. It avoids
22124 + hassling you with anachronisms like having a maximum number of
22125 + inodes, and wasting space if you use less than that number.
22126 +
22127 + Reiser4 is a distinct filesystem type from reiserfs (V3).
22128 + It's therefore not possible to use reiserfs file systems
22129 + with reiser4.
22130 +
22131 + To learn more about reiser4, go to http://www.namesys.com
22132 +
22133 +config REISER4_DEBUG
22134 + bool "Enable reiser4 debug mode"
22135 + depends on REISER4_FS
22136 + help
22137 + Don't use this unless you are debugging reiser4.
22138 +
22139 + If unsure, say N.
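
With this Kconfig hunk applied, a 2.6.33 build might enable the filesystem with a .config fragment like the following (illustrative; the select'ed zlib, LZO and crypto options are pulled in automatically):

# illustrative .config fragment, assuming EXPERIMENTAL is on
CONFIG_EXPERIMENTAL=y
CONFIG_REISER4_FS=m
# CONFIG_REISER4_DEBUG is not set
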
22140 diff -urN linux-2.6.33.orig/fs/reiser4/key.c linux-2.6.33/fs/reiser4/key.c
22141 --- linux-2.6.33.orig/fs/reiser4/key.c 1970-01-01 01:00:00.000000000 +0100
22142 +++ linux-2.6.33/fs/reiser4/key.c 2010-03-04 19:33:22.000000000 +0100
22143 @@ -0,0 +1,138 @@
22144 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22145 + * reiser4/README */
22146 +
22147 +/* Key manipulations. */
22148 +
22149 +#include "debug.h"
22150 +#include "key.h"
22151 +#include "super.h"
22152 +#include "reiser4.h"
22153 +
22154 +#include <linux/types.h> /* for __u?? */
22155 +
22156 +/* Minimal possible key: all components are zero. It is presumed that this is
22157 + independent of key scheme. */
22158 +static const reiser4_key MINIMAL_KEY = {
22159 + .el = {
22160 + 0ull,
22161 + ON_LARGE_KEY(0ull,)
22162 + 0ull,
22163 + 0ull
22164 + }
22165 +};
22166 +
22167 +/* Maximal possible key: all components are ~0. It is presumed that this is
22168 + independent of key scheme. */
22169 +static const reiser4_key MAXIMAL_KEY = {
22170 + .el = {
22171 + __constant_cpu_to_le64(~0ull),
22172 + ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
22173 + __constant_cpu_to_le64(~0ull),
22174 + __constant_cpu_to_le64(~0ull)
22175 + }
22176 +};
22177 +
22178 +/* Initialize key. */
22179 +void reiser4_key_init(reiser4_key * key/* key to init */)
22180 +{
22181 + assert("nikita-1169", key != NULL);
22182 + memset(key, 0, sizeof *key);
22183 +}
22184 +
22185 +/* minimal possible key in the tree. Return pointer to the static storage. */
22186 +const reiser4_key * reiser4_min_key(void)
22187 +{
22188 + return &MINIMAL_KEY;
22189 +}
22190 +
22191 +/* maximum possible key in the tree. Return pointer to the static storage. */
22192 +const reiser4_key * reiser4_max_key(void)
22193 +{
22194 + return &MAXIMAL_KEY;
22195 +}
22196 +
22197 +#if REISER4_DEBUG
22198 +/* debugging aid: print symbolic name of key type */
22199 +static const char *type_name(unsigned int key_type/* key type */)
22200 +{
22201 + switch (key_type) {
22202 + case KEY_FILE_NAME_MINOR:
22203 + return "file name";
22204 + case KEY_SD_MINOR:
22205 + return "stat data";
22206 + case KEY_ATTR_NAME_MINOR:
22207 + return "attr name";
22208 + case KEY_ATTR_BODY_MINOR:
22209 + return "attr body";
22210 + case KEY_BODY_MINOR:
22211 + return "file body";
22212 + default:
22213 + return "unknown";
22214 + }
22215 +}
22216 +
22217 +/* debugging aid: print human readable information about key */
22218 +void reiser4_print_key(const char *prefix /* prefix to print */ ,
22219 + const reiser4_key * key/* key to print */)
22220 +{
22221 + /* turn bold on */
22222 + /* printf ("\033[1m"); */
22223 + if (key == NULL)
22224 + printk("%s: null key\n", prefix);
22225 + else {
22226 + if (REISER4_LARGE_KEY)
22227 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
22228 + get_key_locality(key),
22229 + get_key_type(key),
22230 + get_key_ordering(key),
22231 + get_key_band(key),
22232 + get_key_objectid(key), get_key_offset(key));
22233 + else
22234 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
22235 + get_key_locality(key),
22236 + get_key_type(key),
22237 + get_key_band(key),
22238 + get_key_objectid(key), get_key_offset(key));
22239 + /*
22240 + * if this is a key of directory entry, try to decode part of
22241 + * a name stored in the key, and output it.
22242 + */
22243 + if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
22244 + char buf[DE_NAME_BUF_LEN];
22245 + char *c;
22246 +
22247 + c = buf;
22248 + c = reiser4_unpack_string(get_key_ordering(key), c);
22249 + reiser4_unpack_string(get_key_fulloid(key), c);
22250 + printk("[%s", buf);
22251 + if (is_longname_key(key))
22252 + /*
22253 + * only part of the name is stored in the key.
22254 + */
22255 + printk("...]\n");
22256 + else {
22257 + /*
22258 + * whole name is stored in the key.
22259 + */
22260 + reiser4_unpack_string(get_key_offset(key), buf);
22261 + printk("%s]\n", buf);
22262 + }
22263 + } else {
22264 + printk("[%s]\n", type_name(get_key_type(key)));
22265 + }
22266 + }
22267 + /* turn bold off */
22268 + /* printf ("\033[m\017"); */
22269 +}
22270 +
22271 +#endif
22272 +
22273 +/* Make Linus happy.
22274 + Local variables:
22275 + c-indentation-style: "K&R"
22276 + mode-name: "LC"
22277 + c-basic-offset: 8
22278 + tab-width: 8
22279 + fill-column: 120
22280 + End:
22281 +*/
22282 diff -urN linux-2.6.33.orig/fs/reiser4/key.h linux-2.6.33/fs/reiser4/key.h
22283 --- linux-2.6.33.orig/fs/reiser4/key.h 1970-01-01 01:00:00.000000000 +0100
22284 +++ linux-2.6.33/fs/reiser4/key.h 2010-03-04 19:33:22.000000000 +0100
22285 @@ -0,0 +1,392 @@
22286 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by
22287 + * reiser4/README */
22288 +
22289 +/* Declarations of key-related data-structures and operations on keys. */
22290 +
22291 +#if !defined(__REISER4_KEY_H__)
22292 +#define __REISER4_KEY_H__
22293 +
22294 +#include "dformat.h"
22295 +#include "forward.h"
22296 +#include "debug.h"
22297 +
22298 +#include <linux/types.h> /* for __u?? */
22299 +
22300 +/* Operations on keys in reiser4 tree */
22301 +
22302 +/* No access to any of these fields shall be done except via a
22303 + wrapping macro/function, and that wrapping macro/function shall
22304 + convert to little endian order. Key comparison operates on cpu byte order. */
22305 +
22306 +/* A storage layer implementation difference between a regular unix file body
22307 + and its attributes is in the typedef below which causes all of the attributes
22308 + of a file to be near in key to all of the other attributes for all of the
22309 + files within that directory, and not near to the file itself. It is
22310 + interesting to consider whether this is the wrong approach, and whether there
22311 + should be no difference at all. For current usage patterns this choice is
22312 + probably the right one. */
22313 +
22314 +/* possible values for minor packing locality (4 bits required) */
22315 +typedef enum {
22316 + /* file name */
22317 + KEY_FILE_NAME_MINOR = 0,
22318 + /* stat-data */
22319 + KEY_SD_MINOR = 1,
22320 + /* file attribute name */
22321 + KEY_ATTR_NAME_MINOR = 2,
22322 + /* file attribute value */
22323 + KEY_ATTR_BODY_MINOR = 3,
22324 + /* file body (tail or extent) */
22325 + KEY_BODY_MINOR = 4,
22326 +} key_minor_locality;
22327 +
22328 +/* Everything stored in the tree has a unique key, which means that the tree is
22329 + (logically) fully ordered by key. Physical order is determined by dynamic
22330 + heuristics that attempt to reflect key order when allocating available space,
22331 + and by the repacker. It is stylistically better to put aggregation
22332 + information into the key. Thus, if you want to segregate extents from tails,
22333 + it is better to give them distinct minor packing localities rather than
22334 + changing block_alloc.c to check the node type when deciding where to allocate
22335 + the node.
22336 +
22337 + The need to randomly displace new directories and large files disturbs this
22338 + symmetry unfortunately. However, it should be noted that this is a need that
22339 + is not clearly established given the existence of a repacker. Also, in our
22340 + current implementation tails have a different minor packing locality from
22341 + extents, and no files have both extents and tails, so maybe symmetry can be
22342 + had without performance cost after all. Symmetry is what we ship for now....
22343 +*/
22344 +
22345 +/* Arbitrary major packing localities can be assigned to objects using
22346 + the reiser4(filenameA/..packing<=some_number) system call.
22347 +
22348 + In reiser4, the creat() syscall creates a directory
22349 +
22350 + whose default flow (that which is referred to if the directory is
22351 + read as a file) is the traditional unix file body.
22352 +
22353 + whose directory plugin is the 'filedir'
22354 +
22355 + whose major packing locality is that of the parent of the object created.
22356 +
22357 + The static_stat item is a particular commonly used directory
22358 + compression (the one for normal unix files).
22359 +
22360 + The filedir plugin checks to see if the static_stat item exists.
22361 + There is a unique key for static_stat. If yes, then it uses the
22362 + static_stat item for all of the values that it contains. The
22363 + static_stat item contains a flag for each stat it contains which
22364 + indicates whether one should look outside the static_stat item for its
22365 + contents.
22366 +*/
22367 +
22368 +/* offset of fields in reiser4_key. Value of each element of this enum
22369 + is the index within the key (thought of as an array of __u64's) where this field
22370 + is. */
22371 +typedef enum {
22372 + /* major "locale", aka dirid. Sits in 1st element */
22373 + KEY_LOCALITY_INDEX = 0,
22374 + /* minor "locale", aka item type. Sits in 1st element */
22375 + KEY_TYPE_INDEX = 0,
22376 + ON_LARGE_KEY(KEY_ORDERING_INDEX,)
22377 + /* "object band". Sits in 2nd element */
22378 + KEY_BAND_INDEX,
22379 + /* objectid. Sits in 2nd element */
22380 + KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
22381 + /* full objectid. Sits in 2nd element */
22382 + KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22383 + /* Offset. Sits in 3rd element */
22384 + KEY_OFFSET_INDEX,
22385 + /* Name hash. Sits in 3rd element */
22386 + KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22387 + KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22388 + KEY_LAST_INDEX
22389 +} reiser4_key_field_index;
22390 +
22391 +/* key in reiser4 internal "balanced" tree. It is just an array of three
22392 + (four, with large keys) 64bit integers in disk byte order (little-endian
22393 + by default). This array is actually indexed by reiser4_key_field. Each
22394 + __u64 within this array is called an "element". Logical key components
22395 + encoded within elements are called "fields".
22396 +
22397 + We declare this as union with second component dummy to suppress
22398 + inconvenient array<->pointer casts implied in C. */
22399 +union reiser4_key {
22400 + __le64 el[KEY_LAST_INDEX];
22401 + int pad;
22402 +};
22403 +
22404 +/* bitmasks showing where within reiser4_key particular key is stored. */
22405 +/* major locality occupies higher 60 bits of the first element */
22406 +#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22407 +
22408 +/* minor locality occupies lower 4 bits of the first element */
22409 +#define KEY_TYPE_MASK 0xfull
22410 +
22411 +/* controversial band occupies higher 4 bits of the 2nd element */
22412 +#define KEY_BAND_MASK 0xf000000000000000ull
22413 +
22414 +/* objectid occupies lower 60 bits of the 2nd element */
22415 +#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22416 +
22417 +/* full 64bit objectid */
22418 +#define KEY_FULLOID_MASK 0xffffffffffffffffull
22419 +
22420 +/* offset is the whole 3rd element */
22421 +#define KEY_OFFSET_MASK 0xffffffffffffffffull
22422 +
22423 +/* ordering is whole second element */
22424 +#define KEY_ORDERING_MASK 0xffffffffffffffffull
22425 +
22426 +/* by how many bits a field value is shifted to the left within its key
22427 + element */
22428 +typedef enum {
22429 + KEY_LOCALITY_SHIFT = 4,
22430 + KEY_TYPE_SHIFT = 0,
22431 + KEY_BAND_SHIFT = 60,
22432 + KEY_OBJECTID_SHIFT = 0,
22433 + KEY_FULLOID_SHIFT = 0,
22434 + KEY_OFFSET_SHIFT = 0,
22435 + KEY_ORDERING_SHIFT = 0,
22436 +} reiser4_key_field_shift;
22437 +
22438 +static inline __u64
22439 +get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22440 +{
22441 + assert("nikita-753", key != NULL);
22442 + assert("nikita-754", off < KEY_LAST_INDEX);
22443 + return le64_to_cpu(get_unaligned(&key->el[off]));
22444 +}
22445 +
22446 +static inline void
22447 +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22448 +{
22449 + assert("nikita-755", key != NULL);
22450 + assert("nikita-756", off < KEY_LAST_INDEX);
22451 + put_unaligned(cpu_to_le64(value), &key->el[off]);
22452 +}
22453 +
22454 +/* macro to define getter and setter functions for field L (uppercase name U) with type T */
22455 +#define DEFINE_KEY_FIELD(L, U, T) \
22456 +static inline T get_key_ ## L(const reiser4_key *key) \
22457 +{ \
22458 + assert("nikita-750", key != NULL); \
22459 + return (T) (get_key_el(key, KEY_ ## U ## _INDEX) & \
22460 + KEY_ ## U ## _MASK) >> KEY_ ## U ## _SHIFT; \
22461 +} \
22462 + \
22463 +static inline void set_key_ ## L(reiser4_key * key, T loc) \
22464 +{ \
22465 + __u64 el; \
22466 + \
22467 + assert("nikita-752", key != NULL); \
22468 + \
22469 + el = get_key_el(key, KEY_ ## U ## _INDEX); \
22470 + /* clear field bits in the key */ \
22471 + el &= ~KEY_ ## U ## _MASK; \
22472 + /* actually it should be \
22473 + \
22474 + el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22475 + \
22476 + but we trust user to never pass values that wouldn't fit \
22477 + into field. Clearing extra bits is one operation, but this \
22478 + function is time-critical. \
22479 + But check this in assertion. */ \
22480 + assert("nikita-759", ((loc << KEY_ ## U ## _SHIFT) & \
22481 + ~KEY_ ## U ## _MASK) == 0); \
22482 + el |= (loc << KEY_ ## U ## _SHIFT); \
22483 + set_key_el(key, KEY_ ## U ## _INDEX, el); \
22484 +}
22485 +
22486 +typedef __u64 oid_t;
22487 +
22488 +/* define get_key_locality(), set_key_locality() */
22489 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22490 +/* define get_key_type(), set_key_type() */
22491 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22492 +/* define get_key_band(), set_key_band() */
22493 +DEFINE_KEY_FIELD(band, BAND, __u64);
22494 +/* define get_key_objectid(), set_key_objectid() */
22495 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22496 +/* define get_key_fulloid(), set_key_fulloid() */
22497 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22498 +/* define get_key_offset(), set_key_offset() */
22499 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22500 +#if (REISER4_LARGE_KEY)
22501 +/* define get_key_ordering(), set_key_ordering() */
22502 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22503 +#else
22504 +static inline __u64 get_key_ordering(const reiser4_key * key)
22505 +{
22506 + return 0;
22507 +}
22508 +
22509 +static inline void set_key_ordering(reiser4_key * key, __u64 val)
22510 +{
22511 +}
22512 +#endif
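
Each DEFINE_KEY_FIELD expansion reduces to a mask-and-shift on one element. A standalone demo of the first element's layout (60-bit locality in the high bits, 4-bit type in the low bits), using the mask and shift values defined above; note it works on a CPU-order value, whereas real keys would go through le64_to_cpu() first:

#include <stdint.h>
#include <stdio.h>

#define LOCALITY_MASK 0xfffffffffffffff0ull   /* KEY_LOCALITY_MASK */
#define TYPE_MASK     0xfull                  /* KEY_TYPE_MASK */

int main(void)
{
        uint64_t dirid = 0x2a;            /* sample 60-bit locality */
        uint64_t el0 = (dirid << 4) | 1;  /* 1 == KEY_SD_MINOR */

        printf("locality=%llx type=%llx\n",
               (unsigned long long)((el0 & LOCALITY_MASK) >> 4),
               (unsigned long long)(el0 & TYPE_MASK));
        return 0;
}
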
22513 +
22514 +/* key comparison result */
22515 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
22516 + EQUAL_TO = 0, /* if keys are equal */
22517 + GREATER_THAN = +1 /* if first key is greater than second */
22518 +} cmp_t;
22519 +
22520 +void reiser4_key_init(reiser4_key * key);
22521 +
22522 +/* minimal/maximal possible keys in the tree. Return pointers to static storage. */
22523 +extern const reiser4_key *reiser4_min_key(void);
22524 +extern const reiser4_key *reiser4_max_key(void);
22525 +
22526 +/* helper macro for keycmp() */
22527 +#define KEY_DIFF(k1, k2, field) \
22528 +({ \
22529 + typeof(get_key_ ## field(k1)) f1; \
22530 + typeof(get_key_ ## field(k2)) f2; \
22531 + \
22532 + f1 = get_key_ ## field(k1); \
22533 + f2 = get_key_ ## field(k2); \
22534 + \
22535 + (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22536 +})
22537 +
22538 +/* helper macro for keycmp() */
22539 +#define KEY_DIFF_EL(k1, k2, off) \
22540 +({ \
22541 + __u64 e1; \
22542 + __u64 e2; \
22543 + \
22544 + e1 = get_key_el(k1, off); \
22545 + e2 = get_key_el(k2, off); \
22546 + \
22547 + (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22548 +})
22549 +
22550 +/* compare `k1' and `k2'. This function is the heart of "key allocation
22551 + policy". All you need to implement new policy is to add yet another
22552 + clause here. */
22553 +static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22554 + const reiser4_key * k2/* second key to compare */)
22555 +{
22556 + cmp_t result;
22557 +
22558 + /*
22559 + * This function is the heart of reiser4 tree-routines. Key comparison
22560 + * is among the most heavily used operations in the file system.
22561 + */
22562 +
22563 + assert("nikita-439", k1 != NULL);
22564 + assert("nikita-440", k2 != NULL);
22565 +
22566 + /* there is no actual branch here: condition is compile time constant
22567 + * and constant folding and propagation ensures that only one branch
22568 + * is actually compiled in. */
22569 +
22570 + if (REISER4_PLANA_KEY_ALLOCATION) {
22571 + /* if physical order of fields in a key is identical
22572 + with logical order, we can implement key comparison
22573 + as three 64bit comparisons. */
22574 + /* logical order of fields in plan-a:
22575 + locality->type->objectid->offset. */
22576 + /* compare locality and type at once */
22577 + result = KEY_DIFF_EL(k1, k2, 0);
22578 + if (result == EQUAL_TO) {
22579 + /* compare objectid (and band if it's there) */
22580 + result = KEY_DIFF_EL(k1, k2, 1);
22581 + /* compare offset */
22582 + if (result == EQUAL_TO) {
22583 + result = KEY_DIFF_EL(k1, k2, 2);
22584 + if (REISER4_LARGE_KEY && result == EQUAL_TO)
22585 + result = KEY_DIFF_EL(k1, k2, 3);
22586 + }
22587 + }
22588 + } else if (REISER4_3_5_KEY_ALLOCATION) {
22589 + result = KEY_DIFF(k1, k2, locality);
22590 + if (result == EQUAL_TO) {
22591 + result = KEY_DIFF(k1, k2, objectid);
22592 + if (result == EQUAL_TO) {
22593 + result = KEY_DIFF(k1, k2, type);
22594 + if (result == EQUAL_TO)
22595 + result = KEY_DIFF(k1, k2, offset);
22596 + }
22597 + }
22598 + } else
22599 + impossible("nikita-441", "Unknown key allocation scheme!");
22600 + return result;
22601 +}
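
For plan-A allocation the comparison above is just a lexicographic compare of the raw elements. A standalone sketch of that fast path over three CPU-order elements; on-disk keys would need le64_to_cpu() per element, which is what KEY_DIFF_EL does:

#include <stdint.h>
#include <stdio.h>

/* plan-A fast path: lexicographic compare of three 64-bit elements */
static int cmp3(const uint64_t *a, const uint64_t *b)
{
        int i;

        for (i = 0; i < 3; i++) {
                if (a[i] < b[i])
                        return -1;  /* LESS_THAN */
                if (a[i] > b[i])
                        return +1;  /* GREATER_THAN */
        }
        return 0;                   /* EQUAL_TO */
}

int main(void)
{
        uint64_t k1[3] = { 0x10, 0x5, 0x100 };
        uint64_t k2[3] = { 0x10, 0x5, 0x200 };

        printf("%d\n", cmp3(k1, k2));  /* -1: keys differ only in offset */
        return 0;
}
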
22602 +
22603 +/* true if @k1 equals @k2 */
22604 +static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22605 + const reiser4_key * k2/* second key to compare */)
22606 +{
22607 + assert("nikita-1879", k1 != NULL);
22608 + assert("nikita-1880", k2 != NULL);
22609 + return !memcmp(k1, k2, sizeof *k1);
22610 +}
22611 +
22612 +/* true if @k1 is less than @k2 */
22613 +static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22614 + const reiser4_key * k2/* second key to compare */)
22615 +{
22616 + assert("nikita-1952", k1 != NULL);
22617 + assert("nikita-1953", k2 != NULL);
22618 + return keycmp(k1, k2) == LESS_THAN;
22619 +}
22620 +
22621 +/* true if @k1 is less than or equal to @k2 */
22622 +static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22623 + const reiser4_key * k2/* second key to compare */)
22624 +{
22625 + assert("nikita-1954", k1 != NULL);
22626 + assert("nikita-1955", k2 != NULL);
22627 + return keycmp(k1, k2) != GREATER_THAN;
22628 +}
22629 +
22630 +/* true if @k1 is greater than @k2 */
22631 +static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22632 + const reiser4_key * k2/* second key to compare */)
22633 +{
22634 + assert("nikita-1959", k1 != NULL);
22635 + assert("nikita-1960", k2 != NULL);
22636 + return keycmp(k1, k2) == GREATER_THAN;
22637 +}
22638 +
22639 +/* true if @k1 is greater than or equal to @k2 */
22640 +static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22641 + const reiser4_key * k2/* second key to compare */)
22642 +{
22643 + assert("nikita-1956", k1 != NULL);
22644 + assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22645 + * November 3: Laika */
22646 + return keycmp(k1, k2) != LESS_THAN;
22647 +}
22648 +
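+/* A minimal usage sketch for the predicates above. key_in_range() is a
+   hypothetical helper, not part of the reiser4 interface: it returns true
+   iff @k lies in the half-open interval [@from, @to). */
+static inline int key_in_range(const reiser4_key * k,
+			       const reiser4_key * from,
+			       const reiser4_key * to)
+{
+	return keyle(from, k) && keylt(k, to);
+}
+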
22649 +static inline void prefetchkey(reiser4_key * key)
22650 +{
22651 + prefetch(key);
22652 + prefetch(&key->el[KEY_CACHELINE_END]);
22653 +}
22654 +
22655 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22656 + 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22657 +/* size of a buffer suitable to hold human readable key representation */
22658 +#define KEY_BUF_LEN (80)
22659 +
22660 +#if REISER4_DEBUG
22661 +extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22662 +#else
22663 +#define reiser4_print_key(p, k) noop
22664 +#endif
22665 +
22666 +/* __FS_REISERFS_KEY_H__ */
22667 +#endif
22668 +
22669 +/* Make Linus happy.
22670 + Local variables:
22671 + c-indentation-style: "K&R"
22672 + mode-name: "LC"
22673 + c-basic-offset: 8
22674 + tab-width: 8
22675 + fill-column: 120
22676 + End:
22677 +*/
22678 diff -urN linux-2.6.33.orig/fs/reiser4/ktxnmgrd.c linux-2.6.33/fs/reiser4/ktxnmgrd.c
22679 --- linux-2.6.33.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 01:00:00.000000000 +0100
22680 +++ linux-2.6.33/fs/reiser4/ktxnmgrd.c 2010-03-04 19:33:22.000000000 +0100
22681 @@ -0,0 +1,215 @@
22682 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22683 +/* Transaction manager daemon. */
22684 +
22685 +/*
22686 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22687 + * needed/important for the following reasons:
22688 + *
22689 + * 1. in reiser4 an atom is not committed immediately when the last
22690 + * transaction handle closes, unless the atom is either too old or too
22691 + * large (see atom_should_commit()). This is done to avoid committing
22692 + * too frequently, because:
22693 + *
22694 + * 2. sometimes we don't want to commit an atom when closing the last
22695 + * transaction handle, even if it is old and large enough: for example,
22696 + * because we are at this point under a directory semaphore, and
22697 + * committing would stall all accesses to this directory.
22698 + *
22699 + * ktxnmgrd spends its time sleeping on a condition variable. When it
22700 + * awakes, either due to a (tunable) timeout or because it was explicitly
22701 + * woken up by a call to ktxnmgrd_kick(), it scans the list of all atoms
22702 + * and commits the eligible ones.
22703 + *
22704 + */
22705 +
22706 +#include "debug.h"
22707 +#include "txnmgr.h"
22708 +#include "tree.h"
22709 +#include "ktxnmgrd.h"
22710 +#include "super.h"
22711 +#include "reiser4.h"
22712 +
22713 +#include <linux/sched.h> /* for struct task_struct */
22714 +#include <linux/wait.h>
22715 +#include <linux/suspend.h>
22716 +#include <linux/kernel.h>
22717 +#include <linux/writeback.h>
22718 +#include <linux/kthread.h>
22719 +#include <linux/freezer.h>
22720 +
22721 +static int scan_mgr(struct super_block *);
22722 +
22723 +/*
22724 + * change current->comm so that ps, top, and friends will see the changed
22725 + * state. This serves no useful purpose whatsoever, but also costs nothing.
22726 + * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
22727 + */
22728 +#define set_comm(state) \
22729 + snprintf(current->comm, sizeof(current->comm), \
22730 + "%s:%s:%s", __FUNCTION__, (super)->s_id, (state))
22731 +
22732 +/**
22733 + * ktxnmgrd - kernel txnmgr daemon
22734 + * @arg: pointer to super block
22735 + *
22736 + * The background transaction manager daemon, started as a kernel thread during
22737 + * reiser4 initialization.
22738 + */
22739 +static int ktxnmgrd(void *arg)
22740 +{
22741 + struct super_block *super;
22742 + ktxnmgrd_context *ctx;
22743 + txn_mgr *mgr;
22744 + int done = 0;
22745 +
22746 + super = arg;
22747 + mgr = &get_super_private(super)->tmgr;
22748 +
22749 + /*
22750 + * do_fork() just copies task_struct into the new thread. ->fs_context
22751 + * shouldn't be copied of course. This shouldn't be a problem for the
22752 + * rest of the code though.
22753 + */
22754 + current->journal_info = NULL;
22755 + ctx = mgr->daemon;
22756 + while (1) {
22757 + try_to_freeze();
22758 + set_comm("wait");
22759 + {
22760 + DEFINE_WAIT(__wait);
22761 +
22762 + prepare_to_wait(&ctx->wait, &__wait,
22763 + TASK_INTERRUPTIBLE);
22764 + if (kthread_should_stop())
22765 + done = 1;
22766 + else
22767 + schedule_timeout(ctx->timeout);
22768 + finish_wait(&ctx->wait, &__wait);
22769 + }
22770 + if (done)
22771 + break;
22772 + set_comm("run");
22773 + spin_lock(&ctx->guard);
22774 + /*
22775 + * wait timed out or ktxnmgrd was woken up by explicit request
22776 + * to commit something. Scan list of atoms in txnmgr and look
22777 + * for too old atoms.
22778 + */
22779 + do {
22780 + ctx->rescan = 0;
22781 + spin_unlock(&ctx->guard);
22782 + scan_mgr(super);
22783 + spin_lock(&ctx->guard);
22784 + /*
22785 + * if ctx->rescan was set while the guard was
22786 + * released, the atom list could have been
22787 + * modified; scanning repeats from the beginning
22788 + */
22789 + } while (ctx->rescan);
22792 + spin_unlock(&ctx->guard);
22793 + }
22794 + return 0;
22795 +}
22796 +
22797 +#undef set_comm
22798 +
22799 +/**
22800 + * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22801 + * @super: pointer to super block
22802 + *
22803 + * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22804 + * manager. Starts kernel txnmgr daemon. This is called on mount.
22805 + */
22806 +int reiser4_init_ktxnmgrd(struct super_block *super)
22807 +{
22808 + txn_mgr *mgr;
22809 + ktxnmgrd_context *ctx;
22810 +
22811 + mgr = &get_super_private(super)->tmgr;
22812 +
22813 + assert("zam-1014", mgr->daemon == NULL);
22814 +
22815 + ctx = kzalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22816 + if (!ctx)
22817 + return RETERR(-ENOMEM);
22818 +
22819 + assert("nikita-2442", ctx != NULL);
22820 +
22821 + init_waitqueue_head(&ctx->wait);
22822 +
22823 + /*kcond_init(&ctx->startup);*/
22824 + spin_lock_init(&ctx->guard);
22825 + ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22826 + ctx->rescan = 1;
22827 + mgr->daemon = ctx;
22828 +
22829 + ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22830 + if (IS_ERR(ctx->tsk)) {
22831 + int ret = PTR_ERR(ctx->tsk);
22832 + mgr->daemon = NULL;
22833 + kfree(ctx);
22834 + return RETERR(ret);
22835 + }
22836 + return 0;
22837 +}
22838 +
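+/* A sketch of the intended pairing, with error handling elided and the
+   surrounding mount/umount code assumed: the mount path runs
+
+	ret = reiser4_init_ktxnmgrd(super);
+	if (ret != 0)
+		return ret;
+
+   and the matching umount path later runs
+
+	reiser4_done_ktxnmgrd(super);
+
+   exactly once, after which mgr->daemon is NULL again. */
+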
22839 +void ktxnmgrd_kick(txn_mgr *mgr)
22840 +{
22841 + assert("nikita-3234", mgr != NULL);
22842 + assert("nikita-3235", mgr->daemon != NULL);
22843 + wake_up(&mgr->daemon->wait);
22844 +}
22845 +
22846 +int is_current_ktxnmgrd(void)
22847 +{
22848 + return (get_current_super_private()->tmgr.daemon->tsk == current);
22849 +}
22850 +
22851 +/**
22852 + * scan_mgr - commit atoms which are to be committed
22853 + * @super: super block to commit atoms of
22854 + *
22855 + * Commits old atoms.
22856 + */
22857 +static int scan_mgr(struct super_block *super)
22858 +{
22859 + int ret;
22860 + reiser4_context ctx;
22861 +
22862 + init_stack_context(&ctx, super);
22863 +
22864 + ret = commit_some_atoms(&get_super_private(super)->tmgr);
22865 +
22866 + reiser4_exit_context(&ctx);
22867 + return ret;
22868 +}
22869 +
22870 +/**
22871 + * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22872 + * @super: super block of the file system being unmounted
22873 + *
22874 + * This is called on umount. Stops ktxnmgrd and frees its context.
22875 + */
22876 +void reiser4_done_ktxnmgrd(struct super_block *super)
22877 +{
22878 + txn_mgr *mgr;
22879 +
22880 + mgr = &get_super_private(super)->tmgr;
22881 + assert("zam-1012", mgr->daemon != NULL);
22882 +
22883 + kthread_stop(mgr->daemon->tsk);
22884 + kfree(mgr->daemon);
22885 + mgr->daemon = NULL;
22886 +}
22887 +
22888 +/*
22889 + * Local variables:
22890 + * c-indentation-style: "K&R"
22891 + * mode-name: "LC"
22892 + * c-basic-offset: 8
22893 + * tab-width: 8
22894 + * fill-column: 120
22895 + * End:
22896 + */
22897 diff -urN linux-2.6.33.orig/fs/reiser4/ktxnmgrd.h linux-2.6.33/fs/reiser4/ktxnmgrd.h
22898 --- linux-2.6.33.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 01:00:00.000000000 +0100
22899 +++ linux-2.6.33/fs/reiser4/ktxnmgrd.h 2010-03-04 19:33:22.000000000 +0100
22900 @@ -0,0 +1,52 @@
22901 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22902 + * reiser4/README */
22903 +
22904 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22905 +
22906 +#ifndef __KTXNMGRD_H__
22907 +#define __KTXNMGRD_H__
22908 +
22909 +#include "txnmgr.h"
22910 +
22911 +#include <linux/fs.h>
22912 +#include <linux/wait.h>
22913 +#include <linux/completion.h>
22914 +#include <linux/spinlock.h>
22915 +#include <asm/atomic.h>
22916 +#include <linux/sched.h> /* for struct task_struct */
22917 +
22918 +/* in this structure all data necessary to start up, shut down and communicate
22919 + * with ktxnmgrd are kept. */
22920 +struct ktxnmgrd_context {
22921 + /* wait queue head on which ktxnmgrd sleeps */
22922 + wait_queue_head_t wait;
22923 + /* spin lock protecting all fields of this structure */
22924 + spinlock_t guard;
22925 + /* timeout of sleeping on ->wait */
22926 + signed long timeout;
22927 + /* kernel thread running ktxnmgrd */
22928 + struct task_struct *tsk;
22929 + /* list of all file systems served by this ktxnmgrd */
22930 + struct list_head queue;
22931 + /* should ktxnmgrd repeat scanning of atoms? */
22932 + unsigned int rescan:1;
22933 +};
22934 +
22935 +extern int reiser4_init_ktxnmgrd(struct super_block *);
22936 +extern void reiser4_done_ktxnmgrd(struct super_block *);
22937 +
22938 +extern void ktxnmgrd_kick(txn_mgr * mgr);
22939 +extern int is_current_ktxnmgrd(void);
22940 +
22941 +/* __KTXNMGRD_H__ */
22942 +#endif
22943 +
22944 +/* Make Linus happy.
22945 + Local variables:
22946 + c-indentation-style: "K&R"
22947 + mode-name: "LC"
22948 + c-basic-offset: 8
22949 + tab-width: 8
22950 + fill-column: 120
22951 + End:
22952 +*/
22953 diff -urN linux-2.6.33.orig/fs/reiser4/lock.c linux-2.6.33/fs/reiser4/lock.c
22954 --- linux-2.6.33.orig/fs/reiser4/lock.c 1970-01-01 01:00:00.000000000 +0100
22955 +++ linux-2.6.33/fs/reiser4/lock.c 2010-03-04 19:33:22.000000000 +0100
22956 @@ -0,0 +1,1237 @@
22957 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22958 + * reiser4/README */
22959 +
22960 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22961 + order. V4 balances the tree from the bottom up, and searches the tree from
22962 + the top down, and that is really the way we want it, so tradition won't work
22963 + for us.
22964 +
22965 + Instead we have two lock orderings, a high priority lock ordering, and a low
22966 + priority lock ordering. Each node in the tree has a lock in its znode.
22967 +
22968 + Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22969 + has a set (maybe empty) of already locked nodes ("process locked set"). Each
22970 + process may have a pending lock request to a node locked by another process.
22971 + Note: we lock and unlock, but do not transfer locks: it is possible
22972 + transferring locks instead would save some bus locking....
22973 +
22974 + Deadlock occurs when we have a loop constructed from process locked sets and
22975 + lock request vectors.
22976 +
22977 + NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22978 + memory is extended with "znodes" with which we connect nodes with their left
22979 + and right neighbors using sibling pointers stored in the znodes. When we
22980 + perform balancing operations we often go from left to right and from right to
22981 + left.
22982 +
22983 + +-P1-+ +-P3-+
22984 + |+--+| V1 |+--+|
22985 + ||N1|| -------> ||N3||
22986 + |+--+| |+--+|
22987 + +----+ +----+
22988 + ^ |
22989 + |V2 |V3
22990 + | v
22991 + +---------P2---------+
22992 + |+--+ +--+|
22993 + ||N2| -------- |N4||
22994 + |+--+ +--+|
22995 + +--------------------+
22996 +
22997 + We solve this by ensuring that only low priority processes lock in top to
22998 + bottom order and from right to left, and high priority processes lock from
22999 + bottom to top and left to right.
23000 +
23001 + ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
23002 + kill those damn busy loops.
23003 + ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
23004 + stage) cannot be ordered that way. There are no rules what nodes can belong
23005 + to the atom and what nodes cannot. We cannot define what is right or left
23006 + direction, what is top or bottom. We can take immediate parent or side
23007 + neighbor of one node, but nobody guarantees that, say, left neighbor node is
23008 + not a far right neighbor for other nodes from the same atom. It breaks
23009 + deadlock avoidance rules and hi-low priority locking cannot be applied for
23010 + atom locks.
23011 +
23012 + How does it help to avoid deadlocks?
23013 +
23014 + Suppose we have a deadlock with n processes. Processes from one priority
23015 + class never deadlock because they take locks in one consistent
23016 + order.
23017 +
23018 + So, any possible deadlock loop must have low priority as well as high
23019 + priority processes. There are no other lock priority levels except low and
23020 + high. We know that any deadlock loop contains at least one node locked by a
23021 + low priority process and requested by a high priority process. If this
23022 + situation is caught and resolved it is sufficient to avoid deadlocks.
23023 +
23024 + V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
23025 +
23026 + The deadlock prevention algorithm is based on comparing
23027 + priorities of node owners (processes which keep znode locked) and
23028 + requesters (processes which want to acquire a lock on znode). We
23029 + implement a scheme where low-priority owners yield locks to
23030 + high-priority requesters. We created a signal passing system that
23031 + is used to ask low-priority processes to yield one or more locked
23032 + znodes.
23033 +
23034 + The condition when a znode needs to change its owners is described by the
23035 + following formula:
23036 +
23037 + #############################################
23038 + # #
23039 + # (number of high-priority requesters) > 0 #
23040 + # AND #
23041 + # (number of high-priority owners) == 0 #
23042 + # #
23043 + #############################################
23044 +
23045 + Note that a low-priority process delays node releasing if another
23046 + high-priority process owns this node. So, slightly more strictly speaking,
23047 + to have a deadlock capable cycle you must have a loop in which a high
23048 + priority process is waiting on a low priority process to yield a node, which
23049 + is slightly different from saying a high priority process is waiting on a
23050 + node owned by a low priority process.
23051 +
23052 + It is enough to avoid deadlocks if we prevent any low-priority process from
23053 + falling asleep if its locked set contains a node which satisfies the
23054 + deadlock condition.
23055 +
23056 + That condition is implicitly or explicitly checked in all places where new
23057 + high-priority requests may be added to or removed from the node request
23058 + queue, or where a high-priority process takes or releases a lock on the
23059 + node. The main goal of these checks is never to miss the moment when the
23060 + node acquires "wrong owners", and to send "must-yield-this-lock" signals
23061 + to its low-pri owners at that time.
23062 +
23063 + The information about received signals is stored in the per-process
23064 + structure (lock stack) and analyzed before a low-priority process goes
23065 + to sleep, but after a "fast" attempt to lock a node fails. Any signal
23066 + wakes the sleeping process up and forces it to re-check the lock status
23067 + and received signal info. If "must-yield-this-lock" signals were received,
23068 + the locking primitive (longterm_lock_znode()) fails with -E_DEADLOCK.
23069 +
23070 + V4 LOCKING DRAWBACKS
23071 +
23072 + If we have already balanced on one level, and we are propagating our changes
23073 + upward to a higher level, it could be very messy to surrender all locks on
23074 + the lower level because we put so much computational work into it, and
23075 + reverting them to their state before they were locked might be very complex.
23076 + We also don't want to acquire all locks before performing balancing because
23077 + that would either be almost as much work as the balancing, or it would be
23078 + too conservative and lock too much. We want balancing to be done only at
23079 + high priority. Yet, we might want to go to the left one node and use some
23080 + of its empty space... So we make one attempt at getting the node to the left
23081 + using try_lock, and if it fails we do without it, because we didn't really
23082 + need it; it was only nice to have.
23083 +
23084 + LOCK STRUCTURES DESCRIPTION
23085 +
23086 + The following data structures are used in the reiser4 locking
23087 + implementation:
23088 +
23089 + All fields related to long-term locking are stored in znode->lock.
23090 +
23091 + The lock stack is a per thread object. It owns all znodes locked by the
23092 + thread. One znode may be locked by several threads in case of read lock or
23093 + one znode may be write locked by one thread several times. The special link
23094 + objects (lock handles) support n<->m relation between znodes and lock
23095 + owners.
23096 +
23097 + <Thread 1> <Thread 2>
23098 +
23099 + +---------+ +---------+
23100 + | LS1 | | LS2 |
23101 + +---------+ +---------+
23102 + ^ ^
23103 + |---------------+ +----------+
23104 + v v v v
23105 + +---------+ +---------+ +---------+ +---------+
23106 + | LH1 | | LH2 | | LH3 | | LH4 |
23107 + +---------+ +---------+ +---------+ +---------+
23108 + ^ ^ ^ ^
23109 + | +------------+ |
23110 + v v v
23111 + +---------+ +---------+ +---------+
23112 + | Z1 | | Z2 | | Z3 |
23113 + +---------+ +---------+ +---------+
23114 +
23115 + Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
23116 + picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
23117 + LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
23118 + Z1 is locked by only one thread, znode has only one lock handle LH1 on its
23119 + list, similar situation is for Z3 which is locked by the thread 2 only. Z2
23120 + is locked (for read) twice by different threads and two lock handles are on
23121 + its list. Each lock handle represents a single relation of a locking of a
23122 + znode by a thread. Locking of a znode is an establishing of a locking
23123 + relation between the lock stack and the znode by adding of a new lock handle
23124 + to a list of lock handles, the lock stack. The lock stack links all lock
23125 + handles for all znodes locked by the lock stack. The znode list groups all
23126 + lock handles for all locks stacks which locked the znode.
23127 +
23128 + Yet another relation may exist between a znode and lock owners. If the
23129 + lock procedure cannot immediately take a lock on an object, it adds the
23130 + lock owner to a special `requestors' list that belongs to the znode. That
23131 + list represents a queue of pending lock requests. Because one lock owner
23132 + may request only one lock object at a time, this is a 1->n relation
23133 + between lock objects and lock owners, implemented as described above.
23134 + Full information (priority, pointers to lock and link objects) about
23135 + each lock request is stored in the lock owner structure, in its `request' field.
23136 +
23137 + SHORT_TERM LOCKING
23138 +
23139 + This is a list of primitive operations over lock stacks / lock handles /
23140 + znodes and locking descriptions for them.
23141 +
23142 + 1. locking / unlocking which is done by two list insertion/deletion, one
23143 + to/from znode's list of lock handles, another one is to/from lock stack's
23144 + list of lock handles. The first insertion is protected by
23145 + znode->lock.guard spinlock. The list owned by the lock stack can be
23146 + modified only by thread who owns the lock stack and nobody else can
23147 + modify/read it. There is nothing to be protected by a spinlock or
23148 + something else.
23149 +
23150 + 2. adding/removing a lock request to/from the znode's requestors list. The
23151 + rule is that the znode->lock.guard spinlock should be taken for this.
23152 +
23153 + 3. we can traverse the list of lock handles and use references to the lock
23154 + stacks that locked a given znode if the znode->lock.guard spinlock is taken.
23155 +
23156 + 4. If a lock stack is associated with a znode as a lock requestor or lock
23157 + owner, its existence is guaranteed by the znode->lock.guard spinlock. Some
23158 + of its (the lock stack's) fields should be protected from being accessed
23159 + in parallel by two or more threads. Please look at the lock_stack structure
23160 + definition for info on how those fields are protected. */
23161 +
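+/* A traversal sketch for the n<->m linkage described above: visiting every
+   znode currently locked by a lock stack through its lock handles. The
+   helper name is hypothetical; per rule 1 of SHORT_TERM LOCKING above, no
+   spinlock is needed when @owner is the current thread's own lock stack:
+
+	static void walk_locked_znodes(lock_stack *owner)
+	{
+		lock_handle *h;
+
+		list_for_each_entry(h, &owner->locks, locks_link)
+			process_znode(h->node);
+	}
+
+   where process_znode() stands in for whatever per-node work is needed. */
+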
23162 +/* Znode lock and capturing intertwining. */
23163 +/* In the current implementation we capture formatted nodes before locking
23164 + them. Take a look at longterm_lock_znode(): the reiser4_try_capture()
23165 + request precedes the locking request; longterm_lock_znode() unconditionally
23166 + captures the znode before even checking the locking conditions.
23167 +
23168 + Another variant is to capture the znode after locking it. It was not tested,
23169 + but at least one deadlock scenario is expected there. One thread has
23170 + locked a znode (Node-1) and calls reiser4_try_capture() for it.
23171 + reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
23172 + Second thread is a flushing thread, its current atom is the atom Node-1
23173 + belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
23174 + is locked by the first thread. The described situation is a deadlock. */
23175 +
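+/* The capture-after-lock deadlock described above, as a timeline sketch:
+
+	thread A: longterm_lock_znode(Node-1)      succeeds, lock held
+	thread A: reiser4_try_capture(Node-1)      sleeps: the atom is in
+	                                           CAPTURE_WAIT state
+	thread B: (flush, owns that atom)
+	          longterm_lock_znode(Node-1)      sleeps on A's lock
+
+   Neither thread can make progress, which is why capture precedes locking
+   in the current implementation. */
+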
23176 +#include "debug.h"
23177 +#include "txnmgr.h"
23178 +#include "znode.h"
23179 +#include "jnode.h"
23180 +#include "tree.h"
23181 +#include "plugin/node/node.h"
23182 +#include "super.h"
23183 +
23184 +#include <linux/spinlock.h>
23185 +
23186 +#if REISER4_DEBUG
23187 +static int request_is_deadlock_safe(znode * , znode_lock_mode,
23188 + znode_lock_request);
23189 +#endif
23190 +
23191 +/* Returns a lock owner associated with current thread */
23192 +lock_stack *get_current_lock_stack(void)
23193 +{
23194 + return &get_current_context()->stack;
23195 +}
23196 +
23197 +/* Wakes up all low priority owners informing them about possible deadlock */
23198 +static void wake_up_all_lopri_owners(znode * node)
23199 +{
23200 + lock_handle *handle;
23201 +
23202 + assert_spin_locked(&(node->lock.guard));
23203 + list_for_each_entry(handle, &node->lock.owners, owners_link) {
23204 + assert("nikita-1832", handle->node == node);
23205 + /* count this signal in owner->nr_signaled */
23206 + if (!handle->signaled) {
23207 + handle->signaled = 1;
23208 + atomic_inc(&handle->owner->nr_signaled);
23209 + /* Wake up a single process */
23210 + reiser4_wake_up(handle->owner);
23211 + }
23212 + }
23213 +}
23214 +
23215 +/* Adds a lock to a lock owner, which means creating a link to the lock and
23216 + putting the link into the two lists all links are on (the doubly linked list
23217 + that forms the lock_stack, and the doubly linked list of links attached
23218 + to a lock).
23219 +*/
23220 +static inline void
23221 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
23222 +{
23223 + assert("jmacd-810", handle->owner == NULL);
23224 + assert_spin_locked(&(node->lock.guard));
23225 +
23226 + handle->owner = owner;
23227 + handle->node = node;
23228 +
23229 + assert("reiser4-4",
23230 + ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
23231 +
23232 + /* add lock handle to the end of lock_stack's list of locks */
23233 + list_add_tail(&handle->locks_link, &owner->locks);
23234 + ON_DEBUG(owner->nr_locks++);
23235 + reiser4_ctx_gfp_mask_set();
23236 +
23237 + /* add lock handle to the head of znode's list of owners */
23238 + list_add(&handle->owners_link, &node->lock.owners);
23239 + handle->signaled = 0;
23240 +}
23241 +
23242 +/* Breaks a relation between a lock and its owner */
23243 +static inline void unlink_object(lock_handle * handle)
23244 +{
23245 + assert("zam-354", handle->owner != NULL);
23246 + assert("nikita-1608", handle->node != NULL);
23247 + assert_spin_locked(&(handle->node->lock.guard));
23248 + assert("nikita-1829", handle->owner == get_current_lock_stack());
23249 + assert("reiser4-5", handle->owner->nr_locks > 0);
23250 +
23251 + /* remove lock handle from lock_stack's list of locks */
23252 + list_del(&handle->locks_link);
23253 + ON_DEBUG(handle->owner->nr_locks--);
23254 + reiser4_ctx_gfp_mask_set();
23255 + assert("reiser4-6",
23256 + ergo(list_empty_careful(&handle->owner->locks),
23257 + handle->owner->nr_locks == 0));
23258 + /* remove lock handle from znode's list of owners */
23259 + list_del(&handle->owners_link);
23260 + /* indicates that lock handle is free now */
23261 + handle->node = NULL;
23262 +#if REISER4_DEBUG
23263 + INIT_LIST_HEAD(&handle->locks_link);
23264 + INIT_LIST_HEAD(&handle->owners_link);
23265 + handle->owner = NULL;
23266 +#endif
23267 +}
23268 +
23269 +/* Actually locks an object knowing that we are able to do this */
23270 +static void lock_object(lock_stack * owner)
23271 +{
23272 + struct lock_request *request;
23273 + znode *node;
23274 +
23275 + request = &owner->request;
23276 + node = request->node;
23277 + assert_spin_locked(&(node->lock.guard));
23278 + if (request->mode == ZNODE_READ_LOCK) {
23279 + node->lock.nr_readers++;
23280 + } else {
23281 + /* check that we didn't switch from read to write lock */
23282 + assert("nikita-1840", node->lock.nr_readers <= 0);
23283 + /* We allow recursive locking; a node can be locked several
23284 + times for write by same process */
23285 + node->lock.nr_readers--;
23286 + }
23287 +
23288 + link_object(request->handle, owner, node);
23289 +
23290 + if (owner->curpri)
23291 + node->lock.nr_hipri_owners++;
23292 +}
23293 +
23294 +/* Check for recursive write locking */
23295 +static int recursive(lock_stack * owner)
23296 +{
23297 + int ret;
23298 + znode *node;
23299 + lock_handle *lh;
23300 +
23301 + node = owner->request.node;
23302 +
23303 + /* Owners list is not empty for a locked node */
23304 + assert("zam-314", !list_empty_careful(&node->lock.owners));
23305 + assert("nikita-1841", owner == get_current_lock_stack());
23306 + assert_spin_locked(&(node->lock.guard));
23307 +
23308 + lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
23309 + ret = (lh->owner == owner);
23310 +
23311 + /* Recursive read locking should be done the usual way */
23312 + assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
23313 + /* mixing of read/write locks is not allowed */
23314 + assert("zam-341", !ret || znode_is_wlocked(node));
23315 +
23316 + return ret;
23317 +}
23318 +
23319 +#if REISER4_DEBUG
23320 +/* Returns true if the lock is held by the calling thread. */
23321 +int znode_is_any_locked(const znode * node)
23322 +{
23323 + lock_handle *handle;
23324 + lock_stack *stack;
23325 + int ret;
23326 +
23327 + if (!znode_is_locked(node))
23328 + return 0;
23329 +
23330 + stack = get_current_lock_stack();
23331 +
23332 + spin_lock_stack(stack);
23333 +
23334 + ret = 0;
23335 +
23336 + list_for_each_entry(handle, &stack->locks, locks_link) {
23337 + if (handle->node == node) {
23338 + ret = 1;
23339 + break;
23340 + }
23341 + }
23342 +
23343 + spin_unlock_stack(stack);
23344 +
23345 + return ret;
23346 +}
23347 +
23348 +#endif
23349 +
23350 +/* Returns true if a write lock is held by the calling thread. */
23351 +int znode_is_write_locked(const znode * node)
23352 +{
23353 + lock_stack *stack;
23354 + lock_handle *handle;
23355 +
23356 + assert("jmacd-8765", node != NULL);
23357 +
23358 + if (!znode_is_wlocked(node))
23359 + return 0;
23360 +
23361 + stack = get_current_lock_stack();
23362 +
23363 + /*
23364 + * When znode is write locked, all owner handles point to the same lock
23365 + * stack. Get pointer to lock stack from the first lock handle from
23366 + * znode's owner list
23367 + */
23368 + handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
23369 +
23370 + return (handle->owner == stack);
23371 +}
23372 +
23373 +/* This "deadlock" condition is the essential part of reiser4 locking
23374 + implementation. This condition is checked explicitly by calling
23375 + check_deadlock_condition() or implicitly in all places where znode lock
23376 + state (set of owners and request queue) is changed. Locking code is
23377 + designed to use this condition to trigger procedure of passing object from
23378 + low priority owner(s) to high priority one(s).
23379 +
23380 + The procedure results in passing an event (setting the
23381 + lock_handle->signaled flag), counting this event in the nr_signaled field
23382 + of the owner's lock stack object, and waking up the owner's process.
23383 +*/
23384 +static inline int check_deadlock_condition(znode * node)
23385 +{
23386 + assert_spin_locked(&(node->lock.guard));
23387 + return node->lock.nr_hipri_requests > 0
23388 + && node->lock.nr_hipri_owners == 0;
23389 +}
23390 +
23391 +static int check_livelock_condition(znode * node, znode_lock_mode mode)
23392 +{
23393 + zlock * lock = &node->lock;
23394 +
23395 + return mode == ZNODE_READ_LOCK &&
23396 + lock->nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23397 +}
23398 +
23399 +/* checks lock/request compatibility */
23400 +static int can_lock_object(lock_stack * owner)
23401 +{
23402 + znode *node = owner->request.node;
23403 +
23404 + assert_spin_locked(&(node->lock.guard));
23405 +
23406 + /* See if the node is disconnected. */
23407 + if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23408 + return RETERR(-EINVAL);
23409 +
23410 + /* Do not ever try to take a lock if we are going in the low priority
23411 + direction and the node has a high priority request without high
23412 + priority owners. */
23413 + if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23414 + return RETERR(-E_REPEAT);
23415 + if (unlikely(owner->curpri &&
23416 + check_livelock_condition(node, owner->request.mode)))
23417 + return RETERR(-E_REPEAT);
23418 + if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23419 + return RETERR(-E_REPEAT);
23420 + return 0;
23421 +}
23422 +
23423 +/* Sets a high priority for the process. It clears "signaled" flags,
23424 + because a znode locked by a high-priority process can't satisfy our
23425 + "deadlock condition". */
23426 +static void set_high_priority(lock_stack * owner)
23427 +{
23428 + assert("nikita-1846", owner == get_current_lock_stack());
23429 + /* Do nothing if current priority is already high */
23430 + if (!owner->curpri) {
23431 + /* We don't need locking for owner->locks list, because, this
23432 + * function is only called with the lock stack of the current
23433 + * thread, and no other thread can play with owner->locks list
23434 + * and/or change ->node pointers of lock handles in this list.
23435 + *
23436 + * (Interrupts also are not involved.)
23437 + */
23438 + lock_handle *item = list_entry(owner->locks.next, lock_handle,
23439 + locks_link);
23440 + while (&owner->locks != &item->locks_link) {
23441 + znode *node = item->node;
23442 +
23443 + spin_lock_zlock(&node->lock);
23444 +
23445 + node->lock.nr_hipri_owners++;
23446 +
23447 + /* we can safely set signaled to zero, because the
23448 + previous statement (nr_hipri_owners++) guarantees
23449 + that signaled will never be set again. */
23450 + item->signaled = 0;
23451 + spin_unlock_zlock(&node->lock);
23452 +
23453 + item = list_entry(item->locks_link.next, lock_handle,
23454 + locks_link);
23455 + }
23456 + owner->curpri = 1;
23457 + atomic_set(&owner->nr_signaled, 0);
23458 + }
23459 +}
23460 +
23461 +/* Sets a low priority for the process. */
23462 +static void set_low_priority(lock_stack * owner)
23463 +{
23464 + assert("nikita-3075", owner == get_current_lock_stack());
23465 + /* Do nothing if current priority is already low */
23466 + if (owner->curpri) {
23467 + /* scan all locks (lock handles) held by @owner, which is
23468 + actually current thread, and check whether we are reaching
23469 + deadlock possibility anywhere.
23470 + */
23471 + lock_handle *handle = list_entry(owner->locks.next, lock_handle,
23472 + locks_link);
23473 + while (&owner->locks != &handle->locks_link) {
23474 + znode *node = handle->node;
23475 + spin_lock_zlock(&node->lock);
23476 + /* this thread just was hipri owner of @node, so
23477 + nr_hipri_owners has to be greater than zero. */
23478 + assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23479 + node->lock.nr_hipri_owners--;
23480 + /* If we have the deadlock condition, adjust the nr_signaled
23481 + field. It is enough to set the "signaled" flag only for the
23482 + current process; other low-pri owners will be
23483 + signaled and woken up after the current process unlocks
23484 + this object and a high-priority requestor takes
23485 + control. */
23486 + if (check_deadlock_condition(node)
23487 + && !handle->signaled) {
23488 + handle->signaled = 1;
23489 + atomic_inc(&owner->nr_signaled);
23490 + }
23491 + spin_unlock_zlock(&node->lock);
23492 + handle = list_entry(handle->locks_link.next,
23493 + lock_handle, locks_link);
23494 + }
23495 + owner->curpri = 0;
23496 + }
23497 +}
23498 +
23499 +static void remove_lock_request(lock_stack * requestor)
23500 +{
23501 + zlock * lock = &requestor->request.node->lock;
23502 +
23503 + if (requestor->curpri) {
23504 + assert("nikita-1838", lock->nr_hipri_requests > 0);
23505 + lock->nr_hipri_requests--;
23506 + if (requestor->request.mode == ZNODE_WRITE_LOCK)
23507 + lock->nr_hipri_write_requests--;
23508 + }
23509 + list_del(&requestor->requestors_link);
23510 +}
23511 +
23512 +static void invalidate_all_lock_requests(znode * node)
23513 +{
23514 + lock_stack *requestor, *tmp;
23515 +
23516 + assert_spin_locked(&(node->lock.guard));
23517 +
23518 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors,
23519 + requestors_link) {
23520 + remove_lock_request(requestor);
23521 + requestor->request.ret_code = -EINVAL;
23522 + reiser4_wake_up(requestor);
23523 + requestor->request.mode = ZNODE_NO_LOCK;
23524 + }
23525 +}
23526 +
23527 +static void dispatch_lock_requests(znode * node)
23528 +{
23529 + lock_stack *requestor, *tmp;
23530 +
23531 + assert_spin_locked(&(node->lock.guard));
23532 +
23533 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors,
23534 + requestors_link) {
23535 + if (znode_is_write_locked(node))
23536 + break;
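+ /* note: can_lock_object() returns 0 iff the lock can be
+ granted right now */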
23537 + if (!can_lock_object(requestor)) {
23538 + lock_object(requestor);
23539 + remove_lock_request(requestor);
23540 + requestor->request.ret_code = 0;
23541 + reiser4_wake_up(requestor);
23542 + requestor->request.mode = ZNODE_NO_LOCK;
23543 + }
23544 + }
23545 +}
23546 +
23547 +/* release long-term lock, acquired by longterm_lock_znode() */
23548 +void longterm_unlock_znode(lock_handle * handle)
23549 +{
23550 + znode *node = handle->node;
23551 + lock_stack *oldowner = handle->owner;
23552 + int hipri;
23553 + int readers;
23554 + int rdelta;
23555 + int youdie;
23556 +
23557 + /*
23558 + * this is time-critical and highly optimized code. Modify carefully.
23559 + */
23560 +
23561 + assert("jmacd-1021", handle != NULL);
23562 + assert("jmacd-1022", handle->owner != NULL);
23563 + assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23564 +
23565 + assert("zam-130", oldowner == get_current_lock_stack());
23566 +
23567 + LOCK_CNT_DEC(long_term_locked_znode);
23568 +
23569 + /*
23570 + * to minimize amount of operations performed under lock, pre-compute
23571 + * all variables used within critical section. This makes code
23572 + * obscure.
23573 + */
23574 +
23575 + /* was this lock of hi or lo priority */
23576 + hipri = oldowner->curpri ? 1 : 0;
23577 + /* number of readers */
23578 + readers = node->lock.nr_readers;
23579 + /* +1 if write lock, -1 if read lock */
23580 + rdelta = (readers > 0) ? -1 : +1;
23581 + /* true if node is to die and write lock is released */
23582 + youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23583 +
23584 + spin_lock_zlock(&node->lock);
23585 +
23586 + assert("zam-101", znode_is_locked(node));
23587 +
23588 + /* Adjust a number of high priority owners of this lock */
23589 + assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23590 + node->lock.nr_hipri_owners -= hipri;
23591 +
23592 + /* Handle znode deallocation on last write-lock release. */
23593 + if (znode_is_wlocked_once(node)) {
23594 + if (youdie) {
23595 + forget_znode(handle);
23596 + assert("nikita-2191", znode_invariant(node));
23597 + zput(node);
23598 + return;
23599 + }
23600 + }
23601 +
23602 + if (handle->signaled)
23603 + atomic_dec(&oldowner->nr_signaled);
23604 +
23605 + /* Unlocking means owner<->object link deletion */
23606 + unlink_object(handle);
23607 +
23608 + /* This is enough to be sure whether an object is completely
23609 + unlocked. */
23610 + node->lock.nr_readers += rdelta;
23611 +
23612 + /* If the node is locked it must have an owners list. Likewise, if
23613 + the node is unlocked it must have an empty owners list. */
23614 + assert("zam-319", equi(znode_is_locked(node),
23615 + !list_empty_careful(&node->lock.owners)));
23616 +
23617 +#if REISER4_DEBUG
23618 + if (!znode_is_locked(node))
23619 + ++node->times_locked;
23620 +#endif
23621 +
23622 + /* If there are pending lock requests we wake up a requestor */
23623 + if (!znode_is_wlocked(node))
23624 + dispatch_lock_requests(node);
23625 + if (check_deadlock_condition(node))
23626 + wake_up_all_lopri_owners(node);
23627 + spin_unlock_zlock(&node->lock);
23628 +
23629 + /* minus one reference from handle->node */
23630 + assert("nikita-2190", znode_invariant(node));
23631 + ON_DEBUG(check_lock_data());
23632 + ON_DEBUG(check_lock_node_data(node));
23633 + zput(node);
23634 +}
23635 +
23636 +/* final portion of longterm-lock */
23637 +static int
23638 +lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23639 +{
23640 + znode *node = owner->request.node;
23641 +
23642 + assert_spin_locked(&(node->lock.guard));
23643 +
23644 + /* If we broke with (ok == 0) it means we can_lock, now do it. */
23645 + if (ok == 0) {
23646 + lock_object(owner);
23647 + owner->request.mode = 0;
23648 + /* count a reference from lockhandle->node
23649 +
23650 + znode was already referenced at the entry to this function,
23651 + hence taking spin-lock here is not necessary (see comment
23652 + in the zref()).
23653 + */
23654 + zref(node);
23655 +
23656 + LOCK_CNT_INC(long_term_locked_znode);
23657 + }
23658 + spin_unlock_zlock(&node->lock);
23659 + ON_DEBUG(check_lock_data());
23660 + ON_DEBUG(check_lock_node_data(node));
23661 + return ok;
23662 +}
23663 +
23664 +/*
23665 + * version of longterm_lock_znode() optimized for the most common case: read
23666 + * lock without any special flags. This is the kind of lock that any tree
23667 + * traversal takes on the root node of the tree, which is very frequent.
23668 + */
23669 +static int longterm_lock_tryfast(lock_stack * owner)
23670 +{
23671 + int result;
23672 + znode *node;
23673 + zlock *lock;
23674 +
23675 + node = owner->request.node;
23676 + lock = &node->lock;
23677 +
23678 + assert("nikita-3340", reiser4_schedulable());
23679 + assert("nikita-3341", request_is_deadlock_safe(node,
23680 + ZNODE_READ_LOCK,
23681 + ZNODE_LOCK_LOPRI));
23682 + spin_lock_zlock(lock);
23683 + result = can_lock_object(owner);
23684 + spin_unlock_zlock(lock);
23685 +
23686 + if (likely(result != -EINVAL)) {
23687 + spin_lock_znode(node);
23688 + result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23689 + spin_unlock_znode(node);
23690 + spin_lock_zlock(lock);
23691 + if (unlikely(result != 0)) {
23692 + owner->request.mode = 0;
23693 + } else {
23694 + result = can_lock_object(owner);
23695 + if (unlikely(result == -E_REPEAT)) {
23696 + /* fall back to longterm_lock_znode() */
23697 + spin_unlock_zlock(lock);
23698 + return 1;
23699 + }
23700 + }
23701 + return lock_tail(owner, result, ZNODE_READ_LOCK);
23702 + } else
23703 + return 1;
23704 +}
23705 +
23706 +/* locks given lock object */
23707 +int longterm_lock_znode(
23708 + /* local link object (allocated by lock owner
23709 + * thread, usually on its own stack) */
23710 + lock_handle * handle,
23711 + /* znode we want to lock. */
23712 + znode * node,
23713 + /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23714 + znode_lock_mode mode,
23715 + /* ZNODE_LOCK_* request flags; returns {0, -EINVAL,
23716 + -E_DEADLOCK}, see return codes description. */
23717 + znode_lock_request request) {
23718 + int ret;
23719 + int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23720 + int non_blocking = 0;
23721 + int has_atom;
23722 + txn_capture cap_flags;
23723 + zlock *lock;
23724 + txn_handle *txnh;
23725 + tree_level level;
23726 +
23727 + /* Get current process context */
23728 + lock_stack *owner = get_current_lock_stack();
23729 +
23730 + /* Check that the lock handle is initialized and isn't already being
23731 + * used. */
23732 + assert("jmacd-808", handle->owner == NULL);
23733 + assert("nikita-3026", reiser4_schedulable());
23734 + assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23735 + assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23736 + /* long term locks are not allowed in the VM contexts (->writepage(),
23737 + * prune_{d,i}cache()).
23738 + *
23739 + * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23740 + * bug caused by d_splice_alias() only working for directories.
23741 + */
23742 + assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23743 + assert("zam-1055", mode != ZNODE_NO_LOCK);
23744 +
23745 + cap_flags = 0;
23746 + if (request & ZNODE_LOCK_NONBLOCK) {
23747 + cap_flags |= TXN_CAPTURE_NONBLOCKING;
23748 + non_blocking = 1;
23749 + }
23750 +
23751 + if (request & ZNODE_LOCK_DONT_FUSE)
23752 + cap_flags |= TXN_CAPTURE_DONT_FUSE;
23753 +
23754 + /* If we are changing our process priority we must adjust the number
23755 + of high priority owners for each znode that we already hold locked */
23756 + if (hipri) {
23757 + set_high_priority(owner);
23758 + } else {
23759 + set_low_priority(owner);
23760 + }
23761 +
23762 + level = znode_get_level(node);
23763 +
23764 + /* Fill request structure with our values. */
23765 + owner->request.mode = mode;
23766 + owner->request.handle = handle;
23767 + owner->request.node = node;
23768 +
23769 + txnh = get_current_context()->trans;
23770 + lock = &node->lock;
23771 +
23772 + if (mode == ZNODE_READ_LOCK && request == 0) {
23773 + ret = longterm_lock_tryfast(owner);
23774 + if (ret <= 0)
23775 + return ret;
23776 + }
23777 +
23778 + has_atom = (txnh->atom != NULL);
23779 +
23780 + /* Synchronize on node's zlock guard lock. */
23781 + spin_lock_zlock(lock);
23782 +
23783 + if (znode_is_locked(node) &&
23784 + mode == ZNODE_WRITE_LOCK && recursive(owner))
23785 + return lock_tail(owner, 0, mode);
23786 +
23787 + for (;;) {
23788 + /* Check the lock's availability: if it is unavailable we get
23789 + E_REPEAT, 0 indicates "can_lock", otherwise the node is
23790 + invalid. */
23791 + ret = can_lock_object(owner);
23792 +
23793 + if (unlikely(ret == -EINVAL)) {
23794 + /* @node is dying. Leave it alone. */
23795 + break;
23796 + }
23797 +
23798 + if (unlikely(ret == -E_REPEAT && non_blocking)) {
23799 + /* either locking of @node by the current thread will
23800 + * lead to the deadlock, or lock modes are
23801 + * incompatible. */
23802 + break;
23803 + }
23804 +
23805 + assert("nikita-1844", (ret == 0)
23806 + || ((ret == -E_REPEAT) && !non_blocking));
23807 + /* If we can get the lock... Try to capture first before
23808 + taking the lock. */
23809 +
23810 + /* first handle commonest case where node and txnh are already
23811 + * in the same atom. */
23812 + /* safe to do without taking locks, because:
23813 + *
23814 + * 1. read of aligned word is atomic with respect to writes to
23815 + * this word
23816 + *
23817 + * 2. false negatives are handled in reiser4_try_capture().
23818 + *
23819 + * 3. false positives are impossible.
23820 + *
23821 + * PROOF: left as an exercise to the curious reader.
23822 + *
23823 + * Just kidding. Here is one:
23824 + *
23825 + * At the time T0 txnh->atom is stored in txnh_atom.
23826 + *
23827 + * At the time T1 node->atom is stored in node_atom.
23828 + *
23829 + * At the time T2 we observe that
23830 + *
23831 + * txnh_atom != NULL && node_atom == txnh_atom.
23832 + *
23833 + * Imagine that at this moment we acquire node and txnh spin
23834 + * lock in this order. Suppose that under spin lock we have
23835 + *
23836 + * node->atom != txnh->atom, (S1)
23837 + *
23838 + * at the time T3.
23839 + *
23840 + * txnh->atom != NULL still, because txnh is open by the
23841 + * current thread.
23842 + *
23843 + * Suppose node->atom == NULL, that is, node was un-captured
23844 + * between T1, and T3. But un-capturing of formatted node is
23845 + * always preceded by the call to reiser4_invalidate_lock(),
23846 + * which marks znode as JNODE_IS_DYING under zlock spin
23847 + * lock. Contradiction, because can_lock_object() above checks
23848 + * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23849 + *
23850 + * Suppose that node->atom != node_atom, that is, atom, node
23851 + * belongs to was fused into another atom: node_atom was fused
23852 + * into node->atom. Atom of txnh was equal to node_atom at T2,
23853 + * which means that under spin lock, txnh->atom == node->atom,
23854 + * because txnh->atom can only follow fusion
23855 + * chain. Contradicts S1.
23856 + *
23857 + * The same for hypothesis txnh->atom != txnh_atom. Hence,
23858 + * node->atom == node_atom == txnh_atom == txnh->atom. Again
23859 + * contradicts S1. Hence S1 is false. QED.
23860 + *
23861 + */
23862 +
23863 + if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23864 + ;
23865 + } else {
23866 + /*
23867 + * unlock zlock spin lock here. It is possible for
23868 + * longterm_unlock_znode() to sneak in here, but there
23869 + * is no harm: reiser4_invalidate_lock() will mark znode
23870 + * as JNODE_IS_DYING and this will be noted by
23871 + * can_lock_object() below.
23872 + */
23873 + spin_unlock_zlock(lock);
23874 + spin_lock_znode(node);
23875 + ret = reiser4_try_capture(ZJNODE(node), mode,
23876 + cap_flags);
23877 + spin_unlock_znode(node);
23878 + spin_lock_zlock(lock);
23879 + if (unlikely(ret != 0)) {
23880 + /* In the failure case, the txnmgr releases
23881 + the znode's lock (or in some cases, it was
23882 + released a while ago). There's no need to
23883 + reacquire it, so we should return here and
23884 + avoid releasing the lock. */
23885 + owner->request.mode = 0;
23886 + break;
23887 + }
23888 +
23889 + /* Check the lock's availability again -- this is
23890 + because under some circumstances the capture code
23891 + has to release and reacquire the znode spinlock. */
23892 + ret = can_lock_object(owner);
23893 + }
23894 +
23895 + /* This time, a return of (ret == 0) means we can lock, so we
23896 + should break out of the loop. */
23897 + if (likely(ret != -E_REPEAT || non_blocking))
23898 + break;
23899 +
23900 + /* Lock is unavailable, we have to wait. */
23901 + ret = reiser4_prepare_to_sleep(owner);
23902 + if (unlikely(ret != 0))
23903 + break;
23904 +
23905 + assert_spin_locked(&(node->lock.guard));
23906 + if (hipri) {
23907 + /* If we are going in high priority direction then
23908 + increase high priority requests counter for the
23909 + node */
23910 + lock->nr_hipri_requests++;
23911 + if (mode == ZNODE_WRITE_LOCK)
23912 + lock->nr_hipri_write_requests++;
23913 + /* If there are no high priority owners for a node,
23914 + then immediately wake up low priority owners, so
23915 + they can detect possible deadlock */
23916 + if (lock->nr_hipri_owners == 0)
23917 + wake_up_all_lopri_owners(node);
23918 + }
23919 + list_add_tail(&owner->requestors_link, &lock->requestors);
23920 +
23921 + /* Ok, here we have prepared a lock request, so unlock
23922 + a znode ... */
23923 + spin_unlock_zlock(lock);
23924 + /* ... and sleep */
23925 + reiser4_go_to_sleep(owner);
23926 + if (owner->request.mode == ZNODE_NO_LOCK)
23927 + goto request_is_done;
23928 + spin_lock_zlock(lock);
23929 + if (owner->request.mode == ZNODE_NO_LOCK) {
23930 + spin_unlock_zlock(lock);
23931 +request_is_done:
23932 + if (owner->request.ret_code == 0) {
23933 + LOCK_CNT_INC(long_term_locked_znode);
23934 + zref(node);
23935 + }
23936 + return owner->request.ret_code;
23937 + }
23938 + remove_lock_request(owner);
23939 + }
23940 +
23941 + return lock_tail(owner, ret, mode);
23942 +}
23943 +
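+/* A typical call pattern for longterm_lock_znode(), sketched with error
+   paths simplified; it assumes the init_lh()/done_lh() lock handle helpers
+   declared in lock.h:
+
+	lock_handle lh;
+
+	init_lh(&lh);
+	ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
+				  ZNODE_LOCK_LOPRI);
+	if (ret == 0) {
+		... use the locked node ...
+	}
+	done_lh(&lh);
+
+   done_lh() releases the lock if it is still held, so the handle can be
+   torn down the same way on both success and failure paths. */
+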
23944 +/* lock object invalidation means changing the lock object state to `INVALID'
23945 + and waiting for all other processes to cancel their lock requests. */
23946 +void reiser4_invalidate_lock(lock_handle * handle /* path to lock
23947 + * owner and lock
23948 + * object is being
23949 + * invalidated. */ )
23950 +{
23951 + znode *node = handle->node;
23952 + lock_stack *owner = handle->owner;
23953 +
23954 + assert("zam-325", owner == get_current_lock_stack());
23955 + assert("zam-103", znode_is_write_locked(node));
23956 + assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23957 + assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23958 + assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23959 + assert("nikita-3097", znode_is_wlocked_once(node));
23960 + assert_spin_locked(&(node->lock.guard));
23961 +
23962 + if (handle->signaled)
23963 + atomic_dec(&owner->nr_signaled);
23964 +
23965 + ZF_SET(node, JNODE_IS_DYING);
23966 + unlink_object(handle);
23967 + node->lock.nr_readers = 0;
23968 +
23969 + invalidate_all_lock_requests(node);
23970 + spin_unlock_zlock(&node->lock);
23971 +}
23972 +
23973 +/* Initializes lock_stack. */
23974 +void init_lock_stack(lock_stack * owner /* pointer to
23975 + * allocated
23976 + * structure. */ )
23977 +{
23978 + INIT_LIST_HEAD(&owner->locks);
23979 + INIT_LIST_HEAD(&owner->requestors_link);
23980 + spin_lock_init(&owner->sguard);
23981 + owner->curpri = 1;
23982 + init_waitqueue_head(&owner->wait);
23983 +}
23984 +
23985 +/* Initializes lock object. */
23986 +void reiser4_init_lock(zlock * lock /* pointer to an allocated,
23987 + * uninitialized lock object
23988 + * structure. */ )
23989 +{
23990 + memset(lock, 0, sizeof(zlock));
23991 + spin_lock_init(&lock->guard);
23992 + INIT_LIST_HEAD(&lock->requestors);
23993 + INIT_LIST_HEAD(&lock->owners);
23994 +}
23995 +
23996 +/* Transfer a lock handle (presumably so that variables can be moved between
23997 + stack and heap locations). */
23998 +static void
23999 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
24000 +{
24001 + znode *node = old->node;
24002 + lock_stack *owner = old->owner;
24003 + int signaled;
24004 +
24005 + /* locks_list, modified by link_object(), is not protected by
24006 + anything. This is valid because only the current thread ever
24007 + modifies the locks_list of its own lock_stack.
24008 + */
24009 + assert("nikita-1827", owner == get_current_lock_stack());
24010 + assert("nikita-1831", new->owner == NULL);
24011 +
24012 + spin_lock_zlock(&node->lock);
24013 +
24014 + signaled = old->signaled;
24015 + if (unlink_old) {
24016 + unlink_object(old);
24017 + } else {
24018 + if (node->lock.nr_readers > 0) {
24019 + node->lock.nr_readers += 1;
24020 + } else {
24021 + node->lock.nr_readers -= 1;
24022 + }
24023 + if (signaled)
24024 + atomic_inc(&owner->nr_signaled);
24025 + if (owner->curpri)
24026 + node->lock.nr_hipri_owners += 1;
24027 + LOCK_CNT_INC(long_term_locked_znode);
24028 +
24029 + zref(node);
24030 + }
24031 + link_object(new, owner, node);
24032 + new->signaled = signaled;
24033 +
24034 + spin_unlock_zlock(&node->lock);
24035 +}
24036 +
24037 +void move_lh(lock_handle * new, lock_handle * old)
24038 +{
24039 + move_lh_internal(new, old, /*unlink_old */ 1);
24040 +}
24041 +
24042 +void copy_lh(lock_handle * new, lock_handle * old)
24043 +{
24044 + move_lh_internal(new, old, /*unlink_old */ 0);
24045 +}
24046 +
24047 +/* after getting -E_DEADLOCK we unlock znodes until this function returns false
24048 + */
24049 +int reiser4_check_deadlock(void)
24050 +{
24051 + lock_stack *owner = get_current_lock_stack();
24052 + return atomic_read(&owner->nr_signaled) != 0;
24053 +}
24054 +
24055 +/* Before going to sleep we re-check "release lock" requests which might come
24056 + from threads with hi-pri lock priorities. */
24057 +int reiser4_prepare_to_sleep(lock_stack * owner)
24058 +{
24059 + assert("nikita-1847", owner == get_current_lock_stack());
24060 +
24061 + /* We return -E_DEADLOCK if one or more "give me the lock" messages are
24062 + * counted in nr_signaled */
24063 + if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
24064 + assert("zam-959", !owner->curpri);
24065 + return RETERR(-E_DEADLOCK);
24066 + }
24067 + return 0;
24068 +}
24069 +
24070 +/* Wakes up a single thread */
24071 +void __reiser4_wake_up(lock_stack * owner)
24072 +{
24073 + atomic_set(&owner->wakeup, 1);
24074 + wake_up(&owner->wait);
24075 +}
24076 +
24077 +/* Puts a thread to sleep */
24078 +void reiser4_go_to_sleep(lock_stack * owner)
24079 +{
24080 + /* Well, we might sleep here, so holding of any spinlocks is no-no */
24081 + assert("nikita-3027", reiser4_schedulable());
24082 +
24083 + wait_event(owner->wait, atomic_read(&owner->wakeup));
24084 + atomic_set(&owner->wakeup, 0);
24085 +}
24086 +
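+/* By example, the ->wakeup flag makes this protocol race-free: if
+   __reiser4_wake_up() runs just before reiser4_go_to_sleep() reaches
+   wait_event(), the flag is already 1 and the sleeper returns immediately
+   instead of missing the wakeup. */
+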
24087 +int lock_stack_isclean(lock_stack * owner)
24088 +{
24089 + if (list_empty_careful(&owner->locks)) {
24090 + assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
24091 + return 1;
24092 + }
24093 +
24094 + return 0;
24095 +}
24096 +
24097 +#if REISER4_DEBUG
24098 +
24099 +/*
24100 + * debugging functions
24101 + */
24102 +
24103 +static void list_check(struct list_head *head)
24104 +{
24105 + struct list_head *pos;
24106 +
24107 + list_for_each(pos, head)
24108 + assert("", (pos->prev != NULL && pos->next != NULL &&
24109 + pos->prev->next == pos && pos->next->prev == pos));
24110 +}
24111 +
24112 +/* check consistency of locking data-structures hanging of the @stack */
24113 +static void check_lock_stack(lock_stack * stack)
24114 +{
24115 + spin_lock_stack(stack);
24116 + /* check that stack->locks is not corrupted */
24117 + list_check(&stack->locks);
24118 + spin_unlock_stack(stack);
24119 +}
24120 +
24121 +/* check consistency of locking data structures */
24122 +void check_lock_data(void)
24123 +{
24124 + check_lock_stack(&get_current_context()->stack);
24125 +}
24126 +
24127 +/* check consistency of locking data structures for @node */
24128 +void check_lock_node_data(znode * node)
24129 +{
24130 + spin_lock_zlock(&node->lock);
24131 + list_check(&node->lock.owners);
24132 + list_check(&node->lock.requestors);
24133 + spin_unlock_zlock(&node->lock);
24134 +}
24135 +
24136 +/* check that the given lock request is deadlock safe. This check is, of
24137 + * course, not exhaustive. */
24138 +static int
24139 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
24140 + znode_lock_request request)
24141 +{
24142 + lock_stack *owner;
24143 +
24144 + owner = get_current_lock_stack();
24145 + /*
24146 + * check that hipri lock request is not issued when there are locked
24147 + * nodes at the higher levels.
24148 + */
24149 + if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
24150 + znode_get_level(node) != 0) {
24151 + lock_handle *item;
24152 +
24153 + list_for_each_entry(item, &owner->locks, locks_link) {
24154 + znode *other;
24155 +
24156 + other = item->node;
24157 +
24158 + if (znode_get_level(other) == 0)
24159 + continue;
24160 + if (znode_get_level(other) > znode_get_level(node))
24161 + return 0;
24162 + }
24163 + }
24164 + return 1;
24165 +}
24166 +
24167 +#endif
24168 +
24169 +/* return a pointer to static storage with the name of the lock mode. For
24170 +   debugging */
24171 +const char *lock_mode_name(znode_lock_mode lock/* lock mode to get name of */)
24172 +{
24173 + if (lock == ZNODE_READ_LOCK)
24174 + return "read";
24175 + else if (lock == ZNODE_WRITE_LOCK)
24176 + return "write";
24177 + else {
24178 + static char buf[30];
24179 +
24180 + sprintf(buf, "unknown: %i", lock);
24181 + return buf;
24182 + }
24183 +}
24184 +
24185 +/* Make Linus happy.
24186 + Local variables:
24187 + c-indentation-style: "K&R"
24188 + mode-name: "LC"
24189 + c-basic-offset: 8
24190 + tab-width: 8
24191 + fill-column: 79
24192 + End:
24193 +*/
24194 diff -urN linux-2.6.33.orig/fs/reiser4/lock.h linux-2.6.33/fs/reiser4/lock.h
24195 --- linux-2.6.33.orig/fs/reiser4/lock.h 1970-01-01 01:00:00.000000000 +0100
24196 +++ linux-2.6.33/fs/reiser4/lock.h 2010-03-04 19:33:22.000000000 +0100
24197 @@ -0,0 +1,250 @@
24198 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24199 + * reiser4/README */
24200 +
24201 +/* Long term locking data structures. See lock.c for details. */
24202 +
24203 +#ifndef __LOCK_H__
24204 +#define __LOCK_H__
24205 +
24206 +#include "forward.h"
24207 +#include "debug.h"
24208 +#include "dformat.h"
24209 +#include "key.h"
24210 +#include "coord.h"
24211 +#include "plugin/node/node.h"
24212 +#include "txnmgr.h"
24213 +#include "readahead.h"
24214 +
24215 +#include <linux/types.h>
24216 +#include <linux/spinlock.h>
24217 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
24218 +#include <asm/atomic.h>
24219 +#include <linux/wait.h>
24220 +
24221 +/* Per-znode lock object */
24222 +struct zlock {
24223 + spinlock_t guard;
24224 + /* The number of readers if positive; the number of recursively taken
24225 + write locks if negative. Protected by zlock spin lock. */
24226 + int nr_readers;
24227 +	/* The number of processes (lock_stacks) that have this object
24228 +	   locked with high priority */
24229 +	unsigned nr_hipri_owners;
24230 +	/* The number of attempts to lock the znode in the high-priority direction */
24231 +	unsigned nr_hipri_requests;
24232 +	/* The number of write-lock requests in the high-priority direction */
24233 +	unsigned nr_hipri_write_requests;
24234 +	/* A list of lock_handles of all lock_stacks holding this lock */
24235 +	struct list_head owners;
24236 + /* A linked list of lock_stacks that wait for this lock */
24237 + struct list_head requestors;
24238 +};
24239 +
24240 +static inline void spin_lock_zlock(zlock *lock)
24241 +{
24242 + /* check that zlock is not locked */
24243 + assert("", LOCK_CNT_NIL(spin_locked_zlock));
24244 + /* check that spinlocks of lower priorities are not held */
24245 + assert("", LOCK_CNT_NIL(spin_locked_stack));
24246 +
24247 + spin_lock(&lock->guard);
24248 +
24249 + LOCK_CNT_INC(spin_locked_zlock);
24250 + LOCK_CNT_INC(spin_locked);
24251 +}
24252 +
24253 +static inline void spin_unlock_zlock(zlock *lock)
24254 +{
24255 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
24256 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24257 +
24258 + LOCK_CNT_DEC(spin_locked_zlock);
24259 + LOCK_CNT_DEC(spin_locked);
24260 +
24261 + spin_unlock(&lock->guard);
24262 +}
24263 +
24264 +#define lock_is_locked(lock) ((lock)->nr_readers != 0)
24265 +#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
24266 +#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
24267 +#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
24268 +#define lock_can_be_rlocked(lock) ((lock)->nr_readers >= 0)
24269 +#define lock_mode_compatible(lock, mode) \
24270 + (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
24271 + ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
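
To make the nr_readers encoding concrete, a hypothetical helper (not part of the patch) naming the three states the macros above distinguish:

	/* Sketch: nr_readers > 0 means readers, < 0 means recursive writers. */
	static const char *zlock_state_sketch(const zlock *lock)
	{
		if (lock_is_rlocked(lock))
			return "read locked";	/* nr_readers > 0 */
		if (lock_is_wlocked(lock))
			return "write locked";	/* nr_readers < 0 */
		return "unlocked";		/* either mode is compatible */
	}
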
24272 +
24273 +/* Since we have R/W znode locks we need additional bidirectional `link'
24274 + objects to implement n<->m relationship between lock owners and lock
24275 + objects. We call them `lock handles'.
24276 +
24277 + Locking: see lock.c/"SHORT-TERM LOCKING"
24278 +*/
24279 +struct lock_handle {
24280 +	/* This flag indicates that a signal to yield a lock was passed to
24281 +	   the lock owner and counted in owner->nr_signaled
24282 +
24283 + Locking: this is accessed under spin lock on ->node.
24284 + */
24285 + int signaled;
24286 + /* A link to owner of a lock */
24287 + lock_stack *owner;
24288 + /* A link to znode locked */
24289 + znode *node;
24290 + /* A list of all locks for a process */
24291 + struct list_head locks_link;
24292 + /* A list of all owners for a znode */
24293 + struct list_head owners_link;
24294 +};
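
Since every lock_handle is linked into both a lock_stack's ->locks list and a znode's ->owners list, either side of the n<->m relation can be walked. A sketch (assuming the zlock spinlock is held for the owners walk, per the locking rules in lock.c):

	/* Sketch: walking both sides of the lock_handle linkage. */
	static void linkage_walk_sketch(lock_stack *stack, znode *node)
	{
		lock_handle *lh;

		/* all znodes locked by @stack (current thread only) */
		list_for_each_entry(lh, &stack->locks, locks_link)
			assert("", lh->owner == stack);

		/* all stacks holding @node; caller took spin_lock_zlock() */
		list_for_each_entry(lh, &node->lock.owners, owners_link)
			assert("", lh->node == node);
	}
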
24295 +
24296 +struct lock_request {
24297 + /* A pointer to uninitialized link object */
24298 + lock_handle *handle;
24299 + /* A pointer to the object we want to lock */
24300 + znode *node;
24301 + /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
24302 + znode_lock_mode mode;
24303 +	/* field via which dispatch_lock_requests() returns the result code */
24304 + int ret_code;
24305 +};
24306 +
24307 +/* A lock stack structure for accumulating locks owned by a process */
24308 +struct lock_stack {
24309 + /* A guard lock protecting a lock stack */
24310 + spinlock_t sguard;
24311 + /* number of znodes which were requested by high priority processes */
24312 + atomic_t nr_signaled;
24313 + /* Current priority of a process
24314 +
24315 + This is only accessed by the current thread and thus requires no
24316 + locking.
24317 + */
24318 + int curpri;
24319 + /* A list of all locks owned by this process. Elements can be added to
24320 + * this list only by the current thread. ->node pointers in this list
24321 + * can be only changed by the current thread. */
24322 + struct list_head locks;
24323 + /* When lock_stack waits for the lock, it puts itself on double-linked
24324 + requestors list of that lock */
24325 + struct list_head requestors_link;
24326 + /* Current lock request info.
24327 +
24328 + This is only accessed by the current thread and thus requires no
24329 + locking.
24330 + */
24331 + struct lock_request request;
24332 + /* the following two fields are the lock stack's
24333 + * synchronization object to use with the standard linux/wait.h
24334 + * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
24335 + * usage details. */
24336 + wait_queue_head_t wait;
24337 + atomic_t wakeup;
24338 +#if REISER4_DEBUG
24339 + int nr_locks; /* number of lock handles in the above list */
24340 +#endif
24341 +};
24342 +
24343 +/*
24344 + User-visible znode locking functions
24345 +*/
24346 +
24347 +extern int longterm_lock_znode(lock_handle * handle,
24348 + znode * node,
24349 + znode_lock_mode mode,
24350 + znode_lock_request request);
24351 +
24352 +extern void longterm_unlock_znode(lock_handle * handle);
24353 +
24354 +extern int reiser4_check_deadlock(void);
24355 +
24356 +extern lock_stack *get_current_lock_stack(void);
24357 +
24358 +extern void init_lock_stack(lock_stack * owner);
24359 +extern void reiser4_init_lock(zlock * lock);
24360 +
24361 +static inline void init_lh(lock_handle *lh)
24362 +{
24363 +#if REISER4_DEBUG
24364 + memset(lh, 0, sizeof *lh);
24365 + INIT_LIST_HEAD(&lh->locks_link);
24366 + INIT_LIST_HEAD(&lh->owners_link);
24367 +#else
24368 + lh->node = NULL;
24369 +#endif
24370 +}
24371 +
24372 +static inline void done_lh(lock_handle *lh)
24373 +{
24374 + assert("zam-342", lh != NULL);
24375 + if (lh->node != NULL)
24376 + longterm_unlock_znode(lh);
24377 +}
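
Typical handle life cycle, as a sketch (error handling trimmed; ZNODE_LOCK_LOPRI is assumed here to be the ordinary low-priority counterpart of ZNODE_LOCK_HIPRI in the znode_lock_request enum):

	/* Sketch: the canonical init_lh()/lock/done_lh() pattern. */
	static int read_node_sketch(znode *node)
	{
		lock_handle lh;
		int ret;

		init_lh(&lh);
		ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
					  ZNODE_LOCK_LOPRI);
		if (ret == 0) {
			/* ... inspect @node under the long-term lock ... */
			done_lh(&lh);	/* calls longterm_unlock_znode() */
		}
		return ret;
	}
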
24378 +
24379 +extern void move_lh(lock_handle * new, lock_handle * old);
24380 +extern void copy_lh(lock_handle * new, lock_handle * old);
24381 +
24382 +extern int reiser4_prepare_to_sleep(lock_stack * owner);
24383 +extern void reiser4_go_to_sleep(lock_stack * owner);
24384 +extern void __reiser4_wake_up(lock_stack * owner);
24385 +
24386 +extern int lock_stack_isclean(lock_stack * owner);
24387 +
24388 +/* zlock object state check macros: only used in assertions. Both forms imply
24389 + that the lock is held by the current thread. */
24390 +extern int znode_is_write_locked(const znode *);
24391 +extern void reiser4_invalidate_lock(lock_handle *);
24392 +
24393 +/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24394 +#define spin_ordering_pred_stack(stack) \
24395 + (LOCK_CNT_NIL(spin_locked_stack) && \
24396 + LOCK_CNT_NIL(spin_locked_txnmgr) && \
24397 + LOCK_CNT_NIL(spin_locked_inode) && \
24398 + LOCK_CNT_NIL(rw_locked_cbk_cache) && \
24399 + LOCK_CNT_NIL(spin_locked_super_eflush))
24400 +
24401 +static inline void spin_lock_stack(lock_stack *stack)
24402 +{
24403 + assert("", spin_ordering_pred_stack(stack));
24404 + spin_lock(&(stack->sguard));
24405 + LOCK_CNT_INC(spin_locked_stack);
24406 + LOCK_CNT_INC(spin_locked);
24407 +}
24408 +
24409 +static inline void spin_unlock_stack(lock_stack *stack)
24410 +{
24411 + assert_spin_locked(&(stack->sguard));
24412 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24413 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24414 + LOCK_CNT_DEC(spin_locked_stack);
24415 + LOCK_CNT_DEC(spin_locked);
24416 + spin_unlock(&(stack->sguard));
24417 +}
24418 +
24419 +static inline void reiser4_wake_up(lock_stack * owner)
24420 +{
24421 + spin_lock_stack(owner);
24422 + __reiser4_wake_up(owner);
24423 + spin_unlock_stack(owner);
24424 +}
24425 +
24426 +const char *lock_mode_name(znode_lock_mode lock);
24427 +
24428 +#if REISER4_DEBUG
24429 +extern void check_lock_data(void);
24430 +extern void check_lock_node_data(znode * node);
24431 +#else
24432 +#define check_lock_data() noop
24433 +#define check_lock_node_data(node) noop
24434 +#endif
24435 +
24436 +/* __LOCK_H__ */
24437 +#endif
24438 +
24439 +/* Make Linus happy.
24440 + Local variables:
24441 + c-indentation-style: "K&R"
24442 + mode-name: "LC"
24443 + c-basic-offset: 8
24444 + tab-width: 8
24445 + fill-column: 120
24446 + End:
24447 +*/
24448 diff -urN linux-2.6.33.orig/fs/reiser4/Makefile linux-2.6.33/fs/reiser4/Makefile
24449 --- linux-2.6.33.orig/fs/reiser4/Makefile 1970-01-01 01:00:00.000000000 +0100
24450 +++ linux-2.6.33/fs/reiser4/Makefile 2010-03-04 19:33:22.000000000 +0100
24451 @@ -0,0 +1,98 @@
24452 +#
24453 +# reiser4/Makefile
24454 +#
24455 +
24456 +obj-$(CONFIG_REISER4_FS) += reiser4.o
24457 +
24458 +reiser4-y := \
24459 + debug.o \
24460 + jnode.o \
24461 + znode.o \
24462 + key.o \
24463 + pool.o \
24464 + tree_mod.o \
24465 + estimate.o \
24466 + carry.o \
24467 + carry_ops.o \
24468 + lock.o \
24469 + tree.o \
24470 + context.o \
24471 + tap.o \
24472 + coord.o \
24473 + block_alloc.o \
24474 + txnmgr.o \
24475 + kassign.o \
24476 + flush.o \
24477 + wander.o \
24478 + eottl.o \
24479 + search.o \
24480 + page_cache.o \
24481 + seal.o \
24482 + dscale.o \
24483 + flush_queue.o \
24484 + ktxnmgrd.o \
24485 + blocknrset.o \
24486 + super.o \
24487 + super_ops.o \
24488 + fsdata.o \
24489 + export_ops.o \
24490 + oid.o \
24491 + tree_walk.o \
24492 + inode.o \
24493 + vfs_ops.o \
24494 + as_ops.o \
24495 +	entd.o \
24496 + readahead.o \
24497 + status_flags.o \
24498 + init_super.o \
24499 + safe_link.o \
24500 + \
24501 + plugin/plugin.o \
24502 + plugin/plugin_set.o \
24503 + plugin/node/node.o \
24504 + plugin/object.o \
24505 + plugin/cluster.o \
24506 + plugin/inode_ops.o \
24507 + plugin/inode_ops_rename.o \
24508 + plugin/file_ops.o \
24509 + plugin/file_ops_readdir.o \
24510 + plugin/file_plugin_common.o \
24511 + plugin/file/file.o \
24512 + plugin/file/tail_conversion.o \
24513 + plugin/file/file_conversion.o \
24514 + plugin/file/symlink.o \
24515 + plugin/file/cryptcompress.o \
24516 + plugin/dir_plugin_common.o \
24517 + plugin/dir/hashed_dir.o \
24518 + plugin/dir/seekable_dir.o \
24519 + plugin/node/node40.o \
24520 + \
24521 + plugin/crypto/cipher.o \
24522 + plugin/crypto/digest.o \
24523 + \
24524 + plugin/compress/compress.o \
24525 + plugin/compress/compress_mode.o \
24526 + \
24527 + plugin/item/static_stat.o \
24528 + plugin/item/sde.o \
24529 + plugin/item/cde.o \
24530 + plugin/item/blackbox.o \
24531 + plugin/item/internal.o \
24532 + plugin/item/tail.o \
24533 + plugin/item/ctail.o \
24534 + plugin/item/extent.o \
24535 + plugin/item/extent_item_ops.o \
24536 + plugin/item/extent_file_ops.o \
24537 + plugin/item/extent_flush_ops.o \
24538 + \
24539 + plugin/hash.o \
24540 + plugin/fibration.o \
24541 + plugin/tail_policy.o \
24542 + plugin/item/item.o \
24543 + \
24544 + plugin/security/perm.o \
24545 + plugin/space/bitmap.o \
24546 + \
24547 + plugin/disk_format/disk_format40.o \
24548 + plugin/disk_format/disk_format.o
24549 +
24550 diff -urN linux-2.6.33.orig/fs/reiser4/oid.c linux-2.6.33/fs/reiser4/oid.c
24551 --- linux-2.6.33.orig/fs/reiser4/oid.c 1970-01-01 01:00:00.000000000 +0100
24552 +++ linux-2.6.33/fs/reiser4/oid.c 2010-03-04 19:33:22.000000000 +0100
24553 @@ -0,0 +1,141 @@
24554 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24555 +
24556 +#include "debug.h"
24557 +#include "super.h"
24558 +#include "txnmgr.h"
24559 +
24560 +/* We used to have an oid allocation plugin. It was removed because it
24561 +   was recognized as providing an unneeded level of abstraction. If anyone
24562 +   ever finds it useful, look at yet_unneeded_abstractions/oid
24563 +*/
24564 +
24565 +/*
24566 + * initialize in-memory data for oid allocator at @super. @nr_files and @next
24567 + * are provided by disk format plugin that reads them from the disk during
24568 + * mount.
24569 + */
24570 +int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24571 +{
24572 + reiser4_super_info_data *sbinfo;
24573 +
24574 + sbinfo = get_super_private(super);
24575 +
24576 + sbinfo->next_to_use = next;
24577 + sbinfo->oids_in_use = nr_files;
24578 + return 0;
24579 +}
24580 +
24581 +/*
24582 + * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24583 + * runs out of oids.
24584 + */
24585 +oid_t oid_allocate(struct super_block *super)
24586 +{
24587 + reiser4_super_info_data *sbinfo;
24588 + oid_t oid;
24589 +
24590 + sbinfo = get_super_private(super);
24591 +
24592 + spin_lock_reiser4_super(sbinfo);
24593 + if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24594 + oid = sbinfo->next_to_use++;
24595 + sbinfo->oids_in_use++;
24596 + } else
24597 + oid = ABSOLUTE_MAX_OID;
24598 + spin_unlock_reiser4_super(sbinfo);
24599 + return oid;
24600 +}
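
Callers are expected to treat ABSOLUTE_MAX_OID as exhaustion. A sketch of a consumer (hypothetical; the error code is an assumption, and oid_count_allocated() must only be called once creation can no longer be rolled back):

	/* Sketch: allocating an oid and detecting exhaustion. */
	static int new_oid_sketch(struct super_block *super, oid_t *result)
	{
		oid_t oid = oid_allocate(super);

		if (oid == ABSOLUTE_MAX_OID)
			return RETERR(-EOVERFLOW);	/* code assumed */
		*result = oid;
		oid_count_allocated();	/* once commit is irrevocable */
		return 0;
	}
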
24601 +
24602 +/*
24603 + * Tell oid allocator that @oid is now free.
24604 + */
24605 +int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24606 +{
24607 + reiser4_super_info_data *sbinfo;
24608 +
24609 + sbinfo = get_super_private(super);
24610 +
24611 + spin_lock_reiser4_super(sbinfo);
24612 + sbinfo->oids_in_use--;
24613 + spin_unlock_reiser4_super(sbinfo);
24614 + return 0;
24615 +}
24616 +
24617 +/*
24618 + * return next @oid that would be allocated (i.e., returned by oid_allocate())
24619 + * without actually allocating it. This is used by disk format plugin to save
24620 + * oid allocator state on the disk.
24621 + */
24622 +oid_t oid_next(const struct super_block *super)
24623 +{
24624 + reiser4_super_info_data *sbinfo;
24625 + oid_t oid;
24626 +
24627 + sbinfo = get_super_private(super);
24628 +
24629 + spin_lock_reiser4_super(sbinfo);
24630 + oid = sbinfo->next_to_use;
24631 + spin_unlock_reiser4_super(sbinfo);
24632 + return oid;
24633 +}
24634 +
24635 +/*
24636 + * returns number of currently used oids. This is used by statfs(2) to report
24637 + * number of "inodes" and by disk format plugin to save oid allocator state on
24638 + * the disk.
24639 + */
24640 +long oids_used(const struct super_block *super)
24641 +{
24642 + reiser4_super_info_data *sbinfo;
24643 + oid_t used;
24644 +
24645 + sbinfo = get_super_private(super);
24646 +
24647 + spin_lock_reiser4_super(sbinfo);
24648 + used = sbinfo->oids_in_use;
24649 + spin_unlock_reiser4_super(sbinfo);
24650 + if (used < (__u64) ((long)~0) >> 1)
24651 + return (long)used;
24652 + else
24653 + return (long)-1;
24654 +}
24655 +
24656 +/*
24657 + * Count the oid as allocated in the atom. This is done after the call to
24658 + * oid_allocate() at the point when we are irrevocably committed to creation
24659 + * of the new file (i.e., when the oid allocation can no longer be rolled
24660 + * back due to some error).
24661 + */
24662 +void oid_count_allocated(void)
24663 +{
24664 + txn_atom *atom;
24665 +
24666 + atom = get_current_atom_locked();
24667 + atom->nr_objects_created++;
24668 + spin_unlock_atom(atom);
24669 +}
24670 +
24671 + * Count the oid as free in the atom. This is done after the call to
24672 + * oid_release() at the point when we are irrevocably committed to deletion
24673 + * of the file (i.e., when the oid release can no longer be rolled back).
24674 + * when oid release cannot be any longer rolled back due to some error).
24675 + */
24676 +void oid_count_released(void)
24677 +{
24678 + txn_atom *atom;
24679 +
24680 + atom = get_current_atom_locked();
24681 + atom->nr_objects_deleted++;
24682 + spin_unlock_atom(atom);
24683 +}
24684 +
24685 +/*
24686 + Local variables:
24687 + c-indentation-style: "K&R"
24688 + mode-name: "LC"
24689 + c-basic-offset: 8
24690 + tab-width: 8
24691 + fill-column: 120
24692 + scroll-step: 1
24693 + End:
24694 +*/
24695 diff -urN linux-2.6.33.orig/fs/reiser4/page_cache.c linux-2.6.33/fs/reiser4/page_cache.c
24696 --- linux-2.6.33.orig/fs/reiser4/page_cache.c 1970-01-01 01:00:00.000000000 +0100
24697 +++ linux-2.6.33/fs/reiser4/page_cache.c 2010-03-04 19:33:22.000000000 +0100
24698 @@ -0,0 +1,693 @@
24699 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24700 + * reiser4/README */
24701 +
24702 +/* Memory pressure hooks. Fake inodes handling. */
24703 +
24704 +/* GLOSSARY
24705 +
24706 + . Formatted and unformatted nodes.
24707 + Elements of reiser4 balanced tree to store data and metadata.
24708 + Unformatted nodes are pointed to by extent pointers. Such nodes
24709 + are used to store data of large objects. Unlike unformatted nodes,
24710 + formatted ones have associated format described by node4X plugin.
24711 +
24712 + . Jnode (or journal node)
24713 + The in-memory header which is used to track formatted and unformatted
24714 + nodes, bitmap nodes, etc. In particular, jnodes are used to track
24715 +   transactional information associated with each block (see reiser4/jnode.c
24716 + for details).
24717 +
24718 + . Znode
24719 + The in-memory header which is used to track formatted nodes. Contains
24720 + embedded jnode (see reiser4/znode.c for details).
24721 +*/
24722 +
24723 +/* We store all file system meta data (and data, of course) in the page cache.
24724 +
24725 +   What does this mean? Instead of using bread/brelse we create a special
24726 +   "fake" inode (one per super block) and store the content of formatted
24727 +   nodes in pages bound to this inode in the page cache. In newer kernels
24728 +   bread() already uses the inode attached to the block device (bd_inode).
24729 +   The advantage of having our own fake inode is that we can install
24730 +   appropriate methods in its address_space operations. Such methods are
24731 +   called by the VM on memory pressure (or during background page flushing)
24732 +   and we can use them to react appropriately.
24733 +
24734 + In initial version we only support one block per page. Support for multiple
24735 + blocks per page is complicated by relocation.
24736 +
24737 + To each page, used by reiser4, jnode is attached. jnode is analogous to
24738 + buffer head. Difference is that jnode is bound to the page permanently:
24739 + jnode cannot be removed from memory until its backing page is.
24740 +
24741 +   The jnode contains a pointer to the page (->pg field) and the page holds
24742 +   a pointer to the jnode in its ->private field. The jnode-to-page pointer
24743 +   is protected by the jnode's spinlock; the page-to-jnode pointer by the
24744 +   page lock (PG_locked bit). Lock ordering: first take the page lock, then
24745 +   the jnode spin lock. To go in the reverse direction use jnode_lock_page(),
24746 +   which uses the standard try-lock-and-release device.
24747 +
24748 + Properties:
24749 +
24750 + 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24751 + reference counter is increased.
24752 +
24753 +   2. when jnode-to-page mapping is destroyed (by page_clear_jnode()), the
24754 +   page reference counter is decreased.
24755 +
24756 + 3. on jload() reference counter on jnode page is increased, page is
24757 + kmapped and `referenced'.
24758 +
24759 + 4. on jrelse() inverse operations are performed.
24760 +
24761 + 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24762 +
24763 + DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24764 + historically.]
24765 +
24766 + [In the following discussion, `lock' invariably means long term lock on
24767 + znode.] (What about page locks?)
24768 +
24769 +   There is a special class of deadlock possibilities related to memory
24770 +   pressure. Locks acquired by other reiser4 threads are accounted for in
24771 +   the deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24772 +   invoked an additional hidden arc is added to the locking graph: a thread
24773 +   that tries to allocate memory waits for ->vm_writeback() to finish. If
24774 +   this thread holds a lock and ->vm_writeback() tries to acquire that lock,
24775 +   deadlock prevention is useless.
24776 +
24777 +   Another related problem is the possibility for ->vm_writeback() to run
24778 +   out of memory itself. This is not a problem for ext2 and friends, because
24779 +   their ->vm_writeback() doesn't allocate much memory, but reiser4 flush is
24780 +   definitely able to allocate huge amounts of memory.
24781 +
24782 +   It seems that there is no reliable way to cope with the problems above.
24783 +   Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24784 +   context) wouldn't perform any flushing itself, but rather should just
24785 +   wake up some auxiliary thread dedicated to this purpose (or the same
24786 +   thread that does periodic commit of old atoms (ktxnmgrd.c)).
24787 +
24788 + Details:
24789 +
24790 +   1. A page is called `reclaimable' against a particular reiser4 mount F if
24791 +   this page can be ultimately released by try_to_free_pages() under the
24792 +   presumptions that:
24793 +
24794 + a. ->vm_writeback() for F is no-op, and
24795 +
24796 + b. none of the threads accessing F are making any progress, and
24797 +
24798 + c. other reiser4 mounts obey the same memory reservation protocol as F
24799 + (described below).
24800 +
24801 +   For example, a clean un-pinned page, or a page occupied by ext2 data, is
24802 +   reclaimable against any reiser4 mount.
24803 +
24804 +   When there is more than one reiser4 mount in a system, condition (c) makes
24805 +   reclaimability not easily verifiable beyond the trivial cases noted above.
24806 +
24807 + THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24808 +
24809 +   The fake inode is used to bind formatted nodes, and each node is indexed
24810 +   within the fake inode by its block number. If the block size is smaller
24811 +   than the page size, it may so happen that a block mapped to the page with
24812 +   a formatted node is occupied by an unformatted node or is unallocated.
24813 +   This leads to some complications, because flushing the whole page can
24814 +   incorrectly overwrite an unformatted node which, moreover, can be cached
24815 +   in some other place as part of the file body. To avoid this, buffers for
24816 +   unformatted nodes are never marked dirty. Also, pages in the fake inode
24817 +   are never marked dirty. This rules out the use of ->writepage() as a
24818 +   memory pressure hook. Instead, ->releasepage() is used.
24819 +
24820 +   Josh is concerned that page->buffer is going to die. This should not pose
24821 +   a significant problem though, because we need to add some data structures
24822 +   to the page anyway (jnode) and all necessary bookkeeping can be put there.
24823 +
24824 +*/
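
The page/jnode lock ordering described above looks like this in code (a sketch; jprivate() is from page_cache.h, spin_lock_jnode() is presumed to come from jnode.h, and the reverse direction must use jnode_lock_page() instead):

	/* Sketch: correct page-lock-then-jnode-spinlock ordering. */
	static void inspect_binding_sketch(struct page *page)
	{
		jnode *node;

		lock_page(page);		/* protects page -> jnode */
		node = jprivate(page);
		if (node != NULL) {
			spin_lock_jnode(node);	/* protects jnode ->pg */
			/* consistent view of the page <-> jnode binding */
			spin_unlock_jnode(node);
		}
		unlock_page(page);
	}
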
24825 +
24826 +/* Life cycle of pages/nodes.
24827 +
24828 + jnode contains reference to page and page contains reference back to
24829 + jnode. This reference is counted in page ->count. Thus, page bound to jnode
24830 + cannot be released back into free pool.
24831 +
24832 + 1. Formatted nodes.
24833 +
24834 +   1. a formatted node is represented by a znode. When a new znode is
24835 +   created, its ->pg pointer is initially NULL.
24836 +
24837 +   2. when node content is loaded into the znode (by a call to zload()) for
24838 +   the first time, the following happens (in a call to ->read_node() or
24839 +   ->allocate_node()):
24840 +
24841 + 1. new page is added to the page cache.
24842 +
24843 + 2. this page is attached to znode and its ->count is increased.
24844 +
24845 + 3. page is kmapped.
24846 +
24847 +   3. if more calls to zload() follow (without corresponding zrelses), the
24848 +   page counter is left intact and instead ->d_count is increased in the znode.
24849 +
24850 +   4. each call to zrelse decreases ->d_count. When ->d_count drops to zero,
24851 +   ->release_node() is called and the page is kunmapped as a result.
24852 +
24853 +   5. at some moment the node can be captured by a transaction. Its
24854 +   ->x_count is then increased by the transaction manager.
24855 +
24856 +   6. if the node is removed from the tree (empty node with the
24857 +   JNODE_HEARD_BANSHEE bit set), the following happens (see also the
24858 +   comment at the top of znode.c):
24859 +
24860 +   1. when the last lock is released, the node is uncaptured from the
24861 +   transaction; this releases the reference the transaction manager acquired at step 5.
24862 +
24863 +   2. when the last reference is released, zput() detects that the node
24864 +   is actually deleted and calls the ->delete_node() operation. The
24865 +   page_cache_delete_node() implementation detaches the jnode from the
24866 +   page and releases the page.
24867 +
24868 +   7. otherwise (the node wasn't removed from the tree), the last reference
24869 +   to the znode will be released after the transaction manager has committed
24870 +   the transaction the node was in. This implies squallocing of this node
24871 +   (see flush.c). Nothing special happens at this point. The znode is still
24872 +   in the hash table and the page is still attached to it.
24873 +
24874 +   8. the znode is actually removed from memory because of memory pressure,
24875 +   or during umount (znodes_tree_done()). Either way, the znode is removed
24876 +   by the call to zdrop(). At this moment, the page is detached from the
24877 +   znode and removed from the inode address space.
24878 +
24879 +*/
24880 +
24881 +#include "debug.h"
24882 +#include "dformat.h"
24883 +#include "key.h"
24884 +#include "txnmgr.h"
24885 +#include "jnode.h"
24886 +#include "znode.h"
24887 +#include "block_alloc.h"
24888 +#include "tree.h"
24889 +#include "vfs_ops.h"
24890 +#include "inode.h"
24891 +#include "super.h"
24892 +#include "entd.h"
24893 +#include "page_cache.h"
24894 +#include "ktxnmgrd.h"
24895 +
24896 +#include <linux/types.h>
24897 +#include <linux/fs.h>
24898 +#include <linux/mm.h> /* for struct page */
24899 +#include <linux/swap.h> /* for struct page */
24900 +#include <linux/pagemap.h>
24901 +#include <linux/bio.h>
24902 +#include <linux/writeback.h>
24903 +#include <linux/blkdev.h>
24904 +
24905 +static struct bio *page_bio(struct page *, jnode * , int rw, gfp_t gfp);
24906 +
24907 +static struct address_space_operations formatted_fake_as_ops;
24908 +
24909 +static const oid_t fake_ino = 0x1;
24910 +static const oid_t bitmap_ino = 0x2;
24911 +static const oid_t cc_ino = 0x3;
24912 +
24913 +static void
24914 +init_fake_inode(struct super_block *super, struct inode *fake,
24915 + struct inode **pfake)
24916 +{
24917 + assert("nikita-2168", fake->i_state & I_NEW);
24918 + fake->i_mapping->a_ops = &formatted_fake_as_ops;
24919 + *pfake = fake;
24920 + /* NOTE-NIKITA something else? */
24921 + unlock_new_inode(fake);
24922 +}
24923 +
24924 +/**
24925 + * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24926 + * @super: super block to init fake inode for
24927 + *
24928 + * Initializes fake inode to which formatted nodes are bound in the page cache
24929 + * and inode for bitmaps.
24930 + */
24931 +int reiser4_init_formatted_fake(struct super_block *super)
24932 +{
24933 + struct inode *fake;
24934 + struct inode *bitmap;
24935 + struct inode *cc;
24936 + reiser4_super_info_data *sinfo;
24937 +
24938 + assert("nikita-1703", super != NULL);
24939 +
24940 + sinfo = get_super_private_nocheck(super);
24941 + fake = iget_locked(super, oid_to_ino(fake_ino));
24942 +
24943 + if (fake != NULL) {
24944 + init_fake_inode(super, fake, &sinfo->fake);
24945 +
24946 + bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24947 + if (bitmap != NULL) {
24948 + init_fake_inode(super, bitmap, &sinfo->bitmap);
24949 +
24950 + cc = iget_locked(super, oid_to_ino(cc_ino));
24951 + if (cc != NULL) {
24952 + init_fake_inode(super, cc, &sinfo->cc);
24953 + return 0;
24954 + } else {
24955 + iput(sinfo->fake);
24956 + iput(sinfo->bitmap);
24957 + sinfo->fake = NULL;
24958 + sinfo->bitmap = NULL;
24959 + }
24960 + } else {
24961 + iput(sinfo->fake);
24962 + sinfo->fake = NULL;
24963 + }
24964 + }
24965 + return RETERR(-ENOMEM);
24966 +}
24967 +
24968 +/**
24969 + * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24970 + * @super: super block to init fake inode for
24971 + *
24972 + * Releases inodes which were used as address spaces of bitmap and formatted
24973 + * nodes.
24974 + */
24975 +void reiser4_done_formatted_fake(struct super_block *super)
24976 +{
24977 + reiser4_super_info_data *sinfo;
24978 +
24979 + sinfo = get_super_private_nocheck(super);
24980 +
24981 + if (sinfo->fake != NULL) {
24982 + iput(sinfo->fake);
24983 + sinfo->fake = NULL;
24984 + }
24985 +
24986 + if (sinfo->bitmap != NULL) {
24987 + iput(sinfo->bitmap);
24988 + sinfo->bitmap = NULL;
24989 + }
24990 +
24991 + if (sinfo->cc != NULL) {
24992 + iput(sinfo->cc);
24993 + sinfo->cc = NULL;
24994 + }
24995 + return;
24996 +}
24997 +
24998 +void reiser4_wait_page_writeback(struct page *page)
24999 +{
25000 + assert("zam-783", PageLocked(page));
25001 +
25002 + do {
25003 + unlock_page(page);
25004 + wait_on_page_writeback(page);
25005 + lock_page(page);
25006 + } while (PageWriteback(page));
25007 +}
25008 +
25009 +/* return tree @page is in */
25010 +reiser4_tree *reiser4_tree_by_page(const struct page *page/* page to query */)
25011 +{
25012 + assert("nikita-2461", page != NULL);
25013 + return &get_super_private(page->mapping->host->i_sb)->tree;
25014 +}
25015 +
25016 +/* completion handler for single page bio-based read.
25017 +
25018 + mpage_end_io_read() would also do. But it's static.
25019 +
25020 +*/
25021 +static void
25022 +end_bio_single_page_read(struct bio *bio, int err UNUSED_ARG)
25023 +{
25024 + struct page *page;
25025 +
25026 + page = bio->bi_io_vec[0].bv_page;
25027 +
25028 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
25029 + SetPageUptodate(page);
25030 + } else {
25031 + ClearPageUptodate(page);
25032 + SetPageError(page);
25033 + }
25034 + unlock_page(page);
25035 + bio_put(bio);
25036 +}
25037 +
25038 +/* completion handler for single page bio-based write.
25039 +
25040 + mpage_end_io_write() would also do. But it's static.
25041 +
25042 +*/
25043 +static void
25044 +end_bio_single_page_write(struct bio *bio, int err UNUSED_ARG)
25045 +{
25046 + struct page *page;
25047 +
25048 + page = bio->bi_io_vec[0].bv_page;
25049 +
25050 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
25051 + SetPageError(page);
25052 + end_page_writeback(page);
25053 + bio_put(bio);
25054 +}
25055 +
25056 +/* ->readpage() method for formatted nodes */
25057 +static int formatted_readpage(struct file *f UNUSED_ARG,
25058 + struct page *page/* page to read */)
25059 +{
25060 + assert("nikita-2412", PagePrivate(page) && jprivate(page));
25061 + return reiser4_page_io(page, jprivate(page), READ,
25062 + reiser4_ctx_gfp_mask_get());
25063 +}
25064 +
25065 +/**
25066 + * reiser4_page_io - submit single-page bio request
25067 + * @page: page to perform io for
25068 + * @node: jnode of page
25069 + * @rw: read or write
25070 + * @gfp: gfp mask for bio allocation
25071 + *
25072 + * Submits single page read or write.
25073 + */
25074 +int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
25075 +{
25076 + struct bio *bio;
25077 + int result;
25078 +
25079 + assert("nikita-2094", page != NULL);
25080 + assert("nikita-2226", PageLocked(page));
25081 + assert("nikita-2634", node != NULL);
25082 + assert("nikita-2893", rw == READ || rw == WRITE);
25083 +
25084 + if (rw) {
25085 + if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
25086 + unlock_page(page);
25087 + return 0;
25088 + }
25089 + }
25090 +
25091 + bio = page_bio(page, node, rw, gfp);
25092 + if (!IS_ERR(bio)) {
25093 + if (rw == WRITE) {
25094 + set_page_writeback(page);
25095 + unlock_page(page);
25096 + }
25097 + reiser4_submit_bio(rw, bio);
25098 + result = 0;
25099 + } else {
25100 + unlock_page(page);
25101 + result = PTR_ERR(bio);
25102 + }
25103 +
25104 + return result;
25105 +}
25106 +
25107 +/* helper function to construct bio for page */
25108 +static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
25109 +{
25110 + struct bio *bio;
25111 + assert("nikita-2092", page != NULL);
25112 + assert("nikita-2633", node != NULL);
25113 +
25114 +	/* Simple implementation under the assumption that blocksize == pagesize.
25115 +
25116 +	   We only have to submit one block, but submit_bh() will allocate a bio
25117 +	   anyway, so let's use all the bells-and-whistles of the bio code.
25118 +	*/
25119 +
25120 + bio = bio_alloc(gfp, 1);
25121 + if (bio != NULL) {
25122 + int blksz;
25123 + struct super_block *super;
25124 + reiser4_block_nr blocknr;
25125 +
25126 + super = page->mapping->host->i_sb;
25127 + assert("nikita-2029", super != NULL);
25128 + blksz = super->s_blocksize;
25129 + assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
25130 +
25131 + spin_lock_jnode(node);
25132 + blocknr = *jnode_get_io_block(node);
25133 + spin_unlock_jnode(node);
25134 +
25135 + assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
25136 + assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
25137 +
25138 + bio->bi_bdev = super->s_bdev;
25139 + /* fill bio->bi_sector before calling bio_add_page(), because
25140 + * q->merge_bvec_fn may want to inspect it (see
25141 +		 * drivers/md/linear.c:linear_mergeable_bvec() for example). */
25142 + bio->bi_sector = blocknr * (blksz >> 9);
25143 +
25144 + if (!bio_add_page(bio, page, blksz, 0)) {
25145 + warning("nikita-3452",
25146 + "Single page bio cannot be constructed");
25147 + return ERR_PTR(RETERR(-EINVAL));
25148 + }
25149 +
25150 + /* bio -> bi_idx is filled by bio_init() */
25151 + bio->bi_end_io = (rw == READ) ?
25152 + end_bio_single_page_read : end_bio_single_page_write;
25153 +
25154 + return bio;
25155 + } else
25156 + return ERR_PTR(RETERR(-ENOMEM));
25157 +}
25158 +
25159 +#if 0
25160 +static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
25161 +{
25162 + if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
25163 + return 1;
25164 + if (ctx->super != s)
25165 + return 1;
25166 + if (get_super_private(s)->entd.tsk == current)
25167 + return 0;
25168 + if (!lock_stack_isclean(&ctx->stack))
25169 + return 0;
25170 + if (ctx->trans->atom != NULL)
25171 + return 0;
25172 + return 1;
25173 +}
25174 +#endif
25175 +
25176 +/**
25177 + * reiser4_writepage - writepage of struct address_space_operations
25178 + * @page: page to write
25179 + * @wbc: writeback control
25180 + *
25181 + * Common memory pressure notification.
25182 + */
25184 +int reiser4_writepage(struct page *page,
25185 + struct writeback_control *wbc)
25186 +{
25187 + /*
25188 + * assert("edward-1562",
25189 + * can_hit_entd(get_current_context_check(), sb));
25190 + */
25191 + assert("vs-828", PageLocked(page));
25192 +
25193 + wbc->sb = page->mapping->host->i_sb;
25194 + wbc->bdi = page->mapping->backing_dev_info;
25195 +
25196 + return write_page_by_ent(page, wbc);
25197 +}
25198 +
25199 +/* ->set_page_dirty() method of formatted address_space */
25200 +static int formatted_set_page_dirty(struct page *page)
25201 +{
25202 + assert("nikita-2173", page != NULL);
25203 + BUG();
25204 + return __set_page_dirty_nobuffers(page);
25205 +}
25206 +
25207 +/* The writepages method of address space operations in reiser4 is used to
25208 +   pull into transactions pages which are dirtied via mmap. Only regular
25209 +   files can have such pages. The fake inode is used to access formatted
25210 +   nodes via the page cache. As formatted nodes can never be mmapped, the
25211 +   fake inode's writepages has nothing to do. */
25212 +static int
25213 +writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
25214 +{
25215 + return 0;
25216 +}
25217 +
25218 +/* address space operations for the fake inode */
25219 +static struct address_space_operations formatted_fake_as_ops = {
25220 + /* Perform a writeback of a single page as a memory-freeing
25221 + * operation. */
25222 + .writepage = reiser4_writepage,
25223 + /* this is called to read formatted node */
25224 + .readpage = formatted_readpage,
25225 + /* ->sync_page() method of fake inode address space operations. Called
25226 + from wait_on_page() and lock_page().
25227 +
25228 +	   This is a most annoyingly misnamed method. It is actually called
25229 +	   from wait_on_page_bit() and lock_page() and its purpose is to
25230 +	   start io by jabbing the device drivers.
25231 + */
25232 + .sync_page = block_sync_page,
25233 +	/* Write back some dirty pages from this mapping. Called during
25234 +	   sync (pdflush). */
25235 + .writepages = writepages_fake,
25236 + /* Set a page dirty */
25237 + .set_page_dirty = formatted_set_page_dirty,
25238 + /* used for read-ahead. Not applicable */
25239 + .readpages = NULL,
25240 + .write_begin = NULL,
25241 + .write_end = NULL,
25242 + .bmap = NULL,
25243 + /* called just before page is being detached from inode mapping and
25244 + removed from memory. Called on truncate, cut/squeeze, and
25245 + umount. */
25246 + .invalidatepage = reiser4_invalidatepage,
25247 +	/* this is called by shrink_cache() so that the file system can try to
25248 +	   release objects (jnodes, buffers, journal heads) attached to the page
25249 +	   and, possibly, make the page itself freeable.
25250 +	*/
25251 + .releasepage = reiser4_releasepage,
25252 + .direct_IO = NULL
25253 +};
25254 +
25255 +/* called just before page is released (no longer used by reiser4). Callers:
25256 + jdelete() and extent2tail(). */
25257 +void reiser4_drop_page(struct page *page)
25258 +{
25259 + assert("nikita-2181", PageLocked(page));
25260 + clear_page_dirty_for_io(page);
25261 + ClearPageUptodate(page);
25262 +#if defined(PG_skipped)
25263 + ClearPageSkipped(page);
25264 +#endif
25265 + unlock_page(page);
25266 +}
25267 +
25268 +#define JNODE_GANG_SIZE (16)
25269 +
25270 +/* find all jnodes from range specified and invalidate them */
25271 +static int
25272 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
25273 +{
25274 + reiser4_inode *info;
25275 + int truncated_jnodes;
25276 + reiser4_tree *tree;
25277 + unsigned long index;
25278 + unsigned long end;
25279 +
25280 + if (inode_file_plugin(inode) ==
25281 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
25282 + /*
25283 + * No need to get rid of jnodes here: if the single jnode of
25284 + * page cluster did not have page, then it was found and killed
25285 + * before in
25286 + * truncate_complete_page_cluster()->jput()->jput_final(),
25287 + * otherwise it will be dropped by reiser4_invalidatepage()
25288 + */
25289 + return 0;
25290 + truncated_jnodes = 0;
25291 +
25292 + info = reiser4_inode_data(inode);
25293 + tree = reiser4_tree_by_inode(inode);
25294 +
25295 + index = from;
25296 + end = from + count;
25297 +
25298 + while (1) {
25299 + jnode *gang[JNODE_GANG_SIZE];
25300 + int taken;
25301 + int i;
25302 + jnode *node;
25303 +
25304 + assert("nikita-3466", index <= end);
25305 +
25306 + read_lock_tree(tree);
25307 + taken =
25308 + radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
25309 + (void **)gang, index,
25310 + JNODE_GANG_SIZE);
25311 + for (i = 0; i < taken; ++i) {
25312 + node = gang[i];
25313 + if (index_jnode(node) < end)
25314 + jref(node);
25315 + else
25316 + gang[i] = NULL;
25317 + }
25318 + read_unlock_tree(tree);
25319 +
25320 + for (i = 0; i < taken; ++i) {
25321 + node = gang[i];
25322 + if (node != NULL) {
25323 + index = max(index, index_jnode(node));
25324 + spin_lock_jnode(node);
25325 + assert("edward-1457", node->pg == NULL);
25326 +				/* this is always called after
25327 +				   truncate_inode_pages_range(). Therefore the
25328 +				   jnode cannot have a page here. New pages
25329 +				   cannot be created, because
25330 +				   truncate_jnodes_range runs under exclusive
25331 +				   access to the file, whereas new page
25332 +				   creation requires non-exclusive access */
25333 + JF_SET(node, JNODE_HEARD_BANSHEE);
25334 + reiser4_uncapture_jnode(node);
25335 + unhash_unformatted_jnode(node);
25336 + truncated_jnodes++;
25337 + jput(node);
25338 + } else
25339 + break;
25340 + }
25341 + if (i != taken || taken == 0)
25342 + break;
25343 + }
25344 + return truncated_jnodes;
25345 +}
25346 +
25347 +/* Truncating files in reiser4: problems and solutions.
25348 +
25349 + VFS calls fs's truncate after it has called truncate_inode_pages()
25350 + to get rid of pages corresponding to part of file being truncated.
25351 +   In reiser4 this may cause unallocated extents which do not have
25352 +   jnodes to exist. Flush code does not expect that. The solution to this
25353 +   problem is straightforward. As vfs's truncate is implemented using the
25354 +   setattr operation, it seems reasonable to have a ->setattr() that
25355 +   will cut the file body. However, flush code also does not expect dirty
25356 +   pages without parent items, so it is impossible to cut all items and
25357 +   then truncate all pages in two steps. We resolve this problem by
25358 +   cutting items one-by-one. Each such fine-grained step performed
25359 +   under a longterm znode lock calls at the end the ->kill_hook() method
25360 +   of the killed item to remove its bound pages and jnodes.
25361 +
25362 +   The following function is a common part of the mentioned kill hooks.
25363 +   It is also called before tail-to-extent conversion (to avoid managing
25364 +   several copies of the data).
25365 +*/
25366 +void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25367 + unsigned long count, int even_cows)
25368 +{
25369 + loff_t from_bytes, count_bytes;
25370 +
25371 + if (count == 0)
25372 + return;
25373 + from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25374 + count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25375 +
25376 + unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25377 + truncate_inode_pages_range(mapping, from_bytes,
25378 + from_bytes + count_bytes - 1);
25379 + truncate_jnodes_range(mapping->host, from, count);
25380 +}
25381 +
25382 +/*
25383 + * Local variables:
25384 + * c-indentation-style: "K&R"
25385 + * mode-name: "LC"
25386 + * c-basic-offset: 8
25387 + * tab-width: 8
25388 + * fill-column: 120
25389 + * scroll-step: 1
25390 + * End:
25391 + */
25392 diff -urN linux-2.6.33.orig/fs/reiser4/page_cache.h linux-2.6.33/fs/reiser4/page_cache.h
25393 --- linux-2.6.33.orig/fs/reiser4/page_cache.h 1970-01-01 01:00:00.000000000 +0100
25394 +++ linux-2.6.33/fs/reiser4/page_cache.h 2010-03-04 19:33:22.000000000 +0100
25395 @@ -0,0 +1,66 @@
25396 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25397 + * reiser4/README */
25398 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25399 +
25400 +#if !defined(__REISER4_PAGE_CACHE_H__)
25401 +#define __REISER4_PAGE_CACHE_H__
25402 +
25403 +#include "forward.h"
25404 +#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
25405 +
25406 +#include <linux/fs.h> /* for struct super_block, address_space */
25407 +#include <linux/mm.h> /* for struct page */
25408 +#include <linux/pagemap.h> /* for lock_page() */
25409 +#include <linux/vmalloc.h> /* for __vmalloc() */
25410 +
25411 +extern int reiser4_init_formatted_fake(struct super_block *);
25412 +extern void reiser4_done_formatted_fake(struct super_block *);
25413 +
25414 +extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25415 +
25416 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25417 +
25418 +extern void reiser4_wait_page_writeback(struct page *);
25419 +static inline void lock_and_wait_page_writeback(struct page *page)
25420 +{
25421 + lock_page(page);
25422 + if (unlikely(PageWriteback(page)))
25423 + reiser4_wait_page_writeback(page);
25424 +}
25425 +
25426 +#define jprivate(page) ((jnode *)page_private(page))
25427 +
25428 +extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25429 +extern void reiser4_drop_page(struct page *);
25430 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25431 + unsigned long count, int even_cows);
25432 +extern void capture_reiser4_inodes(struct super_block *,
25433 + struct writeback_control *);
25434 +static inline void *reiser4_vmalloc(unsigned long size)
25435 +{
25436 + return __vmalloc(size,
25437 + reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25438 + PAGE_KERNEL);
25439 +}
25440 +
25441 +#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25442 +
25443 +#if REISER4_DEBUG
25444 +extern void print_page(const char *prefix, struct page *page);
25445 +#else
25446 +#define print_page(prf, p) noop
25447 +#endif
25448 +
25449 +/* __REISER4_PAGE_CACHE_H__ */
25450 +#endif
25451 +
25452 +/* Make Linus happy.
25453 + Local variables:
25454 + c-indentation-style: "K&R"
25455 + mode-name: "LC"
25456 + c-basic-offset: 8
25457 + tab-width: 8
25458 + fill-column: 120
25459 + scroll-step: 1
25460 + End:
25461 +*/
25462 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/cluster.c linux-2.6.33/fs/reiser4/plugin/cluster.c
25463 --- linux-2.6.33.orig/fs/reiser4/plugin/cluster.c 1970-01-01 01:00:00.000000000 +0100
25464 +++ linux-2.6.33/fs/reiser4/plugin/cluster.c 2010-03-04 19:33:22.000000000 +0100
25465 @@ -0,0 +1,72 @@
25466 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25467 + * reiser4/README */
25468 +
25469 +/* Contains reiser4 cluster plugins (see
25470 + http://www.namesys.com/cryptcompress_design.html
25471 + "Concepts of clustering" for details). */
25472 +
25473 +#include "plugin_header.h"
25474 +#include "plugin.h"
25475 +#include "../inode.h"
25476 +
25477 +static int change_cluster(struct inode *inode,
25478 + reiser4_plugin * plugin,
25479 + pset_member memb)
25480 +{
25481 + assert("edward-1324", inode != NULL);
25482 + assert("edward-1325", plugin != NULL);
25483 + assert("edward-1326", is_reiser4_inode(inode));
25484 + assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25485 +
25486 +	/* Can't change the cluster plugin for already existing regular files */
25487 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25488 + return RETERR(-EINVAL);
25489 +
25490 + /* If matches, nothing to change. */
25491 + if (inode_hash_plugin(inode) != NULL &&
25492 + inode_hash_plugin(inode)->h.id == plugin->h.id)
25493 + return 0;
25494 +
25495 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25496 + PSET_CLUSTER, plugin);
25497 +}
25498 +
25499 +static reiser4_plugin_ops cluster_plugin_ops = {
25500 + .init = NULL,
25501 + .load = NULL,
25502 + .save_len = NULL,
25503 + .save = NULL,
25504 + .change = &change_cluster
25505 +};
25506 +
25507 +#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25508 + [CLUSTER_ ## ID ## _ID] = { \
25509 + .h = { \
25510 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25511 + .id = CLUSTER_ ## ID ## _ID, \
25512 + .pops = &cluster_plugin_ops, \
25513 + .label = LABEL, \
25514 + .desc = DESC, \
25515 + .linkage = {NULL, NULL} \
25516 + }, \
25517 + .shift = SHIFT \
25518 + }
25519 +
25520 +cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25521 + SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25522 + SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25523 + SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25524 + SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25525 + SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25526 +};
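
For reference, the first entry of this table expands to approximately:

	[CLUSTER_64K_ID] = {
		.h = {
			.type_id = REISER4_CLUSTER_PLUGIN_TYPE,
			.id = CLUSTER_64K_ID,
			.pops = &cluster_plugin_ops,
			.label = "64K",
			.desc = "Large",
			.linkage = {NULL, NULL}
		},
		.shift = 16	/* cluster size = 1 << 16 = 64KiB */
	},
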
25527 +
25528 +/*
25529 + Local variables:
25530 + c-indentation-style: "K&R"
25531 + mode-name: "LC"
25532 + c-basic-offset: 8
25533 + tab-width: 8
25534 + fill-column: 120
25535 + scroll-step: 1
25536 + End:
25537 +*/
25538 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/cluster.h linux-2.6.33/fs/reiser4/plugin/cluster.h
25539 --- linux-2.6.33.orig/fs/reiser4/plugin/cluster.h 1970-01-01 01:00:00.000000000 +0100
25540 +++ linux-2.6.33/fs/reiser4/plugin/cluster.h 2010-03-04 19:33:22.000000000 +0100
25541 @@ -0,0 +1,410 @@
25542 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25543 +
25544 +/* This file contains size/offset translators, modulators
25545 + and other helper functions. */
25546 +
25547 +#if !defined(__FS_REISER4_CLUSTER_H__)
25548 +#define __FS_REISER4_CLUSTER_H__
25549 +
25550 +#include "../inode.h"
25551 +
25552 +static inline int inode_cluster_shift(struct inode *inode)
25553 +{
25554 + assert("edward-92", inode != NULL);
25555 + assert("edward-93", reiser4_inode_data(inode) != NULL);
25556 +
25557 + return inode_cluster_plugin(inode)->shift;
25558 +}
25559 +
25560 +static inline unsigned cluster_nrpages_shift(struct inode *inode)
25561 +{
25562 + return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25563 +}
25564 +
25565 +/* cluster size in page units */
25566 +static inline unsigned cluster_nrpages(struct inode *inode)
25567 +{
25568 + return 1U << cluster_nrpages_shift(inode);
25569 +}
25570 +
25571 +static inline size_t inode_cluster_size(struct inode *inode)
25572 +{
25573 + assert("edward-96", inode != NULL);
25574 +
25575 + return 1U << inode_cluster_shift(inode);
25576 +}
25577 +
25578 +static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25579 +{
25580 + return idx >> cluster_nrpages_shift(inode);
25581 +}
25582 +
25583 +static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25584 +{
25585 + return idx << cluster_nrpages_shift(inode);
25586 +}
25587 +
25588 +static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25589 +{
25590 + return clust_to_pg(pg_to_clust(idx, inode), inode);
25591 +}
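
A worked example of these translators, assuming 4KiB pages (PAGE_CACHE_SHIFT == 12) and the 64K cluster plugin (shift 16, i.e. 16 pages per cluster):

	/* cluster_nrpages_shift(inode) == 16 - 12 == 4
	 * cluster_nrpages(inode)       == 1 << 4  == 16 pages per cluster
	 * pg_to_clust(37, inode)       == 37 >> 4 == 2  (page 37 is in cluster 2)
	 * clust_to_pg(2, inode)        == 2 << 4  == 32 (cluster 2 starts at page 32)
	 * pg_to_clust_to_pg(37, inode) == 32 (page index rounded down to its cluster) */
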
25592 +
25593 +static inline pgoff_t off_to_pg(loff_t off)
25594 +{
25595 + return (off >> PAGE_CACHE_SHIFT);
25596 +}
25597 +
25598 +static inline loff_t pg_to_off(pgoff_t idx)
25599 +{
25600 + return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25601 +}
25602 +
25603 +static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25604 +{
25605 + return off >> inode_cluster_shift(inode);
25606 +}
25607 +
25608 +static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25609 +{
25610 + return (loff_t) idx << inode_cluster_shift(inode);
25611 +}
25612 +
25613 +static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25614 +{
25615 + return clust_to_off(off_to_clust(off, inode), inode);
25616 +}
25617 +
25618 +static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25619 +{
25620 + return clust_to_pg(off_to_clust(off, inode), inode);
25621 +}
25622 +
25623 +static inline unsigned off_to_pgoff(loff_t off)
25624 +{
25625 + return off & (PAGE_CACHE_SIZE - 1);
25626 +}
25627 +
25628 +static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25629 +{
25630 + return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25631 +}
25632 +
25633 +static inline pgoff_t offset_in_clust(struct page *page)
25634 +{
25635 + assert("edward-1488", page != NULL);
25636 + assert("edward-1489", page->mapping != NULL);
25637 +
25638 + return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1);
25639 +}
25640 +
25641 +static inline int first_page_in_cluster(struct page *page)
25642 +{
25643 + return offset_in_clust(page) == 0;
25644 +}
25645 +
25646 +static inline int last_page_in_cluster(struct page *page)
25647 +{
25648 + return offset_in_clust(page) ==
25649 + cluster_nrpages(page->mapping->host) - 1;
25650 +}
25651 +
25652 +static inline unsigned
25653 +pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25654 +{
25655 + return off_to_cloff(pg_to_off(idx), inode);
25656 +}
25657 +
25658 +/*********************** Size translators **************************/
25659 +
25660 +/* Translate a linear size.
25661 + * New units are (1 << @blkbits) times larger than the old ones.
25662 + * In other words, calculate the number of logical blocks occupied
25663 + * by @count elements.
25664 + */
25665 +static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
25666 +{
25667 + return (count + (1UL << blkbits) - 1) >> blkbits;
25668 +}
25669 +
25670 +/* size in pages */
25671 +static inline pgoff_t size_in_pages(loff_t size)
25672 +{
25673 + return size_in_blocks(size, PAGE_CACHE_SHIFT);
25674 +}
25675 +
25676 +/* size in logical clusters */
25677 +static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
25678 +{
25679 + return size_in_blocks(size, inode_cluster_shift(inode));
25680 +}
25681 +
25682 +/* size in pages to the size in page clusters */
25683 +static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
25684 +{
25685 + return size_in_blocks(size, cluster_nrpages_shift(inode));
25686 +}
25687 +
25688 +/*********************** Size modulators ***************************/
25689 +
25690 +/*
25691 +   Modulate a linear size by the nominated block size and offset.
25692 +
25693 +   This is the "finite" function (zero almost everywhere): what is the
25694 +   height of the figure at position @pos when laying out @size units in
25695 +   rows of height (1 << @blkbits)?
25697 +
25698 + ******
25699 + *******
25700 + *******
25701 + *******
25702 + ----------> pos
25703 +*/
25704 +static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
25705 +{
25706 + unsigned end = size >> blkbits;
25707 + if (pos < end)
25708 + return 1U << blkbits;
25709 + if (unlikely(pos > end))
25710 + return 0;
25711 + return size & ~(~0ull << blkbits);
25712 +}
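
A worked example of __mbb() with @size == 10000 and @blkbits == 12 (4KiB blocks), so end == 10000 >> 12 == 2:

	/* __mbb(10000, 0, 12) == 4096  (full block)
	 * __mbb(10000, 1, 12) == 4096  (full block)
	 * __mbb(10000, 2, 12) == 10000 & 4095 == 1808 (partial last block)
	 * __mbb(10000, 3, 12) == 0     (past the end of the figure)
	 * The heights sum to 4096 + 4096 + 1808 == 10000 == @size. */
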
25713 +
25714 +/* the same as above, but block size is page size */
25715 +static inline unsigned __mbp(loff_t size, pgoff_t pos)
25716 +{
25717 + return __mbb(size, pos, PAGE_CACHE_SHIFT);
25718 +}
25719 +
25720 +/* number of file's bytes in the nominated logical cluster */
25721 +static inline unsigned lbytes(cloff_t index, struct inode *inode)
25722 +{
25723 + return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
25724 +}
25725 +
25726 +/* number of file's bytes in the nominated page */
25727 +static inline unsigned pbytes(pgoff_t index, struct inode *inode)
25728 +{
25729 + return __mbp(i_size_read(inode), index);
25730 +}
25731 +
25732 +/**
25733 + * number of pages occupied by @win->count bytes starting from
25734 + * @win->off in the logical cluster defined by @win. This is exactly
25735 + * the number of pages to be modified and dirtied in any cluster operation.
25736 + */
25737 +static inline pgoff_t win_count_to_nrpages(struct reiser4_slide * win)
25738 +{
25739 + return ((win->off + win->count +
25740 + (1UL << PAGE_CACHE_SHIFT) - 1) >> PAGE_CACHE_SHIFT) -
25741 + off_to_pg(win->off);
25742 +}
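
For example, with 4KiB pages, @win->off == 100 and @win->count == 5000:

	/* last byte is at offset 100 + 5000 - 1 == 5099, i.e. in page 1;
	 * (100 + 5000 + 4095) >> 12 == 2, off_to_pg(100) == 0,
	 * so win_count_to_nrpages(win) == 2 - 0 == 2 pages are touched. */
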
25743 +
25744 +/* return true, if logical cluster is not occupied by the file */
25745 +static inline int new_logical_cluster(struct cluster_handle *clust,
25746 + struct inode *inode)
25747 +{
25748 + return clust_to_off(clust->index, inode) >= i_size_read(inode);
25749 +}
25750 +
25751 +/* return true, if pages @p1 and @p2 are of the same page cluster */
25752 +static inline int same_page_cluster(struct page *p1, struct page *p2)
25753 +{
25754 + assert("edward-1490", p1 != NULL);
25755 + assert("edward-1491", p2 != NULL);
25756 + assert("edward-1492", p1->mapping != NULL);
25757 + assert("edward-1493", p2->mapping != NULL);
25758 +
25759 + return (pg_to_clust(page_index(p1), p1->mapping->host) ==
25760 + pg_to_clust(page_index(p2), p2->mapping->host));
25761 +}
25762 +
25763 +static inline int cluster_is_complete(struct cluster_handle *clust,
25764 + struct inode *inode)
25765 +{
25766 + return clust->tc.lsize == inode_cluster_size(inode);
25767 +}
25768 +
25769 +static inline void reiser4_slide_init(struct reiser4_slide *win)
25770 +{
25771 + assert("edward-1084", win != NULL);
25772 + memset(win, 0, sizeof *win);
25773 +}
25774 +
25775 +static inline tfm_action
25776 +cluster_get_tfm_act(struct tfm_cluster *tc)
25777 +{
25778 + assert("edward-1356", tc != NULL);
25779 + return tc->act;
25780 +}
25781 +
25782 +static inline void
25783 +cluster_set_tfm_act(struct tfm_cluster *tc, tfm_action act)
25784 +{
25785 + assert("edward-1356", tc != NULL);
25786 + tc->act = act;
25787 +}
25788 +
25789 +static inline void cluster_init_act(struct cluster_handle *clust,
25790 + tfm_action act,
25791 + struct reiser4_slide *window)
25792 +{
25793 + assert("edward-84", clust != NULL);
25794 + memset(clust, 0, sizeof *clust);
25795 + cluster_set_tfm_act(&clust->tc, act);
25796 + clust->dstat = INVAL_DISK_CLUSTER;
25797 + clust->win = window;
25798 +}
25799 +
25800 +static inline void cluster_init_read(struct cluster_handle *clust,
25801 + struct reiser4_slide *window)
25802 +{
25803 + cluster_init_act(clust, TFMA_READ, window);
25804 +}
25805 +
25806 +static inline void cluster_init_write(struct cluster_handle *clust,
25807 + struct reiser4_slide *window)
25808 +{
25809 + cluster_init_act(clust, TFMA_WRITE, window);
25810 +}
25811 +
25812 +/* true if @p1 and @p2 are items of the same disk cluster */
25813 +static inline int same_disk_cluster(const coord_t *p1, const coord_t *p2)
25814 +{
25815 + /* drop this if you have other items to aggregate */
25816 + assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID);
25817 +
25818 + return item_plugin_by_coord(p1)->b.mergeable(p1, p2);
25819 +}
25820 +
25821 +static inline int dclust_get_extension_dsize(hint_t *hint)
25822 +{
25823 + return hint->ext_coord.extension.ctail.dsize;
25824 +}
25825 +
25826 +static inline void dclust_set_extension_dsize(hint_t *hint, int dsize)
25827 +{
25828 + hint->ext_coord.extension.ctail.dsize = dsize;
25829 +}
25830 +
25831 +static inline int dclust_get_extension_shift(hint_t *hint)
25832 +{
25833 + return hint->ext_coord.extension.ctail.shift;
25834 +}
25835 +
25836 +static inline int dclust_get_extension_ncount(hint_t *hint)
25837 +{
25838 + return hint->ext_coord.extension.ctail.ncount;
25839 +}
25840 +
25841 +static inline void dclust_inc_extension_ncount(hint_t *hint)
25842 +{
25843 + hint->ext_coord.extension.ctail.ncount++;
25844 +}
25845 +
25846 +static inline void dclust_init_extension(hint_t *hint)
25847 +{
25848 + memset(&hint->ext_coord.extension.ctail, 0,
25849 + sizeof(hint->ext_coord.extension.ctail));
25850 +}
25851 +
25852 +static inline int hint_is_unprepped_dclust(hint_t *hint)
25853 +{
25854 + assert("edward-1451", hint_is_valid(hint));
25855 + return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25856 +}
25857 +
25858 +static inline void coord_set_between_clusters(coord_t *coord)
25859 +{
25860 +#if REISER4_DEBUG
25861 + int result;
25862 + result = zload(coord->node);
25863 + assert("edward-1296", !result);
25864 +#endif
25865 + if (!coord_is_between_items(coord)) {
25866 + coord->between = AFTER_ITEM;
25867 + coord->unit_pos = 0;
25868 + }
25869 +#if REISER4_DEBUG
25870 + zrelse(coord->node);
25871 +#endif
25872 +}
25873 +
25874 +int reiser4_inflate_cluster(struct cluster_handle *, struct inode *);
25875 +int find_disk_cluster(struct cluster_handle *, struct inode *, int read,
25876 + znode_lock_mode mode);
25877 +int checkout_logical_cluster(struct cluster_handle *, jnode * , struct inode *);
25878 +int reiser4_deflate_cluster(struct cluster_handle *, struct inode *);
25879 +void truncate_complete_page_cluster(struct inode *inode, cloff_t start,
25880 + int even_cows);
25881 +void invalidate_hint_cluster(struct cluster_handle *clust);
25882 +int get_disk_cluster_locked(struct cluster_handle *clust, struct inode *inode,
25883 + znode_lock_mode lock_mode);
25884 +void reset_cluster_params(struct cluster_handle *clust);
25885 +int set_cluster_by_page(struct cluster_handle *clust, struct page *page,
25886 + int count);
25887 +int prepare_page_cluster(struct inode *inode, struct cluster_handle *clust,
25888 + rw_op rw);
25889 +void __put_page_cluster(int from, int count, struct page **pages,
25890 + struct inode *inode);
25891 +void put_page_cluster(struct cluster_handle *clust,
25892 + struct inode *inode, rw_op rw);
25893 +void put_cluster_handle(struct cluster_handle *clust);
25894 +int grab_tfm_stream(struct inode *inode, struct tfm_cluster *tc,
25895 + tfm_stream_id id);
25896 +int tfm_cluster_is_uptodate(struct tfm_cluster *tc);
25897 +void tfm_cluster_set_uptodate(struct tfm_cluster *tc);
25898 +void tfm_cluster_clr_uptodate(struct tfm_cluster *tc);
25899 +
25900 +/* move cluster handle to the target position
25901 + specified by the page of index @pgidx */
25902 +static inline void move_cluster_forward(struct cluster_handle *clust,
25903 + struct inode *inode,
25904 + pgoff_t pgidx)
25905 +{
25906 + assert("edward-1297", clust != NULL);
25907 + assert("edward-1298", inode != NULL);
25908 +
25909 + reset_cluster_params(clust);
25910 + if (clust->index_valid &&
25911 + /* Hole in the indices. The hint became invalid and cannot be
25912 + used by find_cluster_item() even if the seal/node versions
25913 + coincide */
25914 + pg_to_clust(pgidx, inode) != clust->index + 1) {
25915 + reiser4_unset_hint(clust->hint);
25916 + invalidate_hint_cluster(clust);
25917 + }
25918 + clust->index = pg_to_clust(pgidx, inode);
25919 + clust->index_valid = 1;
25920 +}
25921 +
25922 +static inline int alloc_clust_pages(struct cluster_handle *clust,
25923 + struct inode *inode)
25924 +{
25925 + assert("edward-791", clust != NULL);
25926 + assert("edward-792", inode != NULL);
25927 + clust->pages =
25928 + kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25929 + reiser4_ctx_gfp_mask_get());
25930 + if (!clust->pages)
25931 + return -ENOMEM;
25932 + return 0;
25933 +}
25934 +
25935 +static inline void free_clust_pages(struct cluster_handle *clust)
25936 +{
25937 + kfree(clust->pages);
25938 +}
25939 +
25940 +#endif /* __FS_REISER4_CLUSTER_H__ */
25941 +
25942 +/* Make Linus happy.
25943 + Local variables:
25944 + c-indentation-style: "K&R"
25945 + mode-name: "LC"
25946 + c-basic-offset: 8
25947 + tab-width: 8
25948 + fill-column: 120
25949 + scroll-step: 1
25950 + End:
25951 +*/
25952 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.33/fs/reiser4/plugin/compress/compress.c
25953 --- linux-2.6.33.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 01:00:00.000000000 +0100
25954 +++ linux-2.6.33/fs/reiser4/plugin/compress/compress.c 2010-03-04 19:33:22.000000000 +0100
25955 @@ -0,0 +1,355 @@
25956 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25957 +/* reiser4 compression transform plugins */
25958 +
25959 +#include "../../debug.h"
25960 +#include "../../inode.h"
25961 +#include "../plugin.h"
25962 +
25963 +#include <linux/lzo.h>
25964 +#include <linux/zlib.h>
25965 +#include <linux/types.h>
25966 +#include <linux/hardirq.h>
25967 +
25968 +static int change_compression(struct inode *inode,
25969 + reiser4_plugin * plugin,
25970 + pset_member memb)
25971 +{
25972 + assert("edward-1316", inode != NULL);
25973 + assert("edward-1317", plugin != NULL);
25974 + assert("edward-1318", is_reiser4_inode(inode));
25975 + assert("edward-1319",
25976 + plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25977 +
25978 + /* cannot change compression plugin of already existing regular object */
25979 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25980 + return RETERR(-EINVAL);
25981 +
25982 + /* If matches, nothing to change. */
25983 + if (inode_compression_plugin(inode) != NULL &&
25984 + inode_compression_plugin(inode)->h.id == plugin->h.id)
25985 + return 0;
25986 +
25987 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25988 + PSET_COMPRESSION, plugin);
25989 +}
25990 +
25991 +static reiser4_plugin_ops compression_plugin_ops = {
25992 + .init = NULL,
25993 + .load = NULL,
25994 + .save_len = NULL,
25995 + .save = NULL,
25996 + .change = &change_compression
25997 +};
25998 +
25999 +/******************************************************************************/
26000 +/* gzip1 compression */
26001 +/******************************************************************************/
26002 +
26003 +#define GZIP1_DEF_LEVEL Z_BEST_SPEED
26004 +#define GZIP1_DEF_WINBITS 15
26005 +#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
26006 +
26007 +static int gzip1_init(void)
26008 +{
26009 + return 0;
26010 +}
26011 +
26012 +static int gzip1_overrun(unsigned src_len UNUSED_ARG)
26013 +{
26014 + return 0;
26015 +}
26016 +
26017 +static coa_t gzip1_alloc(tfm_action act)
26018 +{
26019 + coa_t coa = NULL;
26020 + int ret = 0;
26021 + switch (act) {
26022 + case TFMA_WRITE: /* compress */
26023 + coa = reiser4_vmalloc(zlib_deflate_workspacesize());
26024 + if (!coa) {
26025 + ret = -ENOMEM;
26026 + break;
26027 + }
26028 + break;
26029 + case TFMA_READ: /* decompress */
26030 + coa = reiser4_vmalloc(zlib_inflate_workspacesize());
26031 + if (!coa) {
26032 + ret = -ENOMEM;
26033 + break;
26034 + }
26035 + break;
26036 + default:
26037 + impossible("edward-767",
26038 + "trying to alloc workspace for unknown tfm action");
26039 + }
26040 + if (ret) {
26041 + warning("edward-768",
26042 + "alloc workspace for gzip1 (tfm action = %d) failed\n",
26043 + act);
26044 + return ERR_PTR(ret);
26045 + }
26046 + return coa;
26047 +}
26048 +
26049 +static void gzip1_free(coa_t coa, tfm_action act)
26050 +{
26051 + assert("edward-769", coa != NULL);
26052 +
26053 + switch (act) {
26054 + case TFMA_WRITE: /* compress */
26055 + vfree(coa);
26056 + break;
26057 + case TFMA_READ: /* decompress */
26058 + vfree(coa);
26059 + break;
26060 + default:
26061 + impossible("edward-770", "unknown tfm action");
26062 + }
26063 + return;
26064 +}
26065 +
26066 +static int gzip1_min_size_deflate(void)
26067 +{
26068 + return 64;
26069 +}
26070 +
26071 +static void
26072 +gzip1_compress(coa_t coa, __u8 * src_first, size_t src_len,
26073 + __u8 * dst_first, size_t *dst_len)
26074 +{
26075 + int ret = 0;
26076 + struct z_stream_s stream;
26077 +
26078 + assert("edward-842", coa != NULL);
26079 + assert("edward-875", src_len != 0);
26080 +
26081 + stream.workspace = coa;
26082 + ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
26083 + -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
26084 + Z_DEFAULT_STRATEGY);
26085 + if (ret != Z_OK) {
26086 + warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
26087 + goto rollback;
26088 + }
26089 + ret = zlib_deflateReset(&stream);
26090 + if (ret != Z_OK) {
26091 + warning("edward-772", "zlib_deflateReset returned %d\n", ret);
26092 + goto rollback;
26093 + }
26094 + stream.next_in = src_first;
26095 + stream.avail_in = src_len;
26096 + stream.next_out = dst_first;
26097 + stream.avail_out = *dst_len;
26098 +
26099 + ret = zlib_deflate(&stream, Z_FINISH);
26100 + if (ret != Z_STREAM_END) {
26101 + if (ret != Z_OK)
26102 + warning("edward-773",
26103 + "zlib_deflate returned %d\n", ret);
26104 + goto rollback;
26105 + }
26106 + *dst_len = stream.total_out;
26107 + return;
26108 + rollback:
26109 + *dst_len = src_len;
26110 + return;
26111 +}
26112 +
26113 +static void
26114 +gzip1_decompress(coa_t coa, __u8 * src_first, size_t src_len,
26115 + __u8 * dst_first, size_t *dst_len)
26116 +{
26117 + int ret = 0;
26118 + struct z_stream_s stream;
26119 +
26120 + assert("edward-843", coa != NULL);
26121 + assert("edward-876", src_len != 0);
26122 +
26123 + stream.workspace = coa;
26124 + ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
26125 + if (ret != Z_OK) {
26126 + warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
26127 + return;
26128 + }
26129 + ret = zlib_inflateReset(&stream);
26130 + if (ret != Z_OK) {
26131 + warning("edward-775", "zlib_inflateReset returned %d\n", ret);
26132 + return;
26133 + }
26134 +
26135 + stream.next_in = src_first;
26136 + stream.avail_in = src_len;
26137 + stream.next_out = dst_first;
26138 + stream.avail_out = *dst_len;
26139 +
26140 + ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
26141 + /*
26142 + * Work around a bug in zlib, which sometimes wants to taste an extra
26143 + * byte when being used in the (undocumented) raw deflate mode.
26144 + * (From USAGI).
26145 + */
26146 + if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
26147 + u8 zerostuff = 0;
26148 + stream.next_in = &zerostuff;
26149 + stream.avail_in = 1;
26150 + ret = zlib_inflate(&stream, Z_FINISH);
26151 + }
26152 + if (ret != Z_STREAM_END) {
26153 + warning("edward-776", "zlib_inflate returned %d\n", ret);
26154 + return;
26155 + }
26156 + *dst_len = stream.total_out;
26157 + return;
26158 +}
26159 +
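/*
 * Editorial sketch (not part of the patch): the kernel calls above use
 * zlib's raw-deflate mode -- the negative windowBits value strips the
 * zlib header/trailer.  A minimal userspace equivalent of the write
 * path, assuming standard <zlib.h> (memLevel 8 stands in for the
 * kernel's MAX_MEM_LEVEL; error handling kept to the bare minimum):
 */
#include <string.h>
#include <zlib.h>

static int raw_deflate(unsigned char *src, size_t src_len,
		       unsigned char *dst, size_t *dst_len)
{
	z_stream s;
	int ret;

	memset(&s, 0, sizeof(s));
	/* same knobs as GZIP1_DEF_LEVEL/WINBITS/MEMLEVEL above */
	ret = deflateInit2(&s, Z_BEST_SPEED, Z_DEFLATED, -15, 8,
			   Z_DEFAULT_STRATEGY);
	if (ret != Z_OK)
		return ret;
	s.next_in = src;
	s.avail_in = src_len;
	s.next_out = dst;
	s.avail_out = *dst_len;
	ret = deflate(&s, Z_FINISH);	/* one-shot, like the kernel code */
	if (ret == Z_STREAM_END)
		*dst_len = s.total_out;	/* compressed size */
	deflateEnd(&s);
	return ret == Z_STREAM_END ? 0 : -1;
}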
26160 +/******************************************************************************/
26161 +/* lzo1 compression */
26162 +/******************************************************************************/
26163 +
26164 +static int lzo1_init(void)
26165 +{
26166 + return 0;
26167 +}
26168 +
26169 +static int lzo1_overrun(unsigned in_len)
26170 +{
26171 + return in_len / 64 + 16 + 3;
26172 +}
26173 +
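/*
 * Editorial note (not part of the patch): in_len/64 + 16 + 3 is the
 * usual worst-case expansion bound quoted for LZO1X.  E.g. for a
 * 4096-byte input the overrun is 4096/64 + 19 = 83 bytes, so the
 * output buffer needs up to 4096 + 83 = 4179 bytes.
 */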
26174 +static coa_t lzo1_alloc(tfm_action act)
26175 +{
26176 + int ret = 0;
26177 + coa_t coa = NULL;
26178 +
26179 + switch (act) {
26180 + case TFMA_WRITE: /* compress */
26181 + coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS);
26182 + if (!coa) {
26183 + ret = -ENOMEM;
26184 + break;
26185 + }
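 /* fall through: LZO1X decompression needs no workspace
    (see lzo1_decompress() below, which asserts coa == NULL) */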
26186 + case TFMA_READ: /* decompress */
26187 + break;
26188 + default:
26189 + impossible("edward-877",
26190 + "trying to alloc workspace for unknown tfm action");
26191 + }
26192 + if (ret) {
26193 + warning("edward-878",
26194 + "alloc workspace for lzo1 (tfm action = %d) failed\n",
26195 + act);
26196 + return ERR_PTR(ret);
26197 + }
26198 + return coa;
26199 +}
26200 +
26201 +static void lzo1_free(coa_t coa, tfm_action act)
26202 +{
26203 + assert("edward-879", coa != NULL);
26204 +
26205 + switch (act) {
26206 + case TFMA_WRITE: /* compress */
26207 + vfree(coa);
26208 + break;
26209 + case TFMA_READ: /* decompress */
26210 + impossible("edward-1304",
26211 + "trying to free non-allocated workspace");
26212 + default:
26213 + impossible("edward-880", "unknown tfm action");
26214 + }
26215 + return;
26216 +}
26217 +
26218 +static int lzo1_min_size_deflate(void)
26219 +{
26220 + return 256;
26221 +}
26222 +
26223 +static void
26224 +lzo1_compress(coa_t coa, __u8 * src_first, size_t src_len,
26225 + __u8 * dst_first, size_t *dst_len)
26226 +{
26227 + int result;
26228 +
26229 + assert("edward-846", coa != NULL);
26230 + assert("edward-847", src_len != 0);
26231 +
26232 + result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
26233 + if (unlikely(result != LZO_E_OK)) {
26234 + warning("edward-849", "lzo1x_1_compress failed\n");
26235 + goto out;
26236 + }
26237 + if (*dst_len >= src_len) {
26238 + //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
26239 + goto out;
26240 + }
26241 + return;
26242 + out:
26243 + *dst_len = src_len;
26244 + return;
26245 +}
26246 +
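/*
 * Editorial note (not part of the patch): on any failure above, and
 * whenever the "compressed" result is not strictly smaller than the
 * input, *dst_len is reset to src_len -- presumably signalling the
 * caller to store the cluster uncompressed.
 */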
26247 +static void
26248 +lzo1_decompress(coa_t coa, __u8 * src_first, size_t src_len,
26249 + __u8 * dst_first, size_t *dst_len)
26250 +{
26251 + int result;
26252 +
26253 + assert("edward-851", coa == NULL);
26254 + assert("edward-852", src_len != 0);
26255 +
26256 + result = lzo1x_decompress_safe(src_first, src_len, dst_first, dst_len);
26257 + if (result != LZO_E_OK)
26258 + warning("edward-853", "lzo1x_1_decompress failed\n");
26259 + return;
26260 +}
26261 +
26262 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
26263 + [LZO1_COMPRESSION_ID] = {
26264 + .h = {
26265 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26266 + .id = LZO1_COMPRESSION_ID,
26267 + .pops = &compression_plugin_ops,
26268 + .label = "lzo1",
26269 + .desc = "lzo1 compression transform",
26270 + .linkage = {NULL, NULL}
26271 + },
26272 + .init = lzo1_init,
26273 + .overrun = lzo1_overrun,
26274 + .alloc = lzo1_alloc,
26275 + .free = lzo1_free,
26276 + .min_size_deflate = lzo1_min_size_deflate,
26277 + .checksum = reiser4_adler32,
26278 + .compress = lzo1_compress,
26279 + .decompress = lzo1_decompress
26280 + },
26281 + [GZIP1_COMPRESSION_ID] = {
26282 + .h = {
26283 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26284 + .id = GZIP1_COMPRESSION_ID,
26285 + .pops = &compression_plugin_ops,
26286 + .label = "gzip1",
26287 + .desc = "gzip1 compression transform",
26288 + .linkage = {NULL, NULL}
26289 + },
26290 + .init = gzip1_init,
26291 + .overrun = gzip1_overrun,
26292 + .alloc = gzip1_alloc,
26293 + .free = gzip1_free,
26294 + .min_size_deflate = gzip1_min_size_deflate,
26295 + .checksum = reiser4_adler32,
26296 + .compress = gzip1_compress,
26297 + .decompress = gzip1_decompress
26298 + }
26299 +};
26300 +
26301 +/*
26302 + Local variables:
26303 + c-indentation-style: "K&R"
26304 + mode-name: "LC"
26305 + c-basic-offset: 8
26306 + tab-width: 8
26307 + fill-column: 120
26308 + scroll-step: 1
26309 + End:
26310 +*/
26311 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.33/fs/reiser4/plugin/compress/compress.h
26312 --- linux-2.6.33.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 01:00:00.000000000 +0100
26313 +++ linux-2.6.33/fs/reiser4/plugin/compress/compress.h 2010-03-04 19:33:22.000000000 +0100
26314 @@ -0,0 +1,43 @@
26315 +#if !defined( __FS_REISER4_COMPRESS_H__ )
26316 +#define __FS_REISER4_COMPRESS_H__
26317 +
26318 +#include <linux/types.h>
26319 +#include <linux/string.h>
26320 +
26321 +/* transform direction */
26322 +typedef enum {
26323 + TFMA_READ, /* decrypt, decompress */
26324 + TFMA_WRITE, /* encrypt, compress */
26325 + TFMA_LAST
26326 +} tfm_action;
26327 +
26328 +/* supported compression algorithms */
26329 +typedef enum {
26330 + LZO1_COMPRESSION_ID,
26331 + GZIP1_COMPRESSION_ID,
26332 + LAST_COMPRESSION_ID,
26333 +} reiser4_compression_id;
26334 +
26335 +/* the same as pgoff_t, but units are page clusters */
26336 +typedef unsigned long cloff_t;
26337 +
26338 +/* working data of a (de)compression algorithm */
26339 +typedef void *coa_t;
26340 +
26341 +/* table for all supported (de)compression algorithms */
26342 +typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
26343 +
26344 +__u32 reiser4_adler32(char *data, __u32 len);
26345 +
26346 +#endif /* __FS_REISER4_COMPRESS_H__ */
26347 +
26348 +/* Make Linus happy.
26349 + Local variables:
26350 + c-indentation-style: "K&R"
26351 + mode-name: "LC"
26352 + c-basic-offset: 8
26353 + tab-width: 8
26354 + fill-column: 120
26355 + scroll-step: 1
26356 + End:
26357 +*/
26358 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.33/fs/reiser4/plugin/compress/compress_mode.c
26359 --- linux-2.6.33.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 01:00:00.000000000 +0100
26360 +++ linux-2.6.33/fs/reiser4/plugin/compress/compress_mode.c 2010-03-04 19:33:22.000000000 +0100
26361 @@ -0,0 +1,162 @@
26362 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26363 +/* This file contains Reiser4 compression mode plugins.
26364 +
26365 + A compression mode plugin is a set of handlers called by the
26366 + compressor at flush time. They implement heuristics, including
26367 + ones that avoid compressing incompressible data; see
26368 + http://www.namesys.com/cryptcompress_design.html for more details.
26369 +*/
26370 +#include "../../inode.h"
26371 +#include "../plugin.h"
26372 +
26373 +static int should_deflate_none(struct inode * inode, cloff_t index)
26374 +{
26375 + return 0;
26376 +}
26377 +
26378 +static int should_deflate_common(struct inode * inode, cloff_t index)
26379 +{
26380 + return compression_is_on(cryptcompress_inode_data(inode));
26381 +}
26382 +
26383 +static int discard_hook_ultim(struct inode *inode, cloff_t index)
26384 +{
26385 + turn_off_compression(cryptcompress_inode_data(inode));
26386 + return 0;
26387 +}
26388 +
26389 +static int discard_hook_lattd(struct inode *inode, cloff_t index)
26390 +{
26391 + struct cryptcompress_info * info = cryptcompress_inode_data(inode);
26392 +
26393 + assert("edward-1462",
26394 + get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
26395 + get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
26396 +
26397 + turn_off_compression(info);
26398 + if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
26399 + set_lattice_factor(info, get_lattice_factor(info) << 1);
26400 + return 0;
26401 +}
26402 +
26403 +static int accept_hook_lattd(struct inode *inode, cloff_t index)
26404 +{
26405 + turn_on_compression(cryptcompress_inode_data(inode));
26406 + set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
26407 + return 0;
26408 +}
26409 +
26410 +/* "Check on dynamic lattice" is the adaptive compression mode which
26411 + defines the following behavior:
26412 +
26413 + Compression is on: try to compress everything and turn
26414 + it off whenever a cluster is incompressible.
26415 +
26416 + Compression is off: try to compress clusters of indexes
26417 + k * FACTOR (k = 0, 1, 2, ...) and turn it on if any of
26418 + them is compressible. If incompressible, increase FACTOR */
26419 +
26420 +/* check if @index belongs to the one-dimensional lattice
26421 + of sparseness factor @factor */
26422 +static int is_on_lattice(cloff_t index, int factor)
26423 +{
26424 + return (factor ? index % factor == 0 : index == 0);
26425 +}
26426 +
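/*
 * Editorial sketch (not part of the patch): with factor = 4 the lattice
 * is {0, 4, 8, 12, ...}, so while compression is off only every 4th
 * logical cluster is probed.  Per the hooks above, an incompressible
 * probe doubles the factor (capped at MAX_LATTICE_FACTOR), while a
 * compressible one resets it to MIN_LATTICE_FACTOR and turns
 * compression back on.
 */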
26427 +static int should_deflate_lattd(struct inode * inode, cloff_t index)
26428 +{
26429 + return should_deflate_common(inode, index) ||
26430 + is_on_lattice(index,
26431 + get_lattice_factor
26432 + (cryptcompress_inode_data(inode)));
26433 +}
26434 +
26435 +/* compression mode_plugins */
26436 +compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26437 + [NONE_COMPRESSION_MODE_ID] = {
26438 + .h = {
26439 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26440 + .id = NONE_COMPRESSION_MODE_ID,
26441 + .pops = NULL,
26442 + .label = "none",
26443 + .desc = "Compress nothing",
26444 + .linkage = {NULL, NULL}
26445 + },
26446 + .should_deflate = should_deflate_none,
26447 + .accept_hook = NULL,
26448 + .discard_hook = NULL
26449 + },
26450 + /* Check-on-dynamic-lattice adaptive compression mode */
26451 + [LATTD_COMPRESSION_MODE_ID] = {
26452 + .h = {
26453 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26454 + .id = LATTD_COMPRESSION_MODE_ID,
26455 + .pops = NULL,
26456 + .label = "lattd",
26457 + .desc = "Check on dynamic lattice",
26458 + .linkage = {NULL, NULL}
26459 + },
26460 + .should_deflate = should_deflate_lattd,
26461 + .accept_hook = accept_hook_lattd,
26462 + .discard_hook = discard_hook_lattd
26463 + },
26464 + /* Check-ultimately compression mode:
26465 + Turn off compression forever as soon as we meet
26466 + incompressible data */
26467 + [ULTIM_COMPRESSION_MODE_ID] = {
26468 + .h = {
26469 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26470 + .id = ULTIM_COMPRESSION_MODE_ID,
26471 + .pops = NULL,
26472 + .label = "ultim",
26473 + .desc = "Check ultimately",
26474 + .linkage = {NULL, NULL}
26475 + },
26476 + .should_deflate = should_deflate_common,
26477 + .accept_hook = NULL,
26478 + .discard_hook = discard_hook_ultim
26479 + },
26480 + /* Force-to-compress-everything compression mode */
26481 + [FORCE_COMPRESSION_MODE_ID] = {
26482 + .h = {
26483 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26484 + .id = FORCE_COMPRESSION_MODE_ID,
26485 + .pops = NULL,
26486 + .label = "force",
26487 + .desc = "Force to compress everything",
26488 + .linkage = {NULL, NULL}
26489 + },
26490 + .should_deflate = NULL,
26491 + .accept_hook = NULL,
26492 + .discard_hook = NULL
26493 + },
26494 + /* Convert-to-extent compression mode.
26495 + In this mode items will be converted to extents and management
26496 + will be passed to (classic) unix file plugin as soon as ->write()
26497 + detects that the first complete logical cluster (of index #0) is
26498 + incompressible. */
26499 + [CONVX_COMPRESSION_MODE_ID] = {
26500 + .h = {
26501 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26502 + .id = CONVX_COMPRESSION_MODE_ID,
26503 + .pops = NULL,
26504 + .label = "conv",
26505 + .desc = "Convert to extent",
26506 + .linkage = {NULL, NULL}
26507 + },
26508 + .should_deflate = should_deflate_common,
26509 + .accept_hook = NULL,
26510 + .discard_hook = NULL
26511 + }
26512 +};
26513 +
26514 +/*
26515 + Local variables:
26516 + c-indentation-style: "K&R"
26517 + mode-name: "LC"
26518 + c-basic-offset: 8
26519 + tab-width: 8
26520 + fill-column: 120
26521 + scroll-step: 1
26522 + End:
26523 +*/
26524 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.33/fs/reiser4/plugin/compress/Makefile
26525 --- linux-2.6.33.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 01:00:00.000000000 +0100
26526 +++ linux-2.6.33/fs/reiser4/plugin/compress/Makefile 2010-03-04 19:33:22.000000000 +0100
26527 @@ -0,0 +1,5 @@
26528 +obj-$(CONFIG_REISER4_FS) += compress_plugins.o
26529 +
26530 +compress_plugins-objs := \
26531 + compress.o \
26532 + compress_mode.o
26533 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.33/fs/reiser4/plugin/crypto/cipher.c
26534 --- linux-2.6.33.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 01:00:00.000000000 +0100
26535 +++ linux-2.6.33/fs/reiser4/plugin/crypto/cipher.c 2010-03-04 19:33:22.000000000 +0100
26536 @@ -0,0 +1,37 @@
26537 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
26538 + licensing governed by reiser4/README */
26539 +/* Reiser4 cipher transform plugins */
26540 +
26541 +#include "../../debug.h"
26542 +#include "../plugin.h"
26543 +
26544 +cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
26545 + [NONE_CIPHER_ID] = {
26546 + .h = {
26547 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
26548 + .id = NONE_CIPHER_ID,
26549 + .pops = NULL,
26550 + .label = "none",
26551 + .desc = "no cipher transform",
26552 + .linkage = {NULL, NULL}
26553 + },
26554 + .alloc = NULL,
26555 + .free = NULL,
26556 + .scale = NULL,
26557 + .align_stream = NULL,
26558 + .setkey = NULL,
26559 + .encrypt = NULL,
26560 + .decrypt = NULL
26561 + }
26562 +};
26563 +
26564 +/* Make Linus happy.
26565 + Local variables:
26566 + c-indentation-style: "K&R"
26567 + mode-name: "LC"
26568 + c-basic-offset: 8
26569 + tab-width: 8
26570 + fill-column: 120
26571 + scroll-step: 1
26572 + End:
26573 +*/
26574 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.33/fs/reiser4/plugin/crypto/cipher.h
26575 --- linux-2.6.33.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 01:00:00.000000000 +0100
26576 +++ linux-2.6.33/fs/reiser4/plugin/crypto/cipher.h 2010-03-04 19:33:22.000000000 +0100
26577 @@ -0,0 +1,55 @@
26578 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26579 +/* This file contains definitions for the objects operated on
26580 + by the reiser4 key manager, which is something like a keyring
26581 + wrapped by an appropriate reiser4 plugin */
26582 +
26583 +#if !defined( __FS_REISER4_CRYPT_H__ )
26584 +#define __FS_REISER4_CRYPT_H__
26585 +
26586 +#include <linux/crypto.h>
26587 +
26588 +/* key info imported from user space */
26589 +struct reiser4_crypto_data {
26590 + int keysize; /* uninstantiated key size */
26591 + __u8 * key; /* uninstantiated key */
26592 + int keyid_size; /* size of passphrase */
26593 + __u8 * keyid; /* passphrase */
26594 +};
26595 +
26596 +/* This object contains all needed infrastructure to implement
26597 + cipher transform. This is operated (allocating, inheriting,
26598 + validating, binding to host inode, etc..) by reiser4 key manager.
26599 +
26600 + This info can be allocated in two cases:
26601 + 1. importing a key from user space.
26602 + 2. reading inode from disk */
26603 +struct reiser4_crypto_info {
26604 + struct inode * host;
26605 + struct crypto_hash * digest;
26606 + struct crypto_blkcipher * cipher;
26607 +#if 0
26608 + cipher_key_plugin * kplug; /* key manager */
26609 +#endif
26610 + __u8 * keyid; /* key fingerprint, created by the digest plugin
26611 + from the uninstantiated key and passphrase;
26612 + supposed to be stored in disk stat-data */
26613 + int inst; /* this indicates if the cipher key is
26614 + instantiated (case 1 above) */
26615 + int keysize; /* uninstantiated key size (bytes), supposed
26616 + to be stored in disk stat-data */
26617 + int keyload_count; /* number of objects which have this
26618 + crypto-stat attached */
26619 +};
26620 +
26621 +#endif /* __FS_REISER4_CRYPT_H__ */
26622 +
26623 +/*
26624 + Local variables:
26625 + c-indentation-style: "K&R"
26626 + mode-name: "LC"
26627 + c-basic-offset: 8
26628 + tab-width: 8
26629 + fill-column: 120
26630 + scroll-step: 1
26631 + End:
26632 +*/
26633 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.33/fs/reiser4/plugin/crypto/digest.c
26634 --- linux-2.6.33.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 01:00:00.000000000 +0100
26635 +++ linux-2.6.33/fs/reiser4/plugin/crypto/digest.c 2010-03-04 19:33:22.000000000 +0100
26636 @@ -0,0 +1,58 @@
26637 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26638 +
26639 +/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
26640 +/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
26641 +#include "../../debug.h"
26642 +#include "../plugin_header.h"
26643 +#include "../plugin.h"
26644 +#include "../file/cryptcompress.h"
26645 +
26646 +#include <linux/types.h>
26647 +
26648 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
26649 +
26650 +static struct crypto_hash * alloc_sha256 (void)
26651 +{
26652 +#if REISER4_SHA256
26653 + return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
26654 +#else
26655 + warning("edward-1418", "sha256 unsupported");
26656 + return ERR_PTR(-EINVAL);
26657 +#endif
26658 +}
26659 +
26660 +static void free_sha256 (struct crypto_hash * tfm)
26661 +{
26662 +#if REISER4_SHA256
26663 + crypto_free_hash(tfm);
26664 +#endif
26665 + return;
26666 +}
26667 +
26668 +/* digest plugins */
26669 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
26670 + [SHA256_32_DIGEST_ID] = {
26671 + .h = {
26672 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
26673 + .id = SHA256_32_DIGEST_ID,
26674 + .pops = NULL,
26675 + .label = "sha256_32",
26676 + .desc = "sha256_32 digest transform",
26677 + .linkage = {NULL, NULL}
26678 + },
26679 + .fipsize = sizeof(__u32),
26680 + .alloc = alloc_sha256,
26681 + .free = free_sha256
26682 + }
26683 +};
26684 +
26685 +/*
26686 + Local variables:
26687 + c-indentation-style: "K&R"
26688 + mode-name: "LC"
26689 + c-basic-offset: 8
26690 + tab-width: 8
26691 + fill-column: 120
26692 + scroll-step: 1
26693 + End:
26694 +*/
26695 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.33/fs/reiser4/plugin/dir/dir.h
26696 --- linux-2.6.33.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 01:00:00.000000000 +0100
26697 +++ linux-2.6.33/fs/reiser4/plugin/dir/dir.h 2010-03-04 19:33:22.000000000 +0100
26698 @@ -0,0 +1,36 @@
26699 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26700 + * reiser4/README */
26701 +
26702 +/* this file contains declarations of methods implementing directory plugins */
26703 +
26704 +#if !defined( __REISER4_DIR_H__ )
26705 +#define __REISER4_DIR_H__
26706 +
26707 +/*#include "../../key.h"
26708 +
26709 +#include <linux/fs.h>*/
26710 +
26711 +/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
26712 +
26713 +/* "hashed" directory methods of dir plugin */
26714 +void build_entry_key_hashed(const struct inode *, const struct qstr *,
26715 + reiser4_key *);
26716 +
26717 +/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
26718 +
26719 +/* "seekable" directory methods of dir plugin */
26720 +void build_entry_key_seekable(const struct inode *, const struct qstr *,
26721 + reiser4_key *);
26722 +
26723 +/* __REISER4_DIR_H__ */
26724 +#endif
26725 +
26726 +/*
26727 + Local variables:
26728 + c-indentation-style: "K&R"
26729 + mode-name: "LC"
26730 + c-basic-offset: 8
26731 + tab-width: 8
26732 + fill-column: 120
26733 + End:
26734 +*/
26735 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.33/fs/reiser4/plugin/dir/hashed_dir.c
26736 --- linux-2.6.33.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 01:00:00.000000000 +0100
26737 +++ linux-2.6.33/fs/reiser4/plugin/dir/hashed_dir.c 2010-03-04 19:33:22.000000000 +0100
26738 @@ -0,0 +1,81 @@
26739 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26740 + * reiser4/README */
26741 +
26742 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
26743 + names to files. */
26744 +
26745 +/*
26746 + * Hashed directory logically consists of persistent directory
26747 + * entries. A directory entry is a pair of a file name and the stat-data key of
26748 + * a file that has this name in the given directory.
26749 + *
26750 + * Directory entries are stored in the tree in the form of directory
26751 + * items. Directory item should implement dir_entry_ops portion of item plugin
26752 + * interface (see plugin/item/item.h). Hashed directory interacts with
26753 + * directory item plugin exclusively through dir_entry_ops operations.
26754 + *
26755 + * Currently there are two implementations of directory items: "simple
26756 + * directory item" (plugin/item/sde.[ch]), and "compound directory item"
26757 + * (plugin/item/cde.[ch]) with the latter being the default.
26758 + *
26759 + * There is, however, one delicate way in which directory code interacts
26760 + * with the item plugin: key assignment policy. A key for a directory item is
26761 + * chosen by directory code, and as described in kassign.c, this key contains
26762 + * a portion of file name. Directory item uses this knowledge to avoid storing
26763 + * this portion of file name twice: in the key and in the directory item body.
26764 + *
26765 + */
26766 +
26767 +#include "../../inode.h"
26768 +
26769 +void complete_entry_key(const struct inode *, const char *name,
26770 + int len, reiser4_key * result);
26771 +
26772 +/* this is implementation of build_entry_key method of dir
26773 + plugin for HASHED_DIR_PLUGIN_ID
26774 + */
26775 +void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
26776 + * (or will be) in.*/
26777 + const struct qstr *qname, /* name of file referenced
26778 + * by this entry */
26779 + reiser4_key * result /* resulting key of directory
26780 + * entry */ )
26781 +{
26782 + const char *name;
26783 + int len;
26784 +
26785 + assert("nikita-1139", dir != NULL);
26786 + assert("nikita-1140", qname != NULL);
26787 + assert("nikita-1141", qname->name != NULL);
26788 + assert("nikita-1142", result != NULL);
26789 +
26790 + name = qname->name;
26791 + len = qname->len;
26792 +
26793 + assert("nikita-2867", strlen(name) == len);
26794 +
26795 + reiser4_key_init(result);
26796 + /* locality of directory entry's key is objectid of parent
26797 + directory */
26798 + set_key_locality(result, get_inode_oid(dir));
26799 + /* minor packing locality is constant */
26800 + set_key_type(result, KEY_FILE_NAME_MINOR);
26801 + /* dot is a special case---we always want it to be the first entry
26802 + in a directory. Actually, we just want it to have the smallest
26803 + directory entry key.
26804 + */
26805 + if (len == 1 && name[0] == '.')
26806 + return;
26807 +
26808 + /* initialize part of entry key which depends on file name */
26809 + complete_entry_key(dir, name, len, result);
26810 +}
26811 +
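/*
 * Editorial note (not part of the patch): the resulting key is thus
 * (locality = oid of @dir, minor = KEY_FILE_NAME_MINOR, name-derived
 * remainder from complete_entry_key()), so all entries of a directory
 * sort together in the tree; "." returns early and keeps the all-zero
 * name part, making it the smallest (first) entry.
 */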
26812 +/* Local variables:
26813 + c-indentation-style: "K&R"
26814 + mode-name: "LC"
26815 + c-basic-offset: 8
26816 + tab-width: 8
26817 + fill-column: 120
26818 + End:
26819 +*/
26820 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.33/fs/reiser4/plugin/dir/Makefile
26821 --- linux-2.6.33.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 01:00:00.000000000 +0100
26822 +++ linux-2.6.33/fs/reiser4/plugin/dir/Makefile 2010-03-04 19:33:22.000000000 +0100
26823 @@ -0,0 +1,5 @@
26824 +obj-$(CONFIG_REISER4_FS) += dir_plugins.o
26825 +
26826 +dir_plugins-objs := \
26827 + hashed_dir.o \
26828 + seekable_dir.o
26829 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.33/fs/reiser4/plugin/dir/seekable_dir.c
26830 --- linux-2.6.33.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 01:00:00.000000000 +0100
26831 +++ linux-2.6.33/fs/reiser4/plugin/dir/seekable_dir.c 2010-03-04 19:33:22.000000000 +0100
26832 @@ -0,0 +1,46 @@
26833 +/* Copyright 2005 by Hans Reiser, licensing governed by
26834 + * reiser4/README */
26835 +
26836 +#include "../../inode.h"
26837 +
26838 +/* this is implementation of build_entry_key method of dir
26839 + plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
26840 + This is for directories where we want repeatable and restartable readdir()
26841 + even in the case of a 32-bit user-level struct dirent (readdir(3)).
26842 +*/
26843 +void
26844 +build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
26845 + reiser4_key * result)
26846 +{
26847 + oid_t objectid;
26848 +
26849 + assert("nikita-2283", dir != NULL);
26850 + assert("nikita-2284", name != NULL);
26851 + assert("nikita-2285", name->name != NULL);
26852 + assert("nikita-2286", result != NULL);
26853 +
26854 + reiser4_key_init(result);
26855 + /* locality of directory entry's key is objectid of parent
26856 + directory */
26857 + set_key_locality(result, get_inode_oid(dir));
26858 + /* minor packing locality is constant */
26859 + set_key_type(result, KEY_FILE_NAME_MINOR);
26860 + /* dot is a special case---we always want it to be the first entry
26861 + in a directory. Actually, we just want it to have the smallest
26862 + directory entry key.
26863 + */
26864 + if ((name->len == 1) && (name->name[0] == '.'))
26865 + return;
26866 +
26867 + /* objectid of key is 31 lowest bits of hash. */
26868 + objectid =
26869 + inode_hash_plugin(dir)->hash(name->name,
26870 + (int)name->len) & 0x7fffffff;
26871 +
26872 + assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
26873 + set_key_objectid(result, objectid);
26874 +
26875 + /* offset is always 0. */
26876 + set_key_offset(result, (__u64) 0);
26877 + return;
26878 +}
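/*
 * Editorial note (not part of the patch): since the objectid is just
 * the low 31 bits of the name hash and the offset is always 0, a
 * directory position fits in a 32-bit quantity, which is presumably
 * what makes readdir() repeatable for 32-bit struct dirent users (at
 * the price of more hash collisions than the default hashed scheme).
 */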
26879 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.33/fs/reiser4/plugin/dir_plugin_common.c
26880 --- linux-2.6.33.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 01:00:00.000000000 +0100
26881 +++ linux-2.6.33/fs/reiser4/plugin/dir_plugin_common.c 2010-03-04 19:33:22.000000000 +0100
26882 @@ -0,0 +1,865 @@
26883 +/* Copyright 2005 by Hans Reiser, licensing governed by
26884 + reiser4/README */
26885 +
26886 +/* this file contains typical implementations for most of methods of
26887 + directory plugin
26888 +*/
26889 +
26890 +#include "../inode.h"
26891 +
26892 +int reiser4_find_entry(struct inode *dir, struct dentry *name,
26893 + lock_handle * , znode_lock_mode, reiser4_dir_entry_desc *);
26894 +int reiser4_lookup_name(struct inode *parent, struct dentry *dentry,
26895 + reiser4_key * key);
26896 +void check_light_weight(struct inode *inode, struct inode *parent);
26897 +
26898 +/* this is common implementation of get_parent method of dir plugin
26899 + this is used by NFS kernel server to "climb" up directory tree to
26900 + check permissions
26901 + */
26902 +struct dentry *get_parent_common(struct inode *child)
26903 +{
26904 + struct super_block *s;
26905 + struct inode *parent;
26906 + struct dentry dotdot;
26907 + struct dentry *dentry;
26908 + reiser4_key key;
26909 + int result;
26910 +
26911 + /*
26912 + * lookup dotdot entry.
26913 + */
26914 +
26915 + s = child->i_sb;
26916 + memset(&dotdot, 0, sizeof(dotdot));
26917 + dotdot.d_name.name = "..";
26918 + dotdot.d_name.len = 2;
26919 + dotdot.d_op = &get_super_private(s)->ops.dentry;
26920 +
26921 + result = reiser4_lookup_name(child, &dotdot, &key);
26922 + if (result != 0)
26923 + return ERR_PTR(result);
26924 +
26925 + parent = reiser4_iget(s, &key, 1);
26926 + if (!IS_ERR(parent)) {
26927 + /*
26928 + * FIXME-NIKITA dubious: attributes are inherited from @child
26929 + * to @parent. But:
26930 + *
26931 + * (*) this is the only thing we can do
26932 + *
26933 + * (*) attributes of light-weight object are inherited
26934 + * from a parent through which object was looked up first,
26935 + * so it is ambiguous anyway.
26936 + *
26937 + */
26938 + check_light_weight(parent, child);
26939 + reiser4_iget_complete(parent);
26940 + dentry = d_obtain_alias(parent);
26941 + if (!IS_ERR(dentry))
26942 + dentry->d_op = &get_super_private(s)->ops.dentry;
26943 + } else if (PTR_ERR(parent) == -ENOENT)
26944 + dentry = ERR_PTR(RETERR(-ESTALE));
26945 + else
26946 + dentry = (void *)parent;
26947 + return dentry;
26948 +}
26949 +
26950 +/* this is common implementation of is_name_acceptable method of dir
26951 + plugin
26952 + */
26953 +int is_name_acceptable_common(const struct inode *inode, /* directory to check*/
26954 + const char *name UNUSED_ARG, /* name to check */
26955 + int len/* @name's length */)
26956 +{
26957 + assert("nikita-733", inode != NULL);
26958 + assert("nikita-734", name != NULL);
26959 + assert("nikita-735", len > 0);
26960 +
26961 + return len <= reiser4_max_filename_len(inode);
26962 +}
26963 +
26964 +/* there is no common implementation of build_entry_key method of dir
26965 + plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
26966 + plugin/dir/seekable.c:build_entry_key_seekable() for example
26967 +*/
26968 +
26969 +/* this is common implementation of build_readdir_key method of dir
26970 + plugin
26971 + see reiser4_readdir_common for more details
26972 +*/
26973 +int build_readdir_key_common(struct file *dir /* directory being read */ ,
26974 + reiser4_key * result/* where to store key */)
26975 +{
26976 + reiser4_file_fsdata *fdata;
26977 + struct inode *inode;
26978 +
26979 + assert("nikita-1361", dir != NULL);
26980 + assert("nikita-1362", result != NULL);
26981 + assert("nikita-1363", dir->f_dentry != NULL);
26982 + inode = dir->f_dentry->d_inode;
26983 + assert("nikita-1373", inode != NULL);
26984 +
26985 + fdata = reiser4_get_file_fsdata(dir);
26986 + if (IS_ERR(fdata))
26987 + return PTR_ERR(fdata);
26988 + assert("nikita-1364", fdata != NULL);
26989 + return extract_key_from_de_id(get_inode_oid(inode),
26990 + &fdata->dir.readdir.position.
26991 + dir_entry_key, result);
26992 +
26993 +}
26994 +
26995 +void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
26996 + int adj);
26997 +
26998 +/* this is common implementation of add_entry method of dir plugin
26999 +*/
27000 +int reiser4_add_entry_common(struct inode *object, /* directory to add new name
27001 + * in */
27002 + struct dentry *where, /* new name */
27003 + reiser4_object_create_data * data, /* parameters of
27004 + * new object */
27005 + reiser4_dir_entry_desc * entry /* parameters of
27006 + * new directory
27007 + * entry */)
27008 +{
27009 + int result;
27010 + coord_t *coord;
27011 + lock_handle lh;
27012 + struct reiser4_dentry_fsdata *fsdata;
27013 + reiser4_block_nr reserve;
27014 +
27015 + assert("nikita-1114", object != NULL);
27016 + assert("nikita-1250", where != NULL);
27017 +
27018 + fsdata = reiser4_get_dentry_fsdata(where);
27019 + if (unlikely(IS_ERR(fsdata)))
27020 + return PTR_ERR(fsdata);
27021 +
27022 + reserve = inode_dir_plugin(object)->estimate.add_entry(object);
27023 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
27024 + return RETERR(-ENOSPC);
27025 +
27026 + init_lh(&lh);
27027 + coord = &fsdata->dec.entry_coord;
27028 + coord_clear_iplug(coord);
27029 +
27030 + /* check for this entry in a directory. This is plugin method. */
27031 + result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
27032 + entry);
27033 + if (likely(result == -ENOENT)) {
27034 + /* add new entry. Just pass control to the directory
27035 + item plugin. */
27036 + assert("nikita-1709", inode_dir_item_plugin(object));
27037 + assert("nikita-2230", coord->node == lh.node);
27038 + reiser4_seal_done(&fsdata->dec.entry_seal);
27039 + result =
27040 + inode_dir_item_plugin(object)->s.dir.add_entry(object,
27041 + coord, &lh,
27042 + where,
27043 + entry);
27044 + if (result == 0) {
27045 + reiser4_adjust_dir_file(object, where,
27046 + fsdata->dec.pos + 1, +1);
27047 + INODE_INC_FIELD(object, i_size);
27048 + }
27049 + } else if (result == 0) {
27050 + assert("nikita-2232", coord->node == lh.node);
27051 + result = RETERR(-EEXIST);
27052 + }
27053 + done_lh(&lh);
27054 +
27055 + return result;
27056 +}
27057 +
27058 +/**
27059 + * rem_entry - remove entry from directory item
27060 + * @dir:
27061 + * @dentry:
27062 + * @entry:
27063 + * @coord:
27064 + * @lh:
27065 + *
27066 + * Checks that coordinate @coord is set properly and calls item plugin
27067 + * method to cut entry.
27068 + */
27069 +static int
27070 +rem_entry(struct inode *dir, struct dentry *dentry,
27071 + reiser4_dir_entry_desc * entry, coord_t *coord, lock_handle * lh)
27072 +{
27073 + item_plugin *iplug;
27074 + struct inode *child;
27075 +
27076 + iplug = inode_dir_item_plugin(dir);
27077 + child = dentry->d_inode;
27078 + assert("nikita-3399", child != NULL);
27079 +
27080 + /* check that we are really destroying an entry for @child */
27081 + if (REISER4_DEBUG) {
27082 + int result;
27083 + reiser4_key key;
27084 +
27085 + result = iplug->s.dir.extract_key(coord, &key);
27086 + if (result != 0)
27087 + return result;
27088 + if (get_key_objectid(&key) != get_inode_oid(child)) {
27089 + warning("nikita-3397",
27090 + "rem_entry: %#llx != %#llx\n",
27091 + (unsigned long long)get_key_objectid(&key),
27092 + (unsigned long long)get_inode_oid(child));
27093 + return RETERR(-EIO);
27094 + }
27095 + }
27096 + return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
27097 +}
27098 +
27099 +/**
27100 + * reiser4_rem_entry_common - remove entry from a directory
27101 + * @dir: directory to remove entry from
27102 + * @where: name that is being removed
27103 + * @entry: description of entry being removed
27104 + *
27105 + * This is common implementation of rem_entry method of dir plugin.
27106 + */
27107 +int reiser4_rem_entry_common(struct inode *dir,
27108 + struct dentry *dentry,
27109 + reiser4_dir_entry_desc * entry)
27110 +{
27111 + int result;
27112 + coord_t *coord;
27113 + lock_handle lh;
27114 + struct reiser4_dentry_fsdata *fsdata;
27115 + __u64 tograb;
27116 +
27117 + assert("nikita-1124", dir != NULL);
27118 + assert("nikita-1125", dentry != NULL);
27119 +
27120 + tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
27121 + result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
27122 + if (result != 0)
27123 + return RETERR(-ENOSPC);
27124 +
27125 + init_lh(&lh);
27126 +
27127 + /* check for this entry in a directory. This is plugin method. */
27128 + result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
27129 + fsdata = reiser4_get_dentry_fsdata(dentry);
27130 + if (IS_ERR(fsdata)) {
27131 + done_lh(&lh);
27132 + return PTR_ERR(fsdata);
27133 + }
27134 +
27135 + coord = &fsdata->dec.entry_coord;
27136 +
27137 + assert("nikita-3404",
27138 + get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
27139 + dir->i_size <= 1);
27140 +
27141 + coord_clear_iplug(coord);
27142 + if (result == 0) {
27143 + /* remove entry. Just pass control to the directory item
27144 + plugin. */
27145 + assert("vs-542", inode_dir_item_plugin(dir));
27146 + reiser4_seal_done(&fsdata->dec.entry_seal);
27147 + reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
27148 + result =
27149 + WITH_COORD(coord,
27150 + rem_entry(dir, dentry, entry, coord, &lh));
27151 + if (result == 0) {
27152 + if (dir->i_size >= 1)
27153 + INODE_DEC_FIELD(dir, i_size);
27154 + else {
27155 + warning("nikita-2509", "Dir %llu is runt",
27156 + (unsigned long long)
27157 + get_inode_oid(dir));
27158 + result = RETERR(-EIO);
27159 + }
27160 +
27161 + assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
27162 + dentry->d_inode->i_size != 2 ||
27163 + inode_dir_plugin(dentry->d_inode) == NULL);
27164 + }
27165 + }
27166 + done_lh(&lh);
27167 +
27168 + return result;
27169 +}
27170 +
27171 +static reiser4_block_nr estimate_init(struct inode *parent,
27172 + struct inode *object);
27173 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
27174 +
27175 +/* this is common implementation of init method of dir plugin
27176 + create "." and ".." entries
27177 +*/
27178 +int reiser4_dir_init_common(struct inode *object, /* new directory */
27179 + struct inode *parent, /* parent directory */
27180 + reiser4_object_create_data * data /* info passed
27181 + * to us, this
27182 + * is filled by
27183 + * reiser4()
27184 + * syscall in
27185 + * particular */)
27186 +{
27187 + reiser4_block_nr reserve;
27188 +
27189 + assert("nikita-680", object != NULL);
27190 + assert("nikita-681", S_ISDIR(object->i_mode));
27191 + assert("nikita-682", parent != NULL);
27192 + assert("nikita-684", data != NULL);
27193 + assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
27194 + assert("nikita-687", object->i_mode & S_IFDIR);
27195 +
27196 + reserve = estimate_init(parent, object);
27197 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
27198 + return RETERR(-ENOSPC);
27199 +
27200 + return create_dot_dotdot(object, parent);
27201 +}
27202 +
27203 +/* this is common implementation of done method of dir plugin
27204 + remove "." entry
27205 +*/
27206 +int reiser4_dir_done_common(struct inode *object/* object being deleted */)
27207 +{
27208 + int result;
27209 + reiser4_block_nr reserve;
27210 + struct dentry goodby_dots;
27211 + reiser4_dir_entry_desc entry;
27212 +
27213 + assert("nikita-1449", object != NULL);
27214 +
27215 + if (reiser4_inode_get_flag(object, REISER4_NO_SD))
27216 + return 0;
27217 +
27218 + /* of course, this can be rewritten to sweep everything in one
27219 + reiser4_cut_tree(). */
27220 + memset(&entry, 0, sizeof entry);
27221 +
27222 + /* FIXME: this done method is called from reiser4_delete_dir_common
27223 + * which reserved space already */
27224 + reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
27225 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
27226 + return RETERR(-ENOSPC);
27227 +
27228 + memset(&goodby_dots, 0, sizeof goodby_dots);
27229 + entry.obj = goodby_dots.d_inode = object;
27230 + goodby_dots.d_name.name = ".";
27231 + goodby_dots.d_name.len = 1;
27232 + result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
27233 + reiser4_free_dentry_fsdata(&goodby_dots);
27234 + if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
27235 + warning("nikita-2252", "Cannot remove dot of %lli: %i",
27236 + (unsigned long long)get_inode_oid(object), result);
27237 + return 0;
27238 +}
27239 +
27240 +/* this is common implementation of attach method of dir plugin
27241 +*/
27242 +int reiser4_attach_common(struct inode *child UNUSED_ARG,
27243 + struct inode *parent UNUSED_ARG)
27244 +{
27245 + assert("nikita-2647", child != NULL);
27246 + assert("nikita-2648", parent != NULL);
27247 +
27248 + return 0;
27249 +}
27250 +
27251 +/* this is common implementation of detach method of dir plugin
27252 + remove "..", decrease nlink on parent
27253 +*/
27254 +int reiser4_detach_common(struct inode *object, struct inode *parent)
27255 +{
27256 + int result;
27257 + struct dentry goodby_dots;
27258 + reiser4_dir_entry_desc entry;
27259 +
27260 + assert("nikita-2885", object != NULL);
27261 + assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
27262 +
27263 + memset(&entry, 0, sizeof entry);
27264 +
27265 + /* NOTE-NIKITA this only works if @parent is -the- parent of
27266 + @object, viz. object whose key is stored in dotdot
27267 + entry. Wouldn't work with hard-links on directories. */
27268 + memset(&goodby_dots, 0, sizeof goodby_dots);
27269 + entry.obj = goodby_dots.d_inode = parent;
27270 + goodby_dots.d_name.name = "..";
27271 + goodby_dots.d_name.len = 2;
27272 + result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
27273 + reiser4_free_dentry_fsdata(&goodby_dots);
27274 + if (result == 0) {
27275 + /* the dot should be the only entry remaining at this time... */
27276 + assert("nikita-3400",
27277 + object->i_size == 1 && object->i_nlink <= 2);
27278 +#if 0
27279 + /* and, together with the only name a directory can have, they
27280 + * provide for the last 2 remaining references. If we get
27281 + * here as part of error handling during mkdir, @object
27282 + * possibly has no name yet, so its nlink == 1. If we get here
27283 + * from rename (targeting empty directory), it has no name
27284 + * already, so its nlink == 1. */
27285 + assert("nikita-3401",
27286 + object->i_nlink == 2 || object->i_nlink == 1);
27287 +#endif
27288 +
27289 + /* decrement nlink of directory removed ".." pointed
27290 + to */
27291 + reiser4_del_nlink(parent, NULL, 0);
27292 + }
27293 + return result;
27294 +}
27295 +
27296 +/* this is common implementation of estimate.add_entry method of
27297 + dir plugin
27298 + estimation of adding entry which supposes that entry is inserting a
27299 + unit into item
27300 +*/
27301 +reiser4_block_nr estimate_add_entry_common(const struct inode *inode)
27302 +{
27303 + return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
27304 +}
27305 +
27306 +/* this is common implementation of estimate.rem_entry method of dir
27307 + plugin
27308 +*/
27309 +reiser4_block_nr estimate_rem_entry_common(const struct inode *inode)
27310 +{
27311 + return estimate_one_item_removal(reiser4_tree_by_inode(inode));
27312 +}
27313 +
27314 +/* this is common implementation of estimate.unlink method of dir
27315 + plugin
27316 +*/
27317 +reiser4_block_nr
27318 +dir_estimate_unlink_common(const struct inode *parent,
27319 + const struct inode *object)
27320 +{
27321 + reiser4_block_nr res;
27322 +
27323 + /* hashed_rem_entry(object) */
27324 + res = inode_dir_plugin(object)->estimate.rem_entry(object);
27325 + /* del_nlink(parent) */
27326 + res += 2 * inode_file_plugin(parent)->estimate.update(parent);
27327 +
27328 + return res;
27329 +}
27330 +
27331 +/*
27332 + * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
27333 + * methods: if @inode is a light-weight file, setup its credentials
27334 + * that are not stored in the stat-data in this case
27335 + */
27336 +void check_light_weight(struct inode *inode, struct inode *parent)
27337 +{
27338 + if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
27339 + inode->i_uid = parent->i_uid;
27340 + inode->i_gid = parent->i_gid;
27341 + /* clear light-weight flag. If inode would be read by any
27342 + other name, [ug]id wouldn't change. */
27343 + reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
27344 + }
27345 +}
27346 +
27347 +/* looks for the name specified in @dentry in directory @parent; if the name
27348 + is found, the key of the object the entry points to is stored in @key */
27349 +int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup
27350 + * for name in */
27351 + struct dentry *dentry, /* name to look for */
27352 + reiser4_key * key/* place to store key */)
27353 +{
27354 + int result;
27355 + coord_t *coord;
27356 + lock_handle lh;
27357 + const char *name;
27358 + int len;
27359 + reiser4_dir_entry_desc entry;
27360 + struct reiser4_dentry_fsdata *fsdata;
27361 +
27362 + assert("nikita-1247", parent != NULL);
27363 + assert("nikita-1248", dentry != NULL);
27364 + assert("nikita-1123", dentry->d_name.name != NULL);
27365 + assert("vs-1486",
27366 + dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
27367 +
27368 + name = dentry->d_name.name;
27369 + len = dentry->d_name.len;
27370 +
27371 + if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
27372 + /* some arbitrary error code to return */
27373 + return RETERR(-ENAMETOOLONG);
27374 +
27375 + fsdata = reiser4_get_dentry_fsdata(dentry);
27376 + if (IS_ERR(fsdata))
27377 + return PTR_ERR(fsdata);
27378 +
27379 + coord = &fsdata->dec.entry_coord;
27380 + coord_clear_iplug(coord);
27381 + init_lh(&lh);
27382 +
27383 + /* find entry in a directory. This is plugin method. */
27384 + result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
27385 + &entry);
27386 + if (result == 0) {
27387 + /* entry was found, extract object key from it. */
27388 + result =
27389 + WITH_COORD(coord,
27390 + item_plugin_by_coord(coord)->s.dir.
27391 + extract_key(coord, key));
27392 + }
27393 + done_lh(&lh);
27394 + return result;
27395 +
27396 +}
27397 +
27398 +/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
27399 +static reiser4_block_nr
27400 +estimate_init(struct inode *parent, struct inode *object)
27401 +{
27402 + reiser4_block_nr res = 0;
27403 +
27404 + assert("vpf-321", parent != NULL);
27405 + assert("vpf-322", object != NULL);
27406 +
27407 + /* hashed_add_entry(object) */
27408 + res += inode_dir_plugin(object)->estimate.add_entry(object);
27409 + /* reiser4_add_nlink(object) */
27410 + res += inode_file_plugin(object)->estimate.update(object);
27411 + /* hashed_add_entry(object) */
27412 + res += inode_dir_plugin(object)->estimate.add_entry(object);
27413 + /* reiser4_add_nlink(parent) */
27414 + res += inode_file_plugin(parent)->estimate.update(parent);
27415 +
27416 + return res;
27417 +}
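/*
 * Editorial note (not part of the patch): the four summands above match
 * the work done by create_dot_dotdot() below -- two add_entry calls
 * (for "." and "..") plus two nlink updates (on @object and @parent).
 */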
27418 +
27419 +/* helper function for reiser4_dir_init_common(). Create "." and ".." */
27420 +static int create_dot_dotdot(struct inode *object/* object to create dot and
27421 + * dotdot for */ ,
27422 + struct inode *parent/* parent of @object */)
27423 +{
27424 + int result;
27425 + struct dentry dots_entry;
27426 + reiser4_dir_entry_desc entry;
27427 +
27428 + assert("nikita-688", object != NULL);
27429 + assert("nikita-689", S_ISDIR(object->i_mode));
27430 + assert("nikita-691", parent != NULL);
27431 +
27432 +	/* We store dot and dotdot as normal directory entries. This is
27433 +	   not strictly necessary, because almost all information stored
27434 +	   in them is already in the stat-data of the directory; the only
27435 +	   thing missing is the objectid of the grand-parent directory,
27436 +	   which could easily be added there as an extension.
27437 +
27438 +	   It is done this way nevertheless, because not storing dot
27439 +	   and dotdot would lead to the following complications:
27440 +
27441 +	   . special case handling in ->lookup().
27442 +	   . addition of another extension to the sd.
27443 +	   . dependency on key allocation policy for stat data.
27444 +	 */
27446 +
27447 + memset(&entry, 0, sizeof entry);
27448 + memset(&dots_entry, 0, sizeof dots_entry);
27449 + entry.obj = dots_entry.d_inode = object;
27450 + dots_entry.d_name.name = ".";
27451 + dots_entry.d_name.len = 1;
27452 + result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
27453 + reiser4_free_dentry_fsdata(&dots_entry);
27454 +
27455 + if (result == 0) {
27456 + result = reiser4_add_nlink(object, object, 0);
27457 + if (result == 0) {
27458 + entry.obj = dots_entry.d_inode = parent;
27459 + dots_entry.d_name.name = "..";
27460 + dots_entry.d_name.len = 2;
27461 + result = reiser4_add_entry_common(object,
27462 + &dots_entry, NULL, &entry);
27463 + reiser4_free_dentry_fsdata(&dots_entry);
27464 + /* if creation of ".." failed, iput() will delete
27465 + object with ".". */
27466 + if (result == 0) {
27467 + result = reiser4_add_nlink(parent, object, 0);
27468 + if (result != 0)
27469 + /*
27470 + * if we failed to bump i_nlink, try
27471 + * to remove ".."
27472 + */
27473 + reiser4_detach_common(object, parent);
27474 + }
27475 + }
27476 + }
27477 +
27478 + if (result != 0) {
27479 +		/*
27480 +		 * in the case of error, at least update the stat-data so
27481 +		 * that ->i_nlink updates are not left lingering.
27482 +		 */
27483 + reiser4_update_sd(object);
27484 + reiser4_update_sd(parent);
27485 + }
27486 +
27487 + return result;
27488 +}
27489 +
27490 +/*
27491 + * return 0 iff @coord contains a directory entry for the file with the name
27492 + * @name.
27493 + */
27494 +static int
27495 +check_item(const struct inode *dir, const coord_t *coord, const char *name)
27496 +{
27497 + item_plugin *iplug;
27498 + char buf[DE_NAME_BUF_LEN];
27499 +
27500 + iplug = item_plugin_by_coord(coord);
27501 + if (iplug == NULL) {
27502 + warning("nikita-1135", "Cannot get item plugin");
27503 + print_coord("coord", coord, 1);
27504 + return RETERR(-EIO);
27505 + } else if (item_id_by_coord(coord) !=
27506 + item_id_by_plugin(inode_dir_item_plugin(dir))) {
27507 +		/* the item id of the current item does not match the id of
27508 +		   items a directory is built of */
27509 + warning("nikita-1136", "Wrong item plugin");
27510 + print_coord("coord", coord, 1);
27511 + return RETERR(-EIO);
27512 + }
27513 + assert("nikita-1137", iplug->s.dir.extract_name);
27514 +
27515 + /* Compare name stored in this entry with name we are looking for.
27516 +
27517 + NOTE-NIKITA Here should go code for support of something like
27518 + unicode, code tables, etc.
27519 + */
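+	/* strcmp() returns 0 on an exact match; !! folds any non-zero
+	   result to 1, so 0 here means the entry was found */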
27520 + return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
27521 +}
27522 +
27523 +static int
27524 +check_entry(const struct inode *dir, coord_t *coord, const struct qstr *name)
27525 +{
27526 + return WITH_COORD(coord, check_item(dir, coord, name->name));
27527 +}
27528 +
27529 +/*
27530 + * argument package used by entry_actor to scan entries with identical keys.
27531 + */
27532 +struct entry_actor_args {
27533 + /* name we are looking for */
27534 + const char *name;
27535 + /* key of directory entry. entry_actor() scans through sequence of
27536 + * items/units having the same key */
27537 + reiser4_key *key;
27538 +	/* how many entries with a duplicate key were scanned so far */
27539 + int non_uniq;
27540 +#if REISER4_USE_COLLISION_LIMIT
27541 + /* scan limit */
27542 + int max_non_uniq;
27543 +#endif
27544 + /* return parameter: set to true, if ->name wasn't found */
27545 + int not_found;
27546 + /* what type of lock to take when moving to the next node during
27547 + * scan */
27548 + znode_lock_mode mode;
27549 +
27550 + /* last coord that was visited during scan */
27551 + coord_t last_coord;
27552 + /* last node locked during scan */
27553 + lock_handle last_lh;
27554 + /* inode of directory */
27555 + const struct inode *inode;
27556 +};
27557 +
27558 +/* Function called by reiser4_find_entry() to look for given name
27559 + in the directory. */
27560 +static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
27561 + coord_t *coord /* current coord */ ,
27562 + lock_handle * lh /* current lock handle */ ,
27563 + void *entry_actor_arg/* argument to scan */)
27564 +{
27565 + reiser4_key unit_key;
27566 + struct entry_actor_args *args;
27567 +
27568 + assert("nikita-1131", tree != NULL);
27569 + assert("nikita-1132", coord != NULL);
27570 + assert("nikita-1133", entry_actor_arg != NULL);
27571 +
27572 + args = entry_actor_arg;
27573 + ++args->non_uniq;
27574 +#if REISER4_USE_COLLISION_LIMIT
27575 + if (args->non_uniq > args->max_non_uniq) {
27576 + args->not_found = 1;
27577 + /* hash collision overflow. */
27578 + return RETERR(-EBUSY);
27579 + }
27580 +#endif
27581 +
27582 + /*
27583 + * did we just reach the end of the sequence of items/units with
27584 + * identical keys?
27585 + */
27586 + if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
27587 + assert("nikita-1791",
27588 + keylt(args->key, unit_key_by_coord(coord, &unit_key)));
27589 + args->not_found = 1;
27590 + args->last_coord.between = AFTER_UNIT;
27591 + return 0;
27592 + }
27593 +
27594 + coord_dup(&args->last_coord, coord);
27595 + /*
27596 +	 * did the scan just move to the next node?
27597 + */
27598 + if (args->last_lh.node != lh->node) {
27599 + int lock_result;
27600 +
27601 + /*
27602 + * if so, lock new node with the mode requested by the caller
27603 + */
27604 + done_lh(&args->last_lh);
27605 + assert("nikita-1896", znode_is_any_locked(lh->node));
27606 + lock_result = longterm_lock_znode(&args->last_lh, lh->node,
27607 + args->mode, ZNODE_LOCK_HIPRI);
27608 + if (lock_result != 0)
27609 + return lock_result;
27610 + }
27611 + return check_item(args->inode, coord, args->name);
27612 +}
27613 +
27614 +/* Look for given @name within directory @dir.
27615 +
27616 +   This is called during lookup, creation and removal of directory
27617 +   entries, and from reiser4_rename_common().
27618 +
27619 +   First calculate the key that the directory entry for @name would
27620 +   have, and search for this key in the tree. If such a key is found,
27621 +   scan all items with the same key, checking the name in each entry.
27622 +*/
27623 +int reiser4_find_entry(struct inode *dir, /* directory to scan */
27624 + struct dentry *de, /* name to search for */
27625 + lock_handle * lh, /* resulting lock handle */
27626 + znode_lock_mode mode, /* required lock mode */
27627 + reiser4_dir_entry_desc * entry /* parameters of found
27628 + directory entry */)
27629 +{
27630 + const struct qstr *name;
27631 + seal_t *seal;
27632 + coord_t *coord;
27633 + int result;
27634 + __u32 flags;
27635 + struct de_location *dec;
27636 + struct reiser4_dentry_fsdata *fsdata;
27637 +
27638 + assert("nikita-1130", lh != NULL);
27639 + assert("nikita-1128", dir != NULL);
27640 +
27641 + name = &de->d_name;
27642 + assert("nikita-1129", name != NULL);
27643 +
27644 +	/* dentry private data don't require a lock, because dentry
27645 +	   manipulations are protected by i_mutex on the parent.
27646 +
27647 +	   This is not so for inodes, because an inode has no single
27648 +	   parent.
27649 +	 */
27650 + fsdata = reiser4_get_dentry_fsdata(de);
27651 + if (IS_ERR(fsdata))
27652 + return PTR_ERR(fsdata);
27653 + dec = &fsdata->dec;
27654 +
27655 + coord = &dec->entry_coord;
27656 + coord_clear_iplug(coord);
27657 + seal = &dec->entry_seal;
27658 + /* compose key of directory entry for @name */
27659 + inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
27660 +
27661 + if (reiser4_seal_is_set(seal)) {
27662 + /* check seal */
27663 + result = reiser4_seal_validate(seal, coord, &entry->key,
27664 + lh, mode, ZNODE_LOCK_LOPRI);
27665 + if (result == 0) {
27666 + /* key was found. Check that it is really item we are
27667 + looking for. */
27668 + result = check_entry(dir, coord, name);
27669 + if (result == 0)
27670 + return 0;
27671 + }
27672 + }
27673 + flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
27674 + /*
27675 + * find place in the tree where directory item should be located.
27676 + */
27677 + result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
27678 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
27679 + flags, NULL/*ra_info */);
27680 + if (result == CBK_COORD_FOUND) {
27681 + struct entry_actor_args arg;
27682 +
27683 + /* fast path: no hash collisions */
27684 + result = check_entry(dir, coord, name);
27685 + if (result == 0) {
27686 + reiser4_seal_init(seal, coord, &entry->key);
27687 + dec->pos = 0;
27688 + } else if (result > 0) {
27689 + /* Iterate through all units with the same keys. */
27690 + arg.name = name->name;
27691 + arg.key = &entry->key;
27692 + arg.not_found = 0;
27693 + arg.non_uniq = 0;
27694 +#if REISER4_USE_COLLISION_LIMIT
27695 + arg.max_non_uniq = max_hash_collisions(dir);
27696 + assert("nikita-2851", arg.max_non_uniq > 1);
27697 +#endif
27698 + arg.mode = mode;
27699 + arg.inode = dir;
27700 + coord_init_zero(&arg.last_coord);
27701 + init_lh(&arg.last_lh);
27702 +
27703 + result = reiser4_iterate_tree
27704 + (reiser4_tree_by_inode(dir),
27705 + coord, lh,
27706 + entry_actor, &arg, mode, 1);
27707 +			/* the end of the tree or an extent was reached during
27708 +			   scanning */
27709 + if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
27710 + /* step back */
27711 + done_lh(lh);
27712 +
27713 + result = zload(arg.last_coord.node);
27714 + if (result == 0) {
27715 + coord_clear_iplug(&arg.last_coord);
27716 + coord_dup(coord, &arg.last_coord);
27717 + move_lh(lh, &arg.last_lh);
27718 + result = RETERR(-ENOENT);
27719 + zrelse(arg.last_coord.node);
27720 + --arg.non_uniq;
27721 + }
27722 + }
27723 +
27724 + done_lh(&arg.last_lh);
27725 + if (result == 0)
27726 + reiser4_seal_init(seal, coord, &entry->key);
27727 +
27728 + if (result == 0 || result == -ENOENT) {
27729 + assert("nikita-2580", arg.non_uniq > 0);
27730 + dec->pos = arg.non_uniq - 1;
27731 + }
27732 + }
27733 + } else
27734 + dec->pos = -1;
27735 + return result;
27736 +}
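+
+/*
+ * Illustration of the flow above: for a name whose key collides with
+ * an existing entry, the fast-path check_entry() reports a mismatch
+ * and entry_actor() walks the items sharing the key until either the
+ * name matches (0), the key sequence ends (-ENOENT), or, with
+ * REISER4_USE_COLLISION_LIMIT, the scan limit aborts it (-EBUSY).
+ * dec->pos then records the entry's position among its collisions.
+ */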
27737 +
27738 +/*
27739 + Local variables:
27740 + c-indentation-style: "K&R"
27741 + mode-name: "LC"
27742 + c-basic-offset: 8
27743 + tab-width: 8
27744 + fill-column: 120
27745 + scroll-step: 1
27746 + End:
27747 +*/
27748 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format40.c
27749 --- linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 01:00:00.000000000 +0100
27750 +++ linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format40.c 2010-03-04 19:33:22.000000000 +0100
27751 @@ -0,0 +1,655 @@
27752 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27753 +
27754 +#include "../../debug.h"
27755 +#include "../../dformat.h"
27756 +#include "../../key.h"
27757 +#include "../node/node.h"
27758 +#include "../space/space_allocator.h"
27759 +#include "disk_format40.h"
27760 +#include "../plugin.h"
27761 +#include "../../txnmgr.h"
27762 +#include "../../jnode.h"
27763 +#include "../../tree.h"
27764 +#include "../../super.h"
27765 +#include "../../wander.h"
27766 +#include "../../inode.h"
27767 +#include "../../ktxnmgrd.h"
27768 +#include "../../status_flags.h"
27769 +
27770 +#include <linux/types.h> /* for __u?? */
27771 +#include <linux/fs.h> /* for struct super_block */
27772 +#include <linux/buffer_head.h>
27773 +
27774 +/* reiser 4.0 default disk layout */
27775 +
27776 +/* Amount of free blocks needed to perform release_format40 when fs gets
27777 + mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
27778 + & tx record. */
27779 +#define RELEASE_RESERVED 4
27780 +
27781 +/* The greatest supported format40 version number */
27782 +#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
27783 +
27784 +/* This flag indicates that backup should be updated
27785 + (the update is performed by fsck) */
27786 +#define FORMAT40_UPDATE_BACKUP (1 << 31)
27787 +
27788 +/* functions to access fields of format40_disk_super_block */
27789 +static __u64 get_format40_block_count(const format40_disk_super_block * sb)
27790 +{
27791 + return le64_to_cpu(get_unaligned(&sb->block_count));
27792 +}
27793 +
27794 +static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
27795 +{
27796 + return le64_to_cpu(get_unaligned(&sb->free_blocks));
27797 +}
27798 +
27799 +static __u64 get_format40_root_block(const format40_disk_super_block * sb)
27800 +{
27801 + return le64_to_cpu(get_unaligned(&sb->root_block));
27802 +}
27803 +
27804 +static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
27805 +{
27806 + return le16_to_cpu(get_unaligned(&sb->tree_height));
27807 +}
27808 +
27809 +static __u64 get_format40_file_count(const format40_disk_super_block * sb)
27810 +{
27811 + return le64_to_cpu(get_unaligned(&sb->file_count));
27812 +}
27813 +
27814 +static __u64 get_format40_oid(const format40_disk_super_block * sb)
27815 +{
27816 + return le64_to_cpu(get_unaligned(&sb->oid));
27817 +}
27818 +
27819 +static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
27820 +{
27821 + return le32_to_cpu(get_unaligned(&sb->mkfs_id));
27822 +}
27823 +
27824 +static __u64 get_format40_flags(const format40_disk_super_block * sb)
27825 +{
27826 + return le64_to_cpu(get_unaligned(&sb->flags));
27827 +}
27828 +
27829 +static __u32 get_format40_version(const format40_disk_super_block * sb)
27830 +{
27831 + return le32_to_cpu(get_unaligned(&sb->version)) &
27832 + ~FORMAT40_UPDATE_BACKUP;
27833 +}
27834 +
27835 +static int update_backup_version(const format40_disk_super_block * sb)
27836 +{
27837 + return (le32_to_cpu(get_unaligned(&sb->version)) &
27838 + FORMAT40_UPDATE_BACKUP);
27839 +}
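+
+/*
+ * Worked example (illustration): the version field packs the
+ * FORMAT40_UPDATE_BACKUP flag into its top bit. A filesystem at
+ * format version 5 with a pending backup update stores 0x80000005
+ * on disk: get_format40_version() masks the flag off and returns 5,
+ * while update_backup_version() reports the flag as set.
+ */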
27840 +
27841 +static int update_disk_version(const format40_disk_super_block * sb)
27842 +{
27843 + return (get_format40_version(sb) < FORMAT40_VERSION);
27844 +}
27845 +
27846 +static int incomplete_compatibility(const format40_disk_super_block * sb)
27847 +{
27848 + return (get_format40_version(sb) > FORMAT40_VERSION);
27849 +}
27850 +
27851 +static format40_super_info *get_sb_info(struct super_block *super)
27852 +{
27853 + return &get_super_private(super)->u.format40;
27854 +}
27855 +
27856 +static int consult_diskmap(struct super_block *s)
27857 +{
27858 + format40_super_info *info;
27859 + journal_location *jloc;
27860 +
27861 + info = get_sb_info(s);
27862 + jloc = &get_super_private(s)->jloc;
27863 + /* Default format-specific locations, if there is nothing in
27864 + * diskmap */
27865 + jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
27866 + jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
27867 + info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
27868 +#ifdef CONFIG_REISER4_BADBLOCKS
27869 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
27870 + &jloc->footer);
27871 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
27872 + &jloc->header);
27873 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
27874 + &info->loc.super);
27875 +#endif
27876 + return 0;
27877 +}
27878 +
27879 +/* find any valid super block of disk_format40 (even if the first
27880 +   super block is destroyed); changes the block numbers of the actual
27881 +   journal header/footer (jh/jf) if needed */
27882 +static struct buffer_head *find_a_disk_format40_super_block(struct super_block
27883 + *s)
27884 +{
27885 + struct buffer_head *super_bh;
27886 + format40_disk_super_block *disk_sb;
27887 + format40_super_info *info;
27888 +
27889 + assert("umka-487", s != NULL);
27890 +
27891 + info = get_sb_info(s);
27892 +
27893 + super_bh = sb_bread(s, info->loc.super);
27894 + if (super_bh == NULL)
27895 + return ERR_PTR(RETERR(-EIO));
27896 +
27897 + disk_sb = (format40_disk_super_block *) super_bh->b_data;
27898 + if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
27899 + brelse(super_bh);
27900 + return ERR_PTR(RETERR(-EINVAL));
27901 + }
27902 +
27903 + reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
27904 + reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
27905 + le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
27906 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
27907 +
27908 + return super_bh;
27909 +}
27910 +
27911 +/* find the most recent version of super block. This is called after journal is
27912 + replayed */
27913 +static struct buffer_head *read_super_block(struct super_block *s)
27914 +{
27915 +	/* Here the most recent superblock copy has to be read. However,
27916 +	   as journal replay isn't complete, we are using the
27917 +	   find_a_disk_format40_super_block() function. */
27918 + return find_a_disk_format40_super_block(s);
27919 +}
27920 +
27921 +static int get_super_jnode(struct super_block *s)
27922 +{
27923 + reiser4_super_info_data *sbinfo = get_super_private(s);
27924 + jnode *sb_jnode;
27925 + int ret;
27926 +
27927 + sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
27928 +
27929 + ret = jload(sb_jnode);
27930 +
27931 + if (ret) {
27932 + reiser4_drop_io_head(sb_jnode);
27933 + return ret;
27934 + }
27935 +
27936 + pin_jnode_data(sb_jnode);
27937 + jrelse(sb_jnode);
27938 +
27939 + sbinfo->u.format40.sb_jnode = sb_jnode;
27940 +
27941 + return 0;
27942 +}
27943 +
27944 +static void done_super_jnode(struct super_block *s)
27945 +{
27946 + jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
27947 +
27948 + if (sb_jnode) {
27949 + unpin_jnode_data(sb_jnode);
27950 + reiser4_drop_io_head(sb_jnode);
27951 + }
27952 +}
27953 +
27954 +typedef enum format40_init_stage {
27955 + NONE_DONE = 0,
27956 + CONSULT_DISKMAP,
27957 + FIND_A_SUPER,
27958 + INIT_JOURNAL_INFO,
27959 + INIT_STATUS,
27960 + JOURNAL_REPLAY,
27961 + READ_SUPER,
27962 + KEY_CHECK,
27963 + INIT_OID,
27964 + INIT_TREE,
27965 + JOURNAL_RECOVER,
27966 + INIT_SA,
27967 + INIT_JNODE,
27968 + ALL_DONE
27969 +} format40_init_stage;
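+
+/*
+ * try_init_format40() reports the stage it reached back to
+ * init_format_format40(), whose error path switches on that stage and
+ * falls through the cases in reverse order, undoing exactly the steps
+ * that had completed before the failure.
+ */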
27970 +
27971 +static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
27972 +{
27973 + format40_disk_super_block *sb_copy;
27974 +
27975 + sb_copy = kmalloc(sizeof(format40_disk_super_block),
27976 + reiser4_ctx_gfp_mask_get());
27977 + if (sb_copy == NULL)
27978 + return ERR_PTR(RETERR(-ENOMEM));
27979 + memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
27980 + sizeof(format40_disk_super_block));
27981 + return sb_copy;
27982 +}
27983 +
27984 +static int check_key_format(const format40_disk_super_block *sb_copy)
27985 +{
27986 + if (!equi(REISER4_LARGE_KEY,
27987 + get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
27988 + warning("nikita-3228", "Key format mismatch. "
27989 + "Only %s keys are supported.",
27990 + REISER4_LARGE_KEY ? "large" : "small");
27991 + return RETERR(-EINVAL);
27992 + }
27993 + return 0;
27994 +}
27995 +
27996 +/**
27997 + * try_init_format40
27998 + * @super:
27999 + * @stage:
28000 + *
28001 + */
28002 +static int try_init_format40(struct super_block *super,
28003 + format40_init_stage *stage)
28004 +{
28005 + int result;
28006 + struct buffer_head *super_bh;
28007 + reiser4_super_info_data *sbinfo;
28008 + format40_disk_super_block *sb_copy;
28009 + tree_level height;
28010 + reiser4_block_nr root_block;
28011 + node_plugin *nplug;
28012 +
28013 + assert("vs-475", super != NULL);
28014 + assert("vs-474", get_super_private(super));
28015 +
28016 + *stage = NONE_DONE;
28017 +
28018 + result = consult_diskmap(super);
28019 + if (result)
28020 + return result;
28021 + *stage = CONSULT_DISKMAP;
28022 +
28023 + super_bh = find_a_disk_format40_super_block(super);
28024 + if (IS_ERR(super_bh))
28025 + return PTR_ERR(super_bh);
28026 + brelse(super_bh);
28027 + *stage = FIND_A_SUPER;
28028 +
28029 +	/* ok, we are sure that the filesystem format is format40 */
28030 +
28031 + /* map jnodes for journal control blocks (header, footer) to disk */
28032 + result = reiser4_init_journal_info(super);
28033 + if (result)
28034 + return result;
28035 + *stage = INIT_JOURNAL_INFO;
28036 +
28038 +	/* Now check its state */
28039 + result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
28040 + if (result != 0 && result != -EINVAL)
28041 + /* -EINVAL means there is no magic, so probably just old
28042 + * fs. */
28043 + return result;
28044 + *stage = INIT_STATUS;
28045 +
28046 + result = reiser4_status_query(NULL, NULL);
28047 + if (result == REISER4_STATUS_MOUNT_WARN)
28048 + notice("vpf-1363", "Warning: mounting %s with errors.",
28049 + super->s_id);
28050 + if (result == REISER4_STATUS_MOUNT_RO)
28051 + notice("vpf-1364", "Warning: mounting %s with fatal errors,"
28052 + " forcing read-only mount.", super->s_id);
28053 + result = reiser4_journal_replay(super);
28054 + if (result)
28055 + return result;
28056 + *stage = JOURNAL_REPLAY;
28057 +
28058 + super_bh = read_super_block(super);
28059 + if (IS_ERR(super_bh))
28060 + return PTR_ERR(super_bh);
28061 + *stage = READ_SUPER;
28062 +
28063 + /* allocate and make a copy of format40_disk_super_block */
28064 + sb_copy = copy_sb(super_bh);
28065 + brelse(super_bh);
28066 +
28067 + if (IS_ERR(sb_copy))
28068 + return PTR_ERR(sb_copy);
28069 + printk("reiser4: %s: found disk format 4.0.%u.\n",
28070 + super->s_id,
28071 + get_format40_version(sb_copy));
28072 + if (incomplete_compatibility(sb_copy))
28073 + printk("reiser4: Warning: The last completely supported "
28074 + "version of disk format40 is %u. Some objects of "
28075 + "the semantic tree can be unaccessible.\n",
28076 + FORMAT40_VERSION);
28077 + /* make sure that key format of kernel and filesystem match */
28078 + result = check_key_format(sb_copy);
28079 + if (result) {
28080 + kfree(sb_copy);
28081 + return result;
28082 + }
28083 + *stage = KEY_CHECK;
28084 +
28085 + result = oid_init_allocator(super, get_format40_file_count(sb_copy),
28086 + get_format40_oid(sb_copy));
28087 + if (result) {
28088 + kfree(sb_copy);
28089 + return result;
28090 + }
28091 + *stage = INIT_OID;
28092 +
28093 + /* get things necessary to init reiser4_tree */
28094 + root_block = get_format40_root_block(sb_copy);
28095 + height = get_format40_tree_height(sb_copy);
28096 + nplug = node_plugin_by_id(NODE40_ID);
28097 +
28098 + /* initialize reiser4_super_info_data */
28099 + sbinfo = get_super_private(super);
28100 + assert("", sbinfo->tree.super == super);
28101 + /* init reiser4_tree for the filesystem */
28102 + result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
28103 + if (result) {
28104 + kfree(sb_copy);
28105 + return result;
28106 + }
28107 + *stage = INIT_TREE;
28108 +
28109 + /*
28110 + * initialize reiser4_super_info_data with data from format40 super
28111 + * block
28112 + */
28113 + sbinfo->default_uid = 0;
28114 + sbinfo->default_gid = 0;
28115 + sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
28116 + /* number of blocks in filesystem and reserved space */
28117 + reiser4_set_block_count(super, get_format40_block_count(sb_copy));
28118 + sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
28119 +	sbinfo->version = get_format40_version(sb_copy);
28120 +
28121 +	if (update_backup_version(sb_copy))
28122 +		printk("reiser4: Warning: metadata backup is not updated. "
28123 +		       "Please run 'fsck.reiser4 --fix' on %s.\n",
28124 +		       super->s_id);
28125 +	/* sb_copy must not be freed before its last use above */
28126 +	kfree(sb_copy);
28126 +
28127 + sbinfo->fsuid = 0;
28128 + sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
28129 + * are not supported */
28130 + sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
28131 + * layout 40 are
28132 + * of one
28133 + * plugin */
28134 + /* sbinfo->tmgr is initialized already */
28135 +
28136 + /* recover sb data which were logged separately from sb block */
28137 +
28138 + /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
28139 + * oid_init_allocator() and reiser4_set_free_blocks() with new
28140 + * data. What's the reason to call them above? */
28141 + result = reiser4_journal_recover_sb_data(super);
28142 + if (result != 0)
28143 + return result;
28144 + *stage = JOURNAL_RECOVER;
28145 +
28146 + /*
28147 +	 * Set the number of used blocks. The number of used blocks is stored
28148 +	 * neither in the on-disk super block nor in the journal footer. At
28149 +	 * this moment the actual values of the total and free block counters
28150 +	 * are set in the reiser4 super block (in-memory structure), so we can
28151 +	 * calculate the number of used blocks from them.
28152 + */
28153 + reiser4_set_data_blocks(super,
28154 + reiser4_block_count(super) -
28155 + reiser4_free_blocks(super));
28156 +
28157 +#if REISER4_DEBUG
28158 + sbinfo->min_blocks_used = 16 /* reserved area */ +
28159 + 2 /* super blocks */ +
28160 + 2 /* journal footer and header */ ;
28161 +#endif
28162 +
28163 + /* init disk space allocator */
28164 + result = sa_init_allocator(reiser4_get_space_allocator(super),
28165 + super, NULL);
28166 + if (result)
28167 + return result;
28168 + *stage = INIT_SA;
28169 +
28170 + result = get_super_jnode(super);
28171 + if (result == 0)
28172 + *stage = ALL_DONE;
28173 + return result;
28174 +}
28175 +
28176 +/* plugin->u.format.get_ready */
28177 +int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
28178 +{
28179 + int result;
28180 + format40_init_stage stage;
28181 +
28182 + result = try_init_format40(s, &stage);
28183 + switch (stage) {
28184 + case ALL_DONE:
28185 + assert("nikita-3458", result == 0);
28186 + break;
28187 + case INIT_JNODE:
28188 + done_super_jnode(s);
28189 + case INIT_SA:
28190 + sa_destroy_allocator(reiser4_get_space_allocator(s), s);
28191 + case JOURNAL_RECOVER:
28192 + case INIT_TREE:
28193 + reiser4_done_tree(&get_super_private(s)->tree);
28194 + case INIT_OID:
28195 + case KEY_CHECK:
28196 + case READ_SUPER:
28197 + case JOURNAL_REPLAY:
28198 + case INIT_STATUS:
28199 + reiser4_status_finish();
28200 + case INIT_JOURNAL_INFO:
28201 + reiser4_done_journal_info(s);
28202 + case FIND_A_SUPER:
28203 + case CONSULT_DISKMAP:
28204 + case NONE_DONE:
28205 + break;
28206 + default:
28207 + impossible("nikita-3457", "init stage: %i", stage);
28208 + }
28209 +
28210 + if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
28211 + return RETERR(-ENOSPC);
28212 +
28213 + return result;
28214 +}
28215 +
28216 +static void pack_format40_super(const struct super_block *s, char *data)
28217 +{
28218 + format40_disk_super_block *super_data =
28219 + (format40_disk_super_block *) data;
28220 +
28221 + reiser4_super_info_data *sbinfo = get_super_private(s);
28222 +
28223 + assert("zam-591", data != NULL);
28224 +
28225 + put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
28226 + &super_data->free_blocks);
28227 +
28228 + put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
28229 + &super_data->root_block);
28230 +
28231 + put_unaligned(cpu_to_le64(oid_next(s)),
28232 + &super_data->oid);
28233 +
28234 + put_unaligned(cpu_to_le64(oids_used(s)),
28235 + &super_data->file_count);
28236 +
28237 + put_unaligned(cpu_to_le16(sbinfo->tree.height),
28238 + &super_data->tree_height);
28239 +
28240 + if (update_disk_version(super_data)) {
28241 + __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
28242 +
28243 + put_unaligned(cpu_to_le32(version), &super_data->version);
28244 + }
28245 +}
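+
+/*
+ * Note that only the mutable fields (free block count, root block,
+ * next oid, file count, tree height and, on version upgrade, the
+ * version field) are repacked here; the rest of the block, including
+ * the magic and the total block count, stays as it was read from disk.
+ */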
28246 +
28247 +/* plugin->u.format.log_super
28248 + return a jnode which should be added to transaction when the super block
28249 + gets logged */
28250 +jnode *log_super_format40(struct super_block *s)
28251 +{
28252 + jnode *sb_jnode;
28253 +
28254 + sb_jnode = get_super_private(s)->u.format40.sb_jnode;
28255 +
28256 + jload(sb_jnode);
28257 +
28258 + pack_format40_super(s, jdata(sb_jnode));
28259 +
28260 + jrelse(sb_jnode);
28261 +
28262 + return sb_jnode;
28263 +}
28264 +
28265 +/* plugin->u.format.release */
28266 +int release_format40(struct super_block *s)
28267 +{
28268 + int ret;
28269 + reiser4_super_info_data *sbinfo;
28270 +
28271 + sbinfo = get_super_private(s);
28272 + assert("zam-579", sbinfo != NULL);
28273 +
28274 + if (!rofs_super(s)) {
28275 + ret = reiser4_capture_super_block(s);
28276 + if (ret != 0)
28277 + warning("vs-898",
28278 + "reiser4_capture_super_block failed: %d",
28279 + ret);
28280 +
28281 + ret = txnmgr_force_commit_all(s, 1);
28282 + if (ret != 0)
28283 + warning("jmacd-74438", "txn_force failed: %d", ret);
28284 +
28285 + all_grabbed2free();
28286 + }
28287 +
28288 + sa_destroy_allocator(&sbinfo->space_allocator, s);
28289 + reiser4_done_journal_info(s);
28290 + done_super_jnode(s);
28291 +
28292 + rcu_barrier();
28293 + reiser4_done_tree(&sbinfo->tree);
28294 +	/* call rcu_barrier() again, because some znodes were "released"
28295 +	 * in reiser4_done_tree(). */
28296 + rcu_barrier();
28297 +
28298 + return 0;
28299 +}
28300 +
28301 +#define FORMAT40_ROOT_LOCALITY 41
28302 +#define FORMAT40_ROOT_OBJECTID 42
28303 +
28304 +/* plugin->u.format.root_dir_key */
28305 +const reiser4_key *root_dir_key_format40(const struct super_block *super
28306 + UNUSED_ARG)
28307 +{
28308 + static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
28309 + .el = {
28310 + __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
28311 +#if REISER4_LARGE_KEY
28312 + ON_LARGE_KEY(0ull,)
28313 +#endif
28314 + __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
28315 + 0ull
28316 + }
28317 + };
28318 +
28319 + return &FORMAT40_ROOT_DIR_KEY;
28320 +}
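+
+/*
+ * In other words, the root directory's stat-data lives at the fixed
+ * key (locality 41, objectid 42, offset 0); the first element packs
+ * the locality together with the stat-data minor key type.
+ */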
28321 +
28322 +/* plugin->u.format.check_open.
28323 +   Check the opened object for validity. For now it checks the oid &
28324 +   locality only; this can be improved later, and its work may depend
28325 +   on the mount options. */
28326 +int check_open_format40(const struct inode *object)
28327 +{
28328 + oid_t max, oid;
28329 +
28330 + max = oid_next(object->i_sb) - 1;
28331 +
28332 + /* Check the oid. */
28333 + oid = get_inode_oid(object);
28334 + if (oid > max) {
28335 + warning("vpf-1360", "The object with the oid %llu "
28336 + "greater then the max used oid %llu found.",
28337 + (unsigned long long)oid, (unsigned long long)max);
28338 +
28339 + return RETERR(-EIO);
28340 + }
28341 +
28342 + /* Check the locality. */
28343 + oid = reiser4_inode_data(object)->locality_id;
28344 + if (oid > max) {
28345 + warning("vpf-1361", "The object with the locality %llu "
28346 + "greater then the max used oid %llu found.",
28347 + (unsigned long long)oid, (unsigned long long)max);
28348 +
28349 + return RETERR(-EIO);
28350 + }
28351 +
28352 + return 0;
28353 +}
28354 +
28355 +/* plugin->u.format.version_update.
28356 + Perform all version update operations from the on-disk
28357 + format40_disk_super_block.version on disk to FORMAT40_VERSION.
28358 + */
28359 +int version_update_format40(struct super_block *super) {
28360 + txn_handle * trans;
28361 + lock_handle lh;
28362 + txn_atom *atom;
28363 + int ret;
28364 +
28365 +	/* Nothing to do on an RO mount, or if the on-disk version is not older. */
28366 + if (super->s_flags & MS_RDONLY)
28367 + return 0;
28368 +
28369 + if (get_super_private(super)->version >= FORMAT40_VERSION)
28370 + return 0;
28371 +
28372 + printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
28373 + "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
28374 + "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
28375 +
28376 + /* Mark the uber znode dirty to call log_super on write_logs. */
28377 + init_lh(&lh);
28378 + ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
28379 + ZNODE_LOCK_HIPRI, &lh);
28380 + if (ret != 0)
28381 + return ret;
28382 +
28383 + znode_make_dirty(lh.node);
28384 + done_lh(&lh);
28385 +
28386 + /* Update the backup blocks. */
28387 +
28388 + /* Force write_logs immediately. */
28389 + trans = get_current_context()->trans;
28390 + atom = get_current_atom_locked();
28391 + assert("vpf-1906", atom != NULL);
28392 +
28393 + spin_lock_txnh(trans);
28394 + return force_commit_atom(trans);
28395 +}
28396 +
28397 +/* Make Linus happy.
28398 + Local variables:
28399 + c-indentation-style: "K&R"
28400 + mode-name: "LC"
28401 + c-basic-offset: 8
28402 + tab-width: 8
28403 + fill-column: 120
28404 + scroll-step: 1
28405 + End:
28406 +*/
28407 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format40.h
28408 --- linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 01:00:00.000000000 +0100
28409 +++ linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format40.h 2010-03-04 19:33:22.000000000 +0100
28410 @@ -0,0 +1,109 @@
28411 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28412 +
28413 +/* this file contains:
28414 +   - definition of the on-disk super block of the standard disk layout for
28415 + reiser 4.0 (layout 40)
28416 + - definition of layout 40 specific portion of in-core super block
28417 + - declarations of functions implementing methods of layout plugin
28418 + for layout 40
28419 + - declarations of functions used to get/set fields in layout 40 super block
28420 +*/
28421 +
28422 +#ifndef __DISK_FORMAT40_H__
28423 +#define __DISK_FORMAT40_H__
28424 +
28425 +/* magic for default reiser4 layout */
28426 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
28427 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
28428 +
28429 +#include "../../dformat.h"
28430 +
28431 +#include <linux/fs.h> /* for struct super_block */
28432 +
28433 +typedef enum {
28434 + FORMAT40_LARGE_KEYS
28435 +} format40_flags;
28436 +
28437 +/* on-disk super block for format 40. It is 512 bytes long */
28438 +typedef struct format40_disk_super_block {
28439 + /* 0 */ d64 block_count;
28440 +	/* number of blocks in the filesystem */
28441 + /* 8 */ d64 free_blocks;
28442 + /* number of free blocks */
28443 + /* 16 */ d64 root_block;
28444 + /* filesystem tree root block */
28445 + /* 24 */ d64 oid;
28446 + /* smallest free objectid */
28447 + /* 32 */ d64 file_count;
28448 + /* number of files in a filesystem */
28449 + /* 40 */ d64 flushes;
28450 +	/* number of times the super block
28451 +	   was flushed. Needed if format 40
28452 +	   ever has several super blocks */
28453 + /* 48 */ d32 mkfs_id;
28454 + /* unique identifier of fs */
28455 + /* 52 */ char magic[16];
28456 + /* magic string ReIsEr40FoRmAt */
28457 + /* 68 */ d16 tree_height;
28458 + /* height of filesystem tree */
28459 + /* 70 */ d16 formatting_policy;
28460 + /* not used anymore */
28461 + /* 72 */ d64 flags;
28462 + /* 80 */ d32 version;
28463 + /* on-disk format version number
28464 + initially assigned by mkfs as the greatest format40
28465 + version number supported by reiser4progs and updated
28466 +	   at mount time in accordance with the greatest format40
28467 +	   version number supported by the kernel.
28468 +	   It is used by fsck to catch possible corruption and
28469 + for various compatibility issues */
28470 + /* 84 */ char not_used[428];
28471 +} format40_disk_super_block;
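+
+/*
+ * The offsets in the comments above add up as advertised: 84 bytes of
+ * fields plus 428 unused bytes give the declared 512. A compile-time
+ * check of this invariant could look like the sketch below (assuming
+ * the kernel's BUILD_BUG_ON from <linux/kernel.h>):
+ *
+ *	BUILD_BUG_ON(sizeof(format40_disk_super_block) != 512);
+ */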
28472 +
28473 +/* format 40 specific part of reiser4_super_info_data */
28474 +typedef struct format40_super_info {
28475 +/* format40_disk_super_block actual_sb; */
28476 + jnode *sb_jnode;
28477 + struct {
28478 + reiser4_block_nr super;
28479 + } loc;
28480 +} format40_super_info;
28481 +
28482 +/* Defines for journal header and footer respectively. */
28483 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
28484 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
28485 +
28486 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
28487 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
28488 +
28489 +#define FORMAT40_STATUS_BLOCKNR \
28490 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
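+
+/*
+ * For illustration: assuming the usual REISER4_MASTER_OFFSET of 65536
+ * bytes and 4096-byte pages, the master super block occupies block 16,
+ * so the journal header, journal footer and status blocks land in
+ * blocks 19, 20 and 21 respectively, while FORMAT40_OFFSET places the
+ * format40 super block in block 17.
+ */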
28491 +
28492 +/* Diskmap declarations */
28493 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
28494 +#define FORMAT40_SUPER 1
28495 +#define FORMAT40_JH 2
28496 +#define FORMAT40_JF 3
28497 +
28498 +/* declarations of functions implementing methods of the layout plugin
28499 +   for format 40. The functions themselves are in disk_format40.c */
28500 +extern int init_format_format40(struct super_block *, void *data);
28501 +extern const reiser4_key *root_dir_key_format40(const struct super_block *);
28502 +extern int release_format40(struct super_block *s);
28503 +extern jnode *log_super_format40(struct super_block *s);
28504 +extern int check_open_format40(const struct inode *object);
28505 +extern int version_update_format40(struct super_block *super);
28506 +
28507 +/* __DISK_FORMAT40_H__ */
28508 +#endif
28509 +
28510 +/* Make Linus happy.
28511 + Local variables:
28512 + c-indentation-style: "K&R"
28513 + mode-name: "LC"
28514 + c-basic-offset: 8
28515 + tab-width: 8
28516 + fill-column: 120
28517 + scroll-step: 1
28518 + End:
28519 +*/
28520 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format.c
28521 --- linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 01:00:00.000000000 +0100
28522 +++ linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format.c 2010-03-04 19:33:22.000000000 +0100
28523 @@ -0,0 +1,38 @@
28524 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28525 +
28526 +#include "../../debug.h"
28527 +#include "../plugin_header.h"
28528 +#include "disk_format40.h"
28529 +#include "disk_format.h"
28530 +#include "../plugin.h"
28531 +
28532 +/* initialization of disk layout plugins */
28533 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
28534 + [FORMAT40_ID] = {
28535 + .h = {
28536 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
28537 + .id = FORMAT40_ID,
28538 + .pops = NULL,
28539 + .label = "reiser40",
28540 + .desc = "standard disk layout for reiser40",
28541 + .linkage = {NULL, NULL}
28542 + },
28543 + .init_format = init_format_format40,
28544 + .root_dir_key = root_dir_key_format40,
28545 + .release = release_format40,
28546 + .log_super = log_super_format40,
28547 + .check_open = check_open_format40,
28548 + .version_update = version_update_format40
28549 + }
28550 +};
28551 +
28552 +/* Make Linus happy.
28553 + Local variables:
28554 + c-indentation-style: "K&R"
28555 + mode-name: "LC"
28556 + c-basic-offset: 8
28557 + tab-width: 8
28558 + fill-column: 120
28559 + scroll-step: 1
28560 + End:
28561 +*/
28562 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format.h
28563 --- linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 01:00:00.000000000 +0100
28564 +++ linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format.h 2010-03-04 19:33:22.000000000 +0100
28565 @@ -0,0 +1,27 @@
28566 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28567 +
28568 +/* identifiers for disk layouts; they are also used as indexes in the
28569 +   array of disk format plugins */
28570 +
28571 +#if !defined( __REISER4_DISK_FORMAT_H__ )
28572 +#define __REISER4_DISK_FORMAT_H__
28573 +
28574 +typedef enum {
28575 + /* standard reiser4 disk layout plugin id */
28576 + FORMAT40_ID,
28577 + LAST_FORMAT_ID
28578 +} disk_format_id;
28579 +
28580 +/* __REISER4_DISK_FORMAT_H__ */
28581 +#endif
28582 +
28583 +/* Make Linus happy.
28584 + Local variables:
28585 + c-indentation-style: "K&R"
28586 + mode-name: "LC"
28587 + c-basic-offset: 8
28588 + tab-width: 8
28589 + fill-column: 120
28590 + scroll-step: 1
28591 + End:
28592 +*/
28593 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.33/fs/reiser4/plugin/disk_format/Makefile
28594 --- linux-2.6.33.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 01:00:00.000000000 +0100
28595 +++ linux-2.6.33/fs/reiser4/plugin/disk_format/Makefile 2010-03-04 19:33:22.000000000 +0100
28596 @@ -0,0 +1,5 @@
28597 +obj-$(CONFIG_REISER4_FS) += df_plugins.o
28598 +
28599 +df_plugins-objs := \
28600 + disk_format40.o \
28601 + disk_format.o
28602 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/fibration.c linux-2.6.33/fs/reiser4/plugin/fibration.c
28603 --- linux-2.6.33.orig/fs/reiser4/plugin/fibration.c 1970-01-01 01:00:00.000000000 +0100
28604 +++ linux-2.6.33/fs/reiser4/plugin/fibration.c 2010-03-04 19:33:22.000000000 +0100
28605 @@ -0,0 +1,175 @@
28606 +/* Copyright 2004 by Hans Reiser, licensing governed by
28607 + * reiser4/README */
28608 +
28609 +/* Directory fibrations */
28610 +
28611 +/*
28612 + * Suppose we have a directory tree with sources of some project. During
28613 + * compilation .o files are created within this tree. This makes access
28614 + * to the original source files less efficient, because source files are
28615 + * now "diluted" by object files: default directory plugin uses prefix
28616 + * of a file name as a part of the key for directory entry (and this
28617 + * part is also inherited by the key of file body). This means that
28618 + * foo.o will be located close to foo.c and foo.h in the tree.
28619 + *
28620 + * To avoid this effect directory plugin fill highest 7 (unused
28621 + * originally) bits of the second component of the directory entry key
28622 + * by bit-pattern depending on the file name (see
28623 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
28624 + * "fibre". Fibre of the file name key is inherited by key of stat data
28625 + * and keys of file body (in the case of REISER4_LARGE_KEY).
28626 + *
28627 + * Fibre for a given file is chosen by per-directory fibration
28628 + * plugin. Names within given fibre are ordered lexicographically.
28629 + */
28630 +
28631 +#include "../debug.h"
28632 +#include "plugin_header.h"
28633 +#include "plugin.h"
28634 +#include "../super.h"
28635 +#include "../inode.h"
28636 +
28637 +#include <linux/types.h>
28638 +
28639 +static const int fibre_shift = 57;
28640 +
28641 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
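+
+/*
+ * For example (illustration): with fibre_shift = 57, FIBRE_NO(1) is
+ * 1ULL << 57, so under the dot-o fibration below "main.o" receives
+ * fibre 1 and sorts after every name in the default fibre 0, while
+ * under ext-1 "foo.h" lands in fibre 'h', i.e. FIBRE_NO(104).
+ */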
28642 +
28643 +/*
28644 + * Trivial fibration: all files of directory are just ordered
28645 + * lexicographically.
28646 + */
28647 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
28648 +{
28649 + return FIBRE_NO(0);
28650 +}
28651 +
28652 +/*
28653 + * dot-o fibration: place .o files after all others.
28654 + */
28655 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
28656 +{
28657 + /* special treatment for .*\.o */
28658 + if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
28659 + return FIBRE_NO(1);
28660 + else
28661 + return FIBRE_NO(0);
28662 +}
28663 +
28664 +/*
28665 + * ext.1 fibration: subdivide the directory into 128 fibres, one for each
28666 + * 7-bit extension character (file "foo.h" goes into fibre "h"), plus a
28667 + * default fibre for the rest.
28668 + */
28669 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
28670 +{
28671 + if (len > 2 && name[len - 2] == '.')
28672 + return FIBRE_NO(name[len - 1]);
28673 + else
28674 + return FIBRE_NO(0);
28675 +}
28676 +
28677 +/*
28678 + * ext.3 fibration: try to separate files with different 3-character
28679 + * extensions from each other.
28680 + */
28681 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
28682 +{
28683 + if (len > 4 && name[len - 4] == '.')
28684 + return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
28685 + else
28686 + return FIBRE_NO(0);
28687 +}
28688 +
28689 +static int change_fibration(struct inode *inode,
28690 + reiser4_plugin * plugin,
28691 + pset_member memb)
28692 +{
28693 + int result;
28694 +
28695 + assert("nikita-3503", inode != NULL);
28696 + assert("nikita-3504", plugin != NULL);
28697 +
28698 + assert("nikita-3505", is_reiser4_inode(inode));
28699 + assert("nikita-3506", inode_dir_plugin(inode) != NULL);
28700 + assert("nikita-3507",
28701 + plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
28702 +
28703 + result = 0;
28704 + if (inode_fibration_plugin(inode) == NULL ||
28705 + inode_fibration_plugin(inode)->h.id != plugin->h.id) {
28706 + if (is_dir_empty(inode) == 0)
28707 + result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
28708 + PSET_FIBRATION, plugin);
28709 + else
28710 + result = RETERR(-ENOTEMPTY);
28711 +
28712 + }
28713 + return result;
28714 +}
28715 +
28716 +static reiser4_plugin_ops fibration_plugin_ops = {
28717 + .init = NULL,
28718 + .load = NULL,
28719 + .save_len = NULL,
28720 + .save = NULL,
28721 + .change = change_fibration
28722 +};
28723 +
28724 +/* fibration plugins */
28725 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
28726 + [FIBRATION_LEXICOGRAPHIC] = {
28727 + .h = {
28728 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28729 + .id = FIBRATION_LEXICOGRAPHIC,
28730 + .pops = &fibration_plugin_ops,
28731 + .label = "lexicographic",
28732 + .desc = "no fibration",
28733 + .linkage = {NULL, NULL}
28734 + },
28735 + .fibre = fibre_trivial
28736 + },
28737 + [FIBRATION_DOT_O] = {
28738 + .h = {
28739 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28740 + .id = FIBRATION_DOT_O,
28741 + .pops = &fibration_plugin_ops,
28742 + .label = "dot-o",
28743 + .desc = "fibrate .o files separately",
28744 + .linkage = {NULL, NULL}
28745 + },
28746 + .fibre = fibre_dot_o
28747 + },
28748 + [FIBRATION_EXT_1] = {
28749 + .h = {
28750 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28751 + .id = FIBRATION_EXT_1,
28752 + .pops = &fibration_plugin_ops,
28753 + .label = "ext-1",
28754 + .desc = "fibrate file by single character extension",
28755 + .linkage = {NULL, NULL}
28756 + },
28757 + .fibre = fibre_ext_1
28758 + },
28759 + [FIBRATION_EXT_3] = {
28760 + .h = {
28761 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28762 + .id = FIBRATION_EXT_3,
28763 + .pops = &fibration_plugin_ops,
28764 + .label = "ext-3",
28765 + .desc = "fibrate file by three character extension",
28766 + .linkage = {NULL, NULL}
28767 + },
28768 + .fibre = fibre_ext_3
28769 + }
28770 +};
28771 +
28772 +/*
28773 + * Local variables:
28774 + * c-indentation-style: "K&R"
28775 + * mode-name: "LC"
28776 + * c-basic-offset: 8
28777 + * tab-width: 8
28778 + * fill-column: 79
28779 + * End:
28780 + */
28781 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/fibration.h linux-2.6.33/fs/reiser4/plugin/fibration.h
28782 --- linux-2.6.33.orig/fs/reiser4/plugin/fibration.h 1970-01-01 01:00:00.000000000 +0100
28783 +++ linux-2.6.33/fs/reiser4/plugin/fibration.h 2010-03-04 19:33:22.000000000 +0100
28784 @@ -0,0 +1,37 @@
28785 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
28786 +
28787 +/* Fibration plugin used by hashed directory plugin to segment content
28788 + * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
28789 +
28790 +#if !defined(__FS_REISER4_PLUGIN_FIBRATION_H__)
28791 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
28792 +
28793 +#include "plugin_header.h"
28794 +
28795 +typedef struct fibration_plugin {
28796 + /* generic fields */
28797 + plugin_header h;
28798 +
28799 + __u64(*fibre) (const struct inode *dir, const char *name, int len);
28800 +} fibration_plugin;
28801 +
28802 +typedef enum {
28803 + FIBRATION_LEXICOGRAPHIC,
28804 + FIBRATION_DOT_O,
28805 + FIBRATION_EXT_1,
28806 + FIBRATION_EXT_3,
28807 + LAST_FIBRATION_ID
28808 +} reiser4_fibration_id;
28809 +
28810 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
28811 +#endif
28812 +
28813 +/* Make Linus happy.
28814 + Local variables:
28815 + c-indentation-style: "K&R"
28816 + mode-name: "LC"
28817 + c-basic-offset: 8
28818 + tab-width: 8
28819 + fill-column: 120
28820 + End:
28821 +*/
28822 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.33/fs/reiser4/plugin/file/cryptcompress.c
28823 --- linux-2.6.33.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 01:00:00.000000000 +0100
28824 +++ linux-2.6.33/fs/reiser4/plugin/file/cryptcompress.c 2010-03-04 19:33:22.000000000 +0100
28825 @@ -0,0 +1,3803 @@
28826 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
28827 + reiser4/README */
28828 +/*
28829 + * Written by Edward Shishkin.
28830 + *
28831 + * Implementations of inode/file/address_space operations
28832 + * specific to the cryptcompress file plugin, which manages
28833 + * regular files built of compressed and/or encrypted bodies.
28834 + * See http://dev.namesys.com/CryptcompressPlugin for details.
28835 + */
28836 +
28837 +#include "../../inode.h"
28838 +#include "../cluster.h"
28839 +#include "../object.h"
28840 +#include "../../tree_walk.h"
28841 +#include "cryptcompress.h"
28842 +
28843 +#include <linux/pagevec.h>
28844 +#include <asm/uaccess.h>
28845 +#include <linux/swap.h>
28846 +#include <linux/writeback.h>
28847 +#include <linux/random.h>
28848 +#include <linux/scatterlist.h>
28849 +
28850 +/*
28851 + Managing primary and secondary caches by Reiser4
28852 + cryptcompress file plugin. Synchronization scheme.
28853 +
28854 +
28855 + +------------------+
28856 + +------------------->| tfm stream |
28857 + | | (compressed data)|
28858 + flush | +------------------+
28859 + +-----------------+ |
28860 + |(->)longterm lock| V
28861 +--+ writepages() | | +-***-+ reiser4 +---+
28862 + | | +--+ | *** | storage tree | |
28863 + | | | +-***-+ (primary cache)| |
28864 +u | write() (secondary| cache) V / | \ | |
28865 +s | ----> +----+ +----+ +----+ +----+ +-***** ******* **----+ ----> | d |
28866 +e | | | |page cluster | | | **disk cluster** | | i |
28867 +r | <---- +----+ +----+ +----+ +----+ +-***** **********----+ <---- | s |
28868 + | read() ^ ^ | | k |
28869 + | | (->)longterm lock| | page_io()| |
28870 + | | +------+ | |
28871 +--+ readpages() | | +---+
28872 + | V
28873 + | +------------------+
28874 + +--------------------| tfm stream |
28875 + | (plain text) |
28876 + +------------------+
28877 +*/
28878 +
28879 +/* get cryptcompress specific portion of inode */
28880 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode)
28881 +{
28882 + return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
28883 +}
28884 +
28885 +/* plugin->u.file.init_inode_data */
28886 +void init_inode_data_cryptcompress(struct inode *inode,
28887 + reiser4_object_create_data * crd,
28888 + int create)
28889 +{
28890 + struct cryptcompress_info *data;
28891 +
28892 + data = cryptcompress_inode_data(inode);
28893 + assert("edward-685", data != NULL);
28894 +
28895 + memset(data, 0, sizeof(*data));
28896 +
28897 + mutex_init(&data->checkin_mutex);
28898 + data->trunc_index = ULONG_MAX;
28899 + turn_on_compression(data);
28900 + set_lattice_factor(data, MIN_LATTICE_FACTOR);
28901 + init_inode_ordering(inode, crd, create);
28902 +}
28903 +
28904 +/* The following is a part of reiser4 cipher key manager
28905 + which is called when opening/creating a cryptcompress file */
28906 +
28907 +/* get/set cipher key info */
28908 +struct reiser4_crypto_info * inode_crypto_info (struct inode * inode)
28909 +{
28910 + assert("edward-90", inode != NULL);
28911 + assert("edward-91", reiser4_inode_data(inode) != NULL);
28912 + return cryptcompress_inode_data(inode)->crypt;
28913 +}
28914 +
28915 +static void set_inode_crypto_info (struct inode * inode,
28916 + struct reiser4_crypto_info * info)
28917 +{
28918 + cryptcompress_inode_data(inode)->crypt = info;
28919 +}
28920 +
28921 +/* allocate a cipher key info */
28922 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode)
28923 +{
28924 + struct reiser4_crypto_info *info;
28925 + int fipsize;
28926 +
28927 + info = kzalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
28928 + if (!info)
28929 + return ERR_PTR(-ENOMEM);
28930 +
28931 + fipsize = inode_digest_plugin(inode)->fipsize;
28932 + info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
28933 + if (!info->keyid) {
28934 + kfree(info);
28935 + return ERR_PTR(-ENOMEM);
28936 + }
28937 + info->host = inode;
28938 + return info;
28939 +}
28940 +
28941 +#if 0
28942 +/* allocate/free low-level info for cipher and digest
28943 + transforms */
28944 +static int alloc_crypto_tfms(struct reiser4_crypto_info * info)
28945 +{
28946 + struct crypto_blkcipher * ctfm = NULL;
28947 + struct crypto_hash * dtfm = NULL;
28948 + cipher_plugin * cplug = inode_cipher_plugin(info->host);
28949 + digest_plugin * dplug = inode_digest_plugin(info->host);
28950 +
28951 + if (cplug->alloc) {
28952 + ctfm = cplug->alloc();
28953 + if (IS_ERR(ctfm)) {
28954 + warning("edward-1364",
28955 + "Can not allocate info for %s\n",
28956 + cplug->h.desc);
28957 + return RETERR(PTR_ERR(ctfm));
28958 + }
28959 + }
28960 + info_set_cipher(info, ctfm);
28961 + if (dplug->alloc) {
28962 + dtfm = dplug->alloc();
28963 + if (IS_ERR(dtfm)) {
28964 + warning("edward-1365",
28965 + "Can not allocate info for %s\n",
28966 + dplug->h.desc);
28967 + goto unhappy_with_digest;
28968 + }
28969 + }
28970 + info_set_digest(info, dtfm);
28971 + return 0;
28972 + unhappy_with_digest:
28973 + if (cplug->free) {
28974 + cplug->free(ctfm);
28975 + info_set_cipher(info, NULL);
28976 + }
28977 + return RETERR(PTR_ERR(dtfm));
28978 +}
28979 +#endif
28980 +
28981 +static void
28982 +free_crypto_tfms(struct reiser4_crypto_info * info)
28983 +{
28984 + assert("edward-1366", info != NULL);
28985 + if (!info_get_cipher(info)) {
28986 + assert("edward-1601", !info_get_digest(info));
28987 + return;
28988 + }
28989 + inode_cipher_plugin(info->host)->free(info_get_cipher(info));
28990 + info_set_cipher(info, NULL);
28991 + inode_digest_plugin(info->host)->free(info_get_digest(info));
28992 + info_set_digest(info, NULL);
28993 + return;
28994 +}
28995 +
28996 +#if 0
28997 +/* create a key fingerprint for disk stat-data */
28998 +static int create_keyid (struct reiser4_crypto_info * info,
28999 + struct reiser4_crypto_data * data)
29000 +{
29001 + int ret = -ENOMEM;
29002 + size_t blk, pad;
29003 + __u8 * dmem;
29004 + __u8 * cmem;
29005 + struct hash_desc ddesc;
29006 + struct blkcipher_desc cdesc;
29007 + struct scatterlist sg;
29008 +
29009 + assert("edward-1367", info != NULL);
29010 + assert("edward-1368", info->keyid != NULL);
29011 +
29012 + ddesc.tfm = info_get_digest(info);
29013 + ddesc.flags = 0;
29014 + cdesc.tfm = info_get_cipher(info);
29015 + cdesc.flags = 0;
29016 +
29017 + dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
29018 + reiser4_ctx_gfp_mask_get());
29019 + if (!dmem)
29020 + goto exit1;
29021 +
29022 + blk = crypto_blkcipher_blocksize(cdesc.tfm);
29023 +
29024 + pad = data->keyid_size % blk;
29025 + pad = (pad ? blk - pad : 0);
29026 +
29027 + cmem = kmalloc((size_t)data->keyid_size + pad,
29028 + reiser4_ctx_gfp_mask_get());
29029 + if (!cmem)
29030 + goto exit2;
29031 + memcpy(cmem, data->keyid, data->keyid_size);
29032 + memset(cmem + data->keyid_size, 0, pad);
29033 +
29034 + sg_init_one(&sg, cmem, data->keyid_size + pad);
29035 +
29036 + ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
29037 + data->keyid_size + pad);
29038 + if (ret) {
29039 + warning("edward-1369",
29040 + "encryption failed flags=%x\n", cdesc.flags);
29041 + goto exit3;
29042 + }
29043 + ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
29044 + if (ret) {
29045 + warning("edward-1602",
29046 + "digest failed flags=%x\n", ddesc.flags);
29047 + goto exit3;
29048 + }
29049 + memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
29050 + exit3:
29051 + kfree(cmem);
29052 + exit2:
29053 + kfree(dmem);
29054 + exit1:
29055 + return ret;
29056 +}
29057 +#endif
29058 +
29059 +static void destroy_keyid(struct reiser4_crypto_info * info)
29060 +{
29061 + assert("edward-1370", info != NULL);
29062 + assert("edward-1371", info->keyid != NULL);
29063 + kfree(info->keyid);
29064 + return;
29065 +}
29066 +
29067 +static void __free_crypto_info (struct inode * inode)
29068 +{
29069 + struct reiser4_crypto_info * info = inode_crypto_info(inode);
29070 + assert("edward-1372", info != NULL);
29071 +
29072 + free_crypto_tfms(info);
29073 + destroy_keyid(info);
29074 + kfree(info);
29075 +}
29076 +
29077 +#if 0
29078 +static void instantiate_crypto_info(struct reiser4_crypto_info * info)
29079 +{
29080 + assert("edward-1373", info != NULL);
29081 + assert("edward-1374", info->inst == 0);
29082 + info->inst = 1;
29083 +}
29084 +#endif
29085 +
29086 +static void uninstantiate_crypto_info(struct reiser4_crypto_info * info)
29087 +{
29088 + assert("edward-1375", info != NULL);
29089 + info->inst = 0;
29090 +}
29091 +
29092 +#if 0
29093 +static int is_crypto_info_instantiated(struct reiser4_crypto_info * info)
29094 +{
29095 + return info->inst;
29096 +}
29097 +
29098 +static int inode_has_cipher_key(struct inode * inode)
29099 +{
29100 + assert("edward-1376", inode != NULL);
29101 + return inode_crypto_info(inode) &&
29102 + is_crypto_info_instantiated(inode_crypto_info(inode));
29103 +}
29104 +#endif
29105 +
29106 +static void free_crypto_info (struct inode * inode)
29107 +{
29108 + uninstantiate_crypto_info(inode_crypto_info(inode));
29109 + __free_crypto_info(inode);
29110 +}
29111 +
29112 +static int need_cipher(struct inode * inode)
29113 +{
29114 + return inode_cipher_plugin(inode) !=
29115 + cipher_plugin_by_id(NONE_CIPHER_ID);
29116 +}
29117 +
29118 +/* Parse @data, which contains an (uninstantiated) cipher key imported
29119 + from user space, create a low-level cipher info and attach it to
29120 + the @object. On success, the info contains an instantiated key */
29121 +#if 0
29122 +struct reiser4_crypto_info * create_crypto_info(struct inode * object,
29123 + struct reiser4_crypto_data * data)
29124 +{
29125 + int ret;
29126 + struct reiser4_crypto_info * info;
29127 +
29128 + assert("edward-1377", data != NULL);
29129 + assert("edward-1378", need_cipher(object));
29130 +
29131 + if (inode_file_plugin(object) !=
29132 + file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
29133 + return ERR_PTR(-EINVAL);
29134 +
29135 + info = reiser4_alloc_crypto_info(object);
29136 + if (IS_ERR(info))
29137 + return info;
29138 + ret = alloc_crypto_tfms(info);
29139 + if (ret)
29140 + goto err;
29141 + /* instantiating a key */
29142 + ret = crypto_blkcipher_setkey(info_get_cipher(info),
29143 + data->key,
29144 + data->keysize);
29145 + if (ret) {
29146 + warning("edward-1379",
29147 + "setkey failed flags=%x",
29148 + crypto_blkcipher_get_flags(info_get_cipher(info)));
29149 + goto err;
29150 + }
29151 + info->keysize = data->keysize;
29152 + ret = create_keyid(info, data);
29153 + if (ret)
29154 + goto err;
29155 + instantiate_crypto_info(info);
29156 + return info;
29157 + err:
29158 + __free_crypto_info(object);
29159 + return ERR_PTR(ret);
29160 +}
29161 +#endif
29162 +
29163 +/* increment/decrement a load counter when
29164 + attaching/detaching the crypto-stat to/from any object */
29165 +static void load_crypto_info(struct reiser4_crypto_info * info)
29166 +{
29167 + assert("edward-1380", info != NULL);
29168 + inc_keyload_count(info);
29169 +}
29170 +
29171 +static void unload_crypto_info(struct inode * inode)
29172 +{
29173 + struct reiser4_crypto_info * info = inode_crypto_info(inode);
29174 + assert("edward-1381", info->keyload_count > 0);
29175 +
29176 + dec_keyload_count(inode_crypto_info(inode));
29177 + if (info->keyload_count == 0)
29178 + /* final release */
29179 + free_crypto_info(inode);
29180 +}
29181 +
29182 +/* attach/detach an existing crypto-stat */
29183 +void reiser4_attach_crypto_info(struct inode * inode,
29184 + struct reiser4_crypto_info * info)
29185 +{
29186 + assert("edward-1382", inode != NULL);
29187 + assert("edward-1383", info != NULL);
29188 + assert("edward-1384", inode_crypto_info(inode) == NULL);
29189 +
29190 + set_inode_crypto_info(inode, info);
29191 + load_crypto_info(info);
29192 +}
29193 +
29194 +/* returns true if the crypto stat can be attached to the @host */
29195 +#if REISER4_DEBUG
29196 +static int host_allows_crypto_info(struct inode * host)
29197 +{
29198 + int ret;
29199 + file_plugin * fplug = inode_file_plugin(host);
29200 +
29201 + switch (fplug->h.id) {
29202 + case CRYPTCOMPRESS_FILE_PLUGIN_ID:
29203 + ret = 1;
29204 + break;
29205 + default:
29206 + ret = 0;
29207 + }
29208 + return ret;
29209 +}
29210 +#endif /* REISER4_DEBUG */
29211 +
29212 +static void reiser4_detach_crypto_info(struct inode * inode)
29213 +{
29214 + assert("edward-1385", inode != NULL);
29215 + assert("edward-1386", host_allows_crypto_info(inode));
29216 +
29217 + if (inode_crypto_info(inode))
29218 + unload_crypto_info(inode);
29219 + set_inode_crypto_info(inode, NULL);
29220 +}
29221 +
29222 +#if 0
29223 +
29224 +/* compare fingerprints of @child and @parent */
29225 +static int keyid_eq(struct reiser4_crypto_info * child,
29226 + struct reiser4_crypto_info * parent)
29227 +{
29228 + return !memcmp(child->keyid,
29229 + parent->keyid,
29230 + info_digest_plugin(parent)->fipsize);
29231 +}
29232 +
29233 +/* check if a crypto-stat (which is bound to @parent) can be inherited */
29234 +int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
29235 +{
29236 + if (!need_cipher(child))
29237 + return 0;
29238 + /* the child is created */
29239 + if (!inode_crypto_info(child))
29240 + return 1;
29241 + /* the child is looked up */
29242 + if (!inode_crypto_info(parent))
29243 + return 0;
29244 + return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
29245 + inode_digest_plugin(child) == inode_digest_plugin(parent) &&
29246 + inode_crypto_info(child)->keysize ==
29247 + inode_crypto_info(parent)->keysize &&
29248 + keyid_eq(inode_crypto_info(child), inode_crypto_info(parent)));
29249 +}
29250 +#endif
29251 +
29252 +/* helper functions for ->create() method of the cryptcompress plugin */
29253 +static int inode_set_crypto(struct inode * object)
29254 +{
29255 + reiser4_inode * info;
29256 + if (!inode_crypto_info(object)) {
29257 + if (need_cipher(object))
29258 + return RETERR(-EINVAL);
29259 + /* the file is not to be encrypted */
29260 + return 0;
29261 + }
29262 + info = reiser4_inode_data(object);
29263 + info->extmask |= (1 << CRYPTO_STAT);
29264 + return 0;
29265 +}
29266 +
29267 +static int inode_init_compression(struct inode * object)
29268 +{
29269 + int result = 0;
29270 + assert("edward-1461", object != NULL);
29271 + if (inode_compression_plugin(object)->init)
29272 + result = inode_compression_plugin(object)->init();
29273 + return result;
29274 +}
29275 +
29276 +static int inode_check_cluster(struct inode * object)
29277 +{
29278 + assert("edward-696", object != NULL);
29279 +
29280 + if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) {
29281 + warning("edward-1320", "Can not support '%s' "
29282 + "logical clusters (less then page size)",
29283 + inode_cluster_plugin(object)->h.label);
29284 + return RETERR(-EINVAL);
29285 + }
29286 + if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE*sizeof(int))){
29287 + warning("edward-1463", "Cannot support '%s' "
29288 + "logical clusters (too big for transform)",
29289 + inode_cluster_plugin(object)->h.label);
29290 + return RETERR(-EINVAL);
29291 + }
29292 + return 0;
29293 +}
29294 +
29295 +/* plugin->destroy_inode() */
29296 +void destroy_inode_cryptcompress(struct inode * inode)
29297 +{
29298 + assert("edward-1464", INODE_PGCOUNT(inode) == 0);
29299 + reiser4_detach_crypto_info(inode);
29300 + return;
29301 +}
29302 +
29303 +/* plugin->create_object():
29304 +. install plugins
29305 +. attach crypto info if specified
29306 +. attach compression info if specified
29307 +. attach cluster info
29308 +*/
29309 +int create_object_cryptcompress(struct inode *object, struct inode *parent,
29310 + reiser4_object_create_data * data)
29311 +{
29312 + int result;
29313 + reiser4_inode *info;
29314 +
29315 + assert("edward-23", object != NULL);
29316 + assert("edward-24", parent != NULL);
29317 + assert("edward-30", data != NULL);
29318 + assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
29319 + assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
29320 +
29321 + info = reiser4_inode_data(object);
29322 +
29323 + assert("edward-29", info != NULL);
29324 +
29325 + /* set file bit */
29326 + info->plugin_mask |= (1 << PSET_FILE);
29327 +
29328 + /* set crypto */
29329 + result = inode_set_crypto(object);
29330 + if (result)
29331 + goto error;
29332 + /* set compression */
29333 + result = inode_init_compression(object);
29334 + if (result)
29335 + goto error;
29336 + /* set cluster */
29337 + result = inode_check_cluster(object);
29338 + if (result)
29339 + goto error;
29340 +
29341 + /* save everything in disk stat-data */
29342 + result = write_sd_by_inode_common(object);
29343 + if (!result)
29344 + return 0;
29345 + error:
29346 + reiser4_detach_crypto_info(object);
29347 + return result;
29348 +}
29349 +
29350 +/* plugin->open() */
29351 +int open_cryptcompress(struct inode * inode, struct file * file)
29352 +{
29353 + return 0;
29354 +}
29355 +
29356 +/* returns a blocksize, the attribute of a cipher algorithm */
29357 +static unsigned int
29358 +cipher_blocksize(struct inode * inode)
29359 +{
29360 + assert("edward-758", need_cipher(inode));
29361 + assert("edward-1400", inode_crypto_info(inode) != NULL);
29362 + return crypto_blkcipher_blocksize
29363 + (info_get_cipher(inode_crypto_info(inode)));
29364 +}
29365 +
29366 +/* returns offset translated by scale factor of the crypto-algorithm */
29367 +static loff_t inode_scaled_offset (struct inode * inode,
29368 + const loff_t src_off /* input offset */)
29369 +{
29370 + assert("edward-97", inode != NULL);
29371 +
29372 + if (!need_cipher(inode) ||
29373 + src_off == get_key_offset(reiser4_min_key()) ||
29374 + src_off == get_key_offset(reiser4_max_key()))
29375 + return src_off;
29376 +
29377 + return inode_cipher_plugin(inode)->scale(inode,
29378 + cipher_blocksize(inode),
29379 + src_off);
29380 +}
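+
+/* A hypothetical example of the ->scale() contract assumed above: a
+ cipher plugin for a block cipher would round offsets up to the
+ cipher block size, so that scaled offsets always fall on cipher
+ block boundaries. A sketch only, not the method of any real cipher
+ plugin; @blocksize is assumed to be a power of two */
+static inline loff_t scale_by_blocksize(unsigned int blocksize,
+ loff_t src_off)
+{
+ /* e.g. blocksize = 16: 100 -> 112, 112 -> 112 */
+ return (src_off + blocksize - 1) & ~((loff_t)blocksize - 1);
+}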
29381 +
29382 +/* returns disk cluster size */
29383 +size_t inode_scaled_cluster_size(struct inode * inode)
29384 +{
29385 + assert("edward-110", inode != NULL);
29386 +
29387 + return inode_scaled_offset(inode, inode_cluster_size(inode));
29388 +}
29389 +
29390 +/* set number of cluster pages */
29391 +static void set_cluster_nrpages(struct cluster_handle * clust,
29392 + struct inode *inode)
29393 +{
29394 + struct reiser4_slide * win;
29395 +
29396 + assert("edward-180", clust != NULL);
29397 + assert("edward-1040", inode != NULL);
29398 +
29399 + clust->old_nrpages = size_in_pages(lbytes(clust->index, inode));
29400 + win = clust->win;
29401 + if (!win) {
29402 + clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
29403 + return;
29404 + }
29405 + assert("edward-1176", clust->op != LC_INVAL);
29406 + assert("edward-1064", win->off + win->count + win->delta != 0);
29407 +
29408 + if (win->stat == HOLE_WINDOW &&
29409 + win->off == 0 && win->count == inode_cluster_size(inode)) {
29410 + /* special case: writing a "fake" logical cluster */
29411 + clust->nr_pages = 0;
29412 + return;
29413 + }
29414 + clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta,
29415 + lbytes(clust->index, inode)));
29416 + return;
29417 +}
29418 +
29419 +/* plugin->key_by_inode()
29420 + build key of a disk cluster */
29421 +int key_by_inode_cryptcompress(struct inode *inode, loff_t off,
29422 + reiser4_key * key)
29423 +{
29424 + assert("edward-64", inode != 0);
29425 +
29426 + if (likely(off != get_key_offset(reiser4_max_key())))
29427 + off = off_to_clust_to_off(off, inode);
29428 + if (inode_crypto_info(inode))
29429 + off = inode_scaled_offset(inode, off);
29430 +
29431 + key_by_inode_and_offset_common(inode, 0, key);
29432 + set_key_offset(key, (__u64)off);
29433 + return 0;
29434 +}
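+
+/* Example of the offset mapping above (illustrative values): with a
+ 64K logical cluster, off = 70000 is first rounded down to the
+ cluster boundary 65536 by off_to_clust_to_off(); if the file is
+ encrypted, the result is then scaled by inode_scaled_offset(), so
+ the key offset of a disk cluster is always a (scaled) cluster
+ boundary */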
29435 +
29436 +/* plugin->flow_by_inode() */
29437 +/* flow is used to read/write disk clusters */
29438 +int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf,
29439 + int user, /* 1: @buf is of user space,
29440 + 0: kernel space */
29441 + loff_t size, /* @buf size */
29442 + loff_t off, /* offset to start io from */
29443 + rw_op op, /* READ or WRITE */
29444 + flow_t * f /* resulting flow */)
29445 +{
29446 + assert("edward-436", f != NULL);
29447 + assert("edward-149", inode != NULL);
29448 + assert("edward-150", inode_file_plugin(inode) != NULL);
29449 + assert("edward-1465", user == 0); /* we use flow to read/write
29450 + disk clusters located in
29451 + kernel space */
29452 + f->length = size;
29453 + memcpy(&f->data, &buf, sizeof(buf));
29454 + f->user = user;
29455 + f->op = op;
29456 +
29457 + return key_by_inode_cryptcompress(inode, off, &f->key);
29458 +}
29459 +
29460 +static int
29461 +cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
29462 + znode_lock_mode lock_mode)
29463 +{
29464 + coord_t *coord;
29465 +
29466 + assert("edward-704", hint != NULL);
29467 + assert("edward-1089", !hint_is_valid(hint));
29468 + assert("edward-706", hint->lh.owner == NULL);
29469 +
29470 + coord = &hint->ext_coord.coord;
29471 +
29472 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
29473 + /* hint either not set or set by different operation */
29474 + return RETERR(-E_REPEAT);
29475 +
29476 + if (get_key_offset(key) != hint->offset)
29477 + /* hint is set for different key */
29478 + return RETERR(-E_REPEAT);
29479 +
29480 + assert("edward-707", reiser4_schedulable());
29481 +
29482 + return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
29483 + key, &hint->lh, lock_mode,
29484 + ZNODE_LOCK_LOPRI);
29485 +}
29486 +
29487 +/* reserve disk space when writing a logical cluster */
29488 +static int reserve4cluster(struct inode *inode, struct cluster_handle *clust)
29489 +{
29490 + int result = 0;
29491 +
29492 + assert("edward-965", reiser4_schedulable());
29493 + assert("edward-439", inode != NULL);
29494 + assert("edward-440", clust != NULL);
29495 + assert("edward-441", clust->pages != NULL);
29496 +
29497 + if (clust->nr_pages == 0) {
29498 + assert("edward-1152", clust->win != NULL);
29499 + assert("edward-1153", clust->win->stat == HOLE_WINDOW);
29500 + /* don't reserve disk space for fake logical cluster */
29501 + return 0;
29502 + }
29503 + assert("edward-442", jprivate(clust->pages[0]) != NULL);
29504 +
29505 + result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
29506 + estimate_update_cluster(inode),
29507 + BA_CAN_COMMIT);
29508 + if (result)
29509 + return result;
29510 + clust->reserved = 1;
29511 + grabbed2cluster_reserved(estimate_insert_cluster(inode) +
29512 + estimate_update_cluster(inode));
29513 +#if REISER4_DEBUG
29514 + clust->reserved_prepped = estimate_update_cluster(inode);
29515 + clust->reserved_unprepped = estimate_insert_cluster(inode);
29516 +#endif
29517 + /* there can be space grabbed by txnmgr_force_commit_all */
29518 + return 0;
29519 +}
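+
+/* Outline of the reservation lifecycle assumed here (a sketch, not a
+ new code path): space grabbed by reserve4cluster() either becomes
+ "cluster reserved" space consumed at checkin, or has to be returned
+ via free_reserved4cluster() if the write fails before checkin:
+
+ result = reserve4cluster(inode, clust);
+ if (result)
+ return result;
+ ...
+ if (failed_before_checkin)
+ free_reserved4cluster(inode, clust,
+ estimate_update_cluster(inode) +
+ estimate_insert_cluster(inode));
+*/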
29520 +
29521 +/* free reserved disk space if writing a logical cluster fails */
29522 +static void free_reserved4cluster(struct inode *inode,
29523 + struct cluster_handle *ch, int count)
29524 +{
29525 + assert("edward-967", ch->reserved == 1);
29526 +
29527 + cluster_reserved2free(count);
29528 + ch->reserved = 0;
29529 +}
29530 +
29531 +/* The core search procedure of the cryptcompress plugin.
29532 + If the returned value is not cbk_errored, then the current znode is locked */
29533 +static int find_cluster_item(hint_t * hint,
29534 + const reiser4_key * key, /* key of the item we are
29535 + looking for */
29536 + znode_lock_mode lock_mode /* which lock */ ,
29537 + ra_info_t * ra_info, lookup_bias bias, __u32 flags)
29538 +{
29539 + int result;
29540 + reiser4_key ikey;
29541 + int went_right = 0;
29542 + coord_t *coord = &hint->ext_coord.coord;
29543 + coord_t orig = *coord;
29544 +
29545 + assert("edward-152", hint != NULL);
29546 +
29547 + if (!hint_is_valid(hint)) {
29548 + result = cryptcompress_hint_validate(hint, key, lock_mode);
29549 + if (result == -E_REPEAT)
29550 + goto traverse_tree;
29551 + else if (result) {
29552 + assert("edward-1216", 0);
29553 + return result;
29554 + }
29555 + hint_set_valid(hint);
29556 + }
29557 + assert("edward-709", znode_is_any_locked(coord->node));
29558 +
29559 + /* An in-place lookup is going on here: we just need to
29560 + check whether the next item at the @coord matches the @key */
29561 +
29562 + if (equal_to_rdk(coord->node, key)) {
29563 + result = goto_right_neighbor(coord, &hint->lh);
29564 + if (result == -E_NO_NEIGHBOR) {
29565 + assert("edward-1217", 0);
29566 + return RETERR(-EIO);
29567 + }
29568 + if (result)
29569 + return result;
29570 + assert("edward-1218", equal_to_ldk(coord->node, key));
29571 + went_right = 1;
29572 + } else {
29573 + coord->item_pos++;
29574 + coord->unit_pos = 0;
29575 + coord->between = AT_UNIT;
29576 + }
29577 + result = zload(coord->node);
29578 + if (result)
29579 + return result;
29580 + assert("edward-1219", !node_is_empty(coord->node));
29581 +
29582 + if (!coord_is_existing_item(coord)) {
29583 + zrelse(coord->node);
29584 + goto not_found;
29585 + }
29586 + item_key_by_coord(coord, &ikey);
29587 + zrelse(coord->node);
29588 + if (!keyeq(key, &ikey))
29589 + goto not_found;
29590 + /* Ok, item is found, update node counts */
29591 + if (went_right)
29592 + dclust_inc_extension_ncount(hint);
29593 + return CBK_COORD_FOUND;
29594 +
29595 + not_found:
29596 + assert("edward-1220", coord->item_pos > 0);
29597 + //coord->item_pos--;
29598 + /* roll back */
29599 + *coord = orig;
29600 + ON_DEBUG(coord_update_v(coord));
29601 + return CBK_COORD_NOTFOUND;
29602 +
29603 + traverse_tree:
29604 + assert("edward-713", hint->lh.owner == NULL);
29605 + assert("edward-714", reiser4_schedulable());
29606 +
29607 + reiser4_unset_hint(hint);
29608 + dclust_init_extension(hint);
29609 + coord_init_zero(coord);
29610 + result = coord_by_key(current_tree, key, coord, &hint->lh,
29611 + lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
29612 + CBK_UNIQUE | flags, ra_info);
29613 + if (cbk_errored(result))
29614 + return result;
29615 + if(result == CBK_COORD_FOUND)
29616 + dclust_inc_extension_ncount(hint);
29617 + hint_set_valid(hint);
29618 + return result;
29619 +}
29620 +
29621 +/* This function is called by deflate[inflate] manager when
29622 + creating a transformed/plain stream to check if we should
29623 + create/cut some overhead. If this returns true, then @oh
29624 + contains the size of this overhead.
29625 + */
29626 +static int need_cut_or_align(struct inode * inode,
29627 + struct cluster_handle * ch, rw_op rw, int * oh)
29628 +{
29629 + struct tfm_cluster * tc = &ch->tc;
29630 + switch (rw) {
29631 + case WRITE_OP: /* estimate align */
29632 + *oh = tc->len % cipher_blocksize(inode);
29633 + if (*oh != 0)
29634 + return 1;
29635 + break;
29636 + case READ_OP: /* estimate cut */
29637 + *oh = *(tfm_output_data(ch) + tc->len - 1);
29638 + break;
29639 + default:
29640 + impossible("edward-1401", "bad option");
29641 + }
29642 + return (tc->len != tc->lsize);
29643 +}
29644 +
29645 +/* create/cut an overhead of transformed/plain stream */
29646 +static void align_or_cut_overhead(struct inode * inode,
29647 + struct cluster_handle * ch, rw_op rw)
29648 +{
29649 + unsigned int oh;
29650 + cipher_plugin * cplug = inode_cipher_plugin(inode);
29651 +
29652 + assert("edward-1402", need_cipher(inode));
29653 +
29654 + if (!need_cut_or_align(inode, ch, rw, &oh))
29655 + return;
29656 + switch (rw) {
29657 + case WRITE_OP: /* do align */
29658 + ch->tc.len +=
29659 + cplug->align_stream(tfm_input_data(ch) +
29660 + ch->tc.len, ch->tc.len,
29661 + cipher_blocksize(inode));
29662 + *(tfm_input_data(ch) + ch->tc.len - 1) =
29663 + cipher_blocksize(inode) - oh;
29664 + break;
29665 + case READ_OP: /* do cut */
29666 + assert("edward-1403", oh <= cipher_blocksize(inode));
29667 + ch->tc.len -= oh;
29668 + break;
29669 + default:
29670 + impossible("edward-1404", "bad option");
29671 + }
29672 + return;
29673 +}
29674 +
29675 +static unsigned max_cipher_overhead(struct inode * inode)
29676 +{
29677 + if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
29678 + return 0;
29679 + return cipher_blocksize(inode);
29680 +}
29681 +
29682 +static int deflate_overhead(struct inode *inode)
29683 +{
29684 + return (inode_compression_plugin(inode)->
29685 + checksum ? DC_CHECKSUM_SIZE : 0);
29686 +}
29687 +
29688 +static unsigned deflate_overrun(struct inode * inode, int ilen)
29689 +{
29690 + return coa_overrun(inode_compression_plugin(inode), ilen);
29691 +}
29692 +
29693 +/* Estimating compressibility of a logical cluster by various
29694 + policies represented by compression mode plugin.
29695 + If this returns false, then compressor won't be called for
29696 + the cluster of index @index.
29697 +*/
29698 +static int should_compress(struct tfm_cluster * tc, cloff_t index,
29699 + struct inode *inode)
29700 +{
29701 + compression_plugin *cplug = inode_compression_plugin(inode);
29702 + compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
29703 +
29704 + assert("edward-1321", tc->len != 0);
29705 + assert("edward-1322", cplug != NULL);
29706 + assert("edward-1323", mplug != NULL);
29707 +
29708 + return /* estimate by size */
29709 + (cplug->min_size_deflate ?
29710 + tc->len >= cplug->min_size_deflate() :
29711 + 1) &&
29712 + /* estimate by compression mode plugin */
29713 + (mplug->should_deflate ?
29714 + mplug->should_deflate(inode, index) :
29715 + 1);
29716 +}
29717 +
29718 +/* Evaluating results of compression transform.
29719 + Returns true if we need to accept these results */
29720 +static int save_compressed(int size_before, int size_after, struct inode *inode)
29721 +{
29722 + return (size_after + deflate_overhead(inode) +
29723 + max_cipher_overhead(inode) < size_before);
29724 +}
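+
+/* Worked example for the acceptance test above, assuming a 4-byte
+ checksum (DC_CHECKSUM_SIZE) and a 16-byte cipher block: a 65536-byte
+ logical cluster compressed to 65520 bytes is rejected, since
+ 65520 + 4 + 16 = 65540 >= 65536; compression must win back at least
+ the transform overhead to be worth accepting */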
29725 +
29726 +/* Guess result of the evaluation above */
29727 +static int need_inflate(struct cluster_handle * ch, struct inode * inode,
29728 + int encrypted /* is cluster encrypted */ )
29729 +{
29730 + struct tfm_cluster * tc = &ch->tc;
29731 +
29732 + assert("edward-142", tc != 0);
29733 + assert("edward-143", inode != NULL);
29734 +
29735 + return tc->len <
29736 + (encrypted ?
29737 + inode_scaled_offset(inode, tc->lsize) :
29738 + tc->lsize);
29739 +}
29740 +
29741 +/* If results of compression were accepted, then we add
29742 + a checksum to catch possible disk cluster corruption.
29743 + The following is a format of the data stored in disk clusters:
29744 +
29745 + data This is (transformed) logical cluster.
29746 + cipher_overhead This is created by ->align() method
29747 + of cipher plugin. May be absent.
29748 + checksum (4) This is created by ->checksum method
29749 + of compression plugin to check
29750 + integrity. May be absent.
29751 +
29752 + Crypto overhead format:
29753 +
29754 + data
29755 + control_byte (1) contains aligned overhead size:
29756 + 1 <= overhead <= cipher_blksize
29757 +*/
29758 +/* Append a checksum at the end of a transformed stream */
29759 +static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29760 +{
29761 + __u32 checksum;
29762 +
29763 + assert("edward-1309", tc != NULL);
29764 + assert("edward-1310", tc->len > 0);
29765 + assert("edward-1311", cplug->checksum != NULL);
29766 +
29767 + checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
29768 + put_unaligned(cpu_to_le32(checksum),
29769 + (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
29770 + tc->len += (int)DC_CHECKSUM_SIZE;
29771 +}
29772 +
29773 +/* Check a disk cluster checksum.
29774 + Returns 0 if checksum is correct, otherwise returns 1 */
29775 +static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29776 +{
29777 + assert("edward-1312", tc != NULL);
29778 + assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
29779 + assert("edward-1314", cplug->checksum != NULL);
29780 +
29781 + if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
29782 + tc->len - (int)DC_CHECKSUM_SIZE) !=
29783 + le32_to_cpu(get_unaligned((d32 *)
29784 + (tfm_stream_data(tc, INPUT_STREAM)
29785 + + tc->len - (int)DC_CHECKSUM_SIZE)))) {
29786 + warning("edward-156",
29787 + "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
29788 + (int)le32_to_cpu
29789 + (get_unaligned((d32 *)
29790 + (tfm_stream_data(tc, INPUT_STREAM) +
29791 + tc->len - (int)DC_CHECKSUM_SIZE))),
29792 + (int)cplug->checksum
29793 + (tfm_stream_data(tc, INPUT_STREAM),
29794 + tc->len - (int)DC_CHECKSUM_SIZE));
29795 + return 1;
29796 + }
29797 + tc->len -= (int)DC_CHECKSUM_SIZE;
29798 + return 0;
29799 +}
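+
+/* Layout assumed by the two helpers above (sizes in bytes):
+
+ [ transformed data .................. ][ checksum (4) ]
+ |<---------- tc->len - 4 ---------->|
+
+ dc_set_checksum() appends the checksum and grows tc->len;
+ dc_check_checksum() verifies it and shrinks tc->len back */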
29800 +
29801 +/* get input/output stream for some transform action */
29802 +int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc,
29803 + tfm_stream_id id)
29804 +{
29805 + size_t size = inode_scaled_cluster_size(inode);
29806 +
29807 + assert("edward-901", tc != NULL);
29808 + assert("edward-1027", inode_compression_plugin(inode) != NULL);
29809 +
29810 + if (cluster_get_tfm_act(tc) == TFMA_WRITE)
29811 + size += deflate_overrun(inode, inode_cluster_size(inode));
29812 +
29813 + if (!get_tfm_stream(tc, id) && id == INPUT_STREAM)
29814 + alternate_streams(tc);
29815 + if (!get_tfm_stream(tc, id))
29816 + return alloc_tfm_stream(tc, size, id);
29817 +
29818 + assert("edward-902", tfm_stream_is_set(tc, id));
29819 +
29820 + if (tfm_stream_size(tc, id) < size)
29821 + return realloc_tfm_stream(tc, size, id);
29822 + return 0;
29823 +}
29824 +
29825 +/* Common deflate manager */
29826 +int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode)
29827 +{
29828 + int result = 0;
29829 + int compressed = 0;
29830 + int encrypted = 0;
29831 + struct tfm_cluster * tc = &clust->tc;
29832 + compression_plugin * coplug;
29833 +
29834 + assert("edward-401", inode != NULL);
29835 + assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
29836 + assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
29837 + assert("edward-498", !tfm_cluster_is_uptodate(tc));
29838 +
29839 + coplug = inode_compression_plugin(inode);
29840 + if (should_compress(tc, clust->index, inode)) {
29841 + /* try to compress, discard bad results */
29842 + size_t dst_len;
29843 + compression_mode_plugin * mplug =
29844 + inode_compression_mode_plugin(inode);
29845 + assert("edward-602", coplug != NULL);
29846 + assert("edward-1423", coplug->compress != NULL);
29847 +
29848 + result = grab_coa(tc, coplug);
29849 + if (result) {
29850 + warning("edward-1424",
29851 + "alloc_coa failed with ret=%d, skipped compression",
29852 + result);
29853 + goto cipher;
29854 + }
29855 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29856 + if (result) {
29857 + warning("edward-1425",
29858 + "alloc stream failed with ret=%d, skipped compression",
29859 + result);
29860 + goto cipher;
29861 + }
29862 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
29863 + coplug->compress(get_coa(tc, coplug->h.id, tc->act),
29864 + tfm_input_data(clust), tc->len,
29865 + tfm_output_data(clust), &dst_len);
29866 + /* make sure we didn't overwrite extra bytes */
29867 + assert("edward-603",
29868 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
29869 +
29870 + /* evaluate results of compression transform */
29871 + if (save_compressed(tc->len, dst_len, inode)) {
29872 + /* good result, accept */
29873 + tc->len = dst_len;
29874 + if (mplug->accept_hook != NULL) {
29875 + result = mplug->accept_hook(inode, clust->index);
29876 + if (result)
29877 + warning("edward-1426",
29878 + "accept_hook failed with ret=%d",
29879 + result);
29880 + }
29881 + compressed = 1;
29882 + }
29883 + else {
29884 + /* bad result, discard */
29885 +#if 0
29886 + if (cluster_is_complete(clust, inode))
29887 + warning("edward-1496",
29888 + "incompressible cluster %lu (inode %llu)",
29889 + clust->index,
29890 + (unsigned long long)get_inode_oid(inode));
29891 +#endif
29892 + if (mplug->discard_hook != NULL &&
29893 + cluster_is_complete(clust, inode)) {
29894 + result = mplug->discard_hook(inode,
29895 + clust->index);
29896 + if (result)
29897 + warning("edward-1427",
29898 + "discard_hook failed with ret=%d",
29899 + result);
29900 + }
29901 + }
29902 + }
29903 + cipher:
29904 + if (need_cipher(inode)) {
29905 + cipher_plugin * ciplug;
29906 + struct blkcipher_desc desc;
29907 + struct scatterlist src;
29908 + struct scatterlist dst;
29909 +
29910 + ciplug = inode_cipher_plugin(inode);
29911 + desc.tfm = info_get_cipher(inode_crypto_info(inode));
29912 + desc.flags = 0;
29913 + if (compressed)
29914 + alternate_streams(tc);
29915 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29916 + if (result)
29917 + return result;
29918 +
29919 + align_or_cut_overhead(inode, clust, WRITE_OP);
29920 + sg_init_one(&src, tfm_input_data(clust), tc->len);
29921 + sg_init_one(&dst, tfm_output_data(clust), tc->len);
29922 +
29923 + result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
29924 + if (result) {
29925 + warning("edward-1405",
29926 + "encryption failed flags=%x\n", desc.flags);
29927 + return result;
29928 + }
29929 + encrypted = 1;
29930 + }
29931 + if (compressed && coplug->checksum != NULL)
29932 + dc_set_checksum(coplug, tc);
29933 + if (!compressed && !encrypted)
29934 + alternate_streams(tc);
29935 + return result;
29936 +}
29937 +
29938 +/* Common inflate manager. */
29939 +int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode)
29940 +{
29941 + int result = 0;
29942 + int transformed = 0;
29943 + struct tfm_cluster * tc = &clust->tc;
29944 + compression_plugin * coplug;
29945 +
29946 + assert("edward-905", inode != NULL);
29947 + assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
29948 + assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
29949 + assert("edward-1349", tc->act == TFMA_READ);
29950 + assert("edward-907", !tfm_cluster_is_uptodate(tc));
29951 +
29952 + /* Handle a checksum (if any) */
29953 + coplug = inode_compression_plugin(inode);
29954 + if (need_inflate(clust, inode, need_cipher(inode)) &&
29955 + coplug->checksum != NULL) {
29956 + result = dc_check_checksum(coplug, tc);
29957 + if (unlikely(result)) {
29958 + warning("edward-1460",
29959 + "Inode %llu: disk cluster %lu looks corrupted",
29960 + (unsigned long long)get_inode_oid(inode),
29961 + clust->index);
29962 + return RETERR(-EIO);
29963 + }
29964 + }
29965 + if (need_cipher(inode)) {
29966 + cipher_plugin * ciplug;
29967 + struct blkcipher_desc desc;
29968 + struct scatterlist src;
29969 + struct scatterlist dst;
29970 +
29971 + ciplug = inode_cipher_plugin(inode);
29972 + desc.tfm = info_get_cipher(inode_crypto_info(inode));
29973 + desc.flags = 0;
29974 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29975 + if (result)
29976 + return result;
29977 + assert("edward-909", tfm_cluster_is_set(tc));
29978 +
29979 + sg_init_one(&src, tfm_input_data(clust), tc->len);
29980 + sg_init_one(&dst, tfm_output_data(clust), tc->len);
29981 +
29982 + result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
29983 + if (result) {
29984 + warning("edward-1600", "decrypt failed flags=%x\n",
29985 + desc.flags);
29986 + return result;
29987 + }
29988 + align_or_cut_overhead(inode, clust, READ_OP);
29989 + transformed = 1;
29990 + }
29991 + if (need_inflate(clust, inode, 0)) {
29992 + size_t dst_len = inode_cluster_size(inode);
29993 + if(transformed)
29994 + alternate_streams(tc);
29995 +
29996 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29997 + if (result)
29998 + return result;
29999 + assert("edward-1305", coplug->decompress != NULL);
30000 + assert("edward-910", tfm_cluster_is_set(tc));
30001 +
30002 + coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
30003 + tfm_input_data(clust), tc->len,
30004 + tfm_output_data(clust), &dst_len);
30005 + /* check length */
30006 + tc->len = dst_len;
30007 + assert("edward-157", dst_len == tc->lsize);
30008 + transformed = 1;
30009 + }
30010 + if (!transformed)
30011 + alternate_streams(tc);
30012 + return result;
30013 +}
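+
+/* Note the transform order: reiser4_deflate_cluster() compresses
+ first and encrypts second, so reiser4_inflate_cluster() above must
+ decrypt first and decompress second; alternate_streams() swaps the
+ input and output streams between the two stages whenever both are
+ applied */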
30014 +
30015 +/* This is an implementation of the ->readpage() method of struct
30016 + address_space_operations for the cryptcompress plugin. */
30017 +int readpage_cryptcompress(struct file *file, struct page *page)
30018 +{
30019 + reiser4_context *ctx;
30020 + struct cluster_handle clust;
30021 + item_plugin *iplug;
30022 + int result;
30023 +
30024 + assert("edward-88", PageLocked(page));
30025 + assert("vs-976", !PageUptodate(page));
30026 + assert("edward-89", page->mapping && page->mapping->host);
30027 +
30028 + ctx = reiser4_init_context(page->mapping->host->i_sb);
30029 + if (IS_ERR(ctx)) {
30030 + unlock_page(page);
30031 + return PTR_ERR(ctx);
30032 + }
30033 + assert("edward-113",
30034 + ergo(file != NULL,
30035 + page->mapping == file->f_dentry->d_inode->i_mapping));
30036 +
30037 + if (PageUptodate(page)) {
30038 + warning("edward-1338", "page is already uptodate\n");
30039 + unlock_page(page);
30040 + reiser4_exit_context(ctx);
30041 + return 0;
30042 + }
30043 + cluster_init_read(&clust, NULL);
30044 + clust.file = file;
30045 + iplug = item_plugin_by_id(CTAIL_ID);
30046 + if (!iplug->s.file.readpage) {
30047 + unlock_page(page);
30048 + put_cluster_handle(&clust);
30049 + reiser4_exit_context(ctx);
30050 + return -EINVAL;
30051 + }
30052 + result = iplug->s.file.readpage(&clust, page);
30053 +
30054 + put_cluster_handle(&clust);
30055 + reiser4_txn_restart(ctx);
30056 + reiser4_exit_context(ctx);
30057 + return result;
30058 +}
30059 +
30060 +/* number of pages to check in */
30061 +static int get_new_nrpages(struct cluster_handle * clust)
30062 +{
30063 + switch (clust->op) {
30064 + case LC_APPOV:
30065 + return clust->nr_pages;
30066 + case LC_TRUNC:
30067 + assert("edward-1179", clust->win != NULL);
30068 + return size_in_pages(clust->win->off + clust->win->count);
30069 + default:
30070 + impossible("edward-1180", "bad page cluster option");
30071 + return 0;
30072 + }
30073 +}
30074 +
30075 +static void set_cluster_pages_dirty(struct cluster_handle * clust,
30076 + struct inode * inode)
30077 +{
30078 + int i;
30079 + struct page *pg;
30080 + int nrpages = get_new_nrpages(clust);
30081 +
30082 + for (i = 0; i < nrpages; i++) {
30083 +
30084 + pg = clust->pages[i];
30085 + assert("edward-968", pg != NULL);
30086 + lock_page(pg);
30087 + assert("edward-1065", PageUptodate(pg));
30088 + set_page_dirty_notag(pg);
30089 + unlock_page(pg);
30090 + mark_page_accessed(pg);
30091 + }
30092 +}
30093 +
30094 +/* Grab a page cluster for read/write operations.
30095 + Attach a jnode for write operations (when preparing for modifications, which
30096 + are supposed to be committed).
30097 +
30098 + We allocate only one jnode per page cluster; this jnode is bound to the
30099 + first page of this cluster, so we have an extra reference that will be put
30100 + as soon as the jnode is evicted from memory; other references will be cleaned
30101 + up at flush time (assuming that checkin of the page cluster was successful).
30102 +*/
30103 +int grab_page_cluster(struct inode * inode,
30104 + struct cluster_handle * clust, rw_op rw)
30105 +{
30106 + int i;
30107 + int result = 0;
30108 + jnode *node = NULL;
30109 +
30110 + assert("edward-182", clust != NULL);
30111 + assert("edward-183", clust->pages != NULL);
30112 + assert("edward-1466", clust->node == NULL);
30113 + assert("edward-1428", inode != NULL);
30114 + assert("edward-1429", inode->i_mapping != NULL);
30115 + assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
30116 +
30117 + if (clust->nr_pages == 0)
30118 + return 0;
30119 +
30120 + for (i = 0; i < clust->nr_pages; i++) {
30121 +
30122 + assert("edward-1044", clust->pages[i] == NULL);
30123 +
30124 + clust->pages[i] =
30125 + find_or_create_page(inode->i_mapping,
30126 + clust_to_pg(clust->index, inode) + i,
30127 + reiser4_ctx_gfp_mask_get());
30128 + if (!clust->pages[i]) {
30129 + result = RETERR(-ENOMEM);
30130 + break;
30131 + }
30132 + if (i == 0 && rw == WRITE_OP) {
30133 + node = jnode_of_page(clust->pages[i]);
30134 + if (IS_ERR(node)) {
30135 + result = PTR_ERR(node);
30136 + unlock_page(clust->pages[i]);
30137 + break;
30138 + }
30139 + JF_SET(node, JNODE_CLUSTER_PAGE);
30140 + assert("edward-920", jprivate(clust->pages[0]));
30141 + }
30142 + INODE_PGCOUNT_INC(inode);
30143 + unlock_page(clust->pages[i]);
30144 + }
30145 + if (unlikely(result)) {
30146 + while (i) {
30147 + put_cluster_page(clust->pages[--i]);
30148 + INODE_PGCOUNT_DEC(inode);
30149 + }
30150 + if (node && !IS_ERR(node))
30151 + jput(node);
30152 + return result;
30153 + }
30154 + clust->node = node;
30155 + return 0;
30156 +}
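+
+/* Typical pairing assumed throughout this file: a successful
+ grab_page_cluster(..., WRITE_OP) is matched either by a checkin
+ (see checkin_logical_cluster()), or, on failure before checkin, by
+ put_page_cluster(..., WRITE_OP), which also drops the jnode
+ reference taken above */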
30157 +
30158 +static void truncate_page_cluster_range(struct inode * inode,
30159 + struct page ** pages,
30160 + cloff_t index,
30161 + int from, int count,
30162 + int even_cows)
30163 +{
30164 + assert("edward-1467", count > 0);
30165 + reiser4_invalidate_pages(inode->i_mapping,
30166 + clust_to_pg(index, inode) + from,
30167 + count, even_cows);
30168 +}
30169 +
30170 +/* Put @count pages starting from @from offset */
30171 +void __put_page_cluster(int from, int count,
30172 + struct page ** pages, struct inode * inode)
30173 +{
30174 + int i;
30175 + assert("edward-1468", pages != NULL);
30176 + assert("edward-1469", inode != NULL);
30177 + assert("edward-1470", from >= 0 && count >= 0);
30178 +
30179 + for (i = 0; i < count; i++) {
30180 + assert("edward-1471", pages[from + i] != NULL);
30181 + assert("edward-1472",
30182 + pages[from + i]->index == pages[from]->index + i);
30183 +
30184 + put_cluster_page(pages[from + i]);
30185 + INODE_PGCOUNT_DEC(inode);
30186 + }
30187 +}
30188 +
30189 +/*
30190 + * This is dual to grab_page_cluster;
30191 + * however, if @rw == WRITE_OP, then we call this function
30192 + * only if something failed before checking in the page cluster.
30193 + */
30194 +void put_page_cluster(struct cluster_handle * clust,
30195 + struct inode * inode, rw_op rw)
30196 +{
30197 + assert("edward-445", clust != NULL);
30198 + assert("edward-922", clust->pages != NULL);
30199 + assert("edward-446",
30200 + ergo(clust->nr_pages != 0, clust->pages[0] != NULL));
30201 +
30202 + __put_page_cluster(0, clust->nr_pages, clust->pages, inode);
30203 + if (rw == WRITE_OP) {
30204 + if (unlikely(clust->node)) {
30205 + assert("edward-447",
30206 + clust->node == jprivate(clust->pages[0]));
30207 + jput(clust->node);
30208 + clust->node = NULL;
30209 + }
30210 + }
30211 +}
30212 +
30213 +#if REISER4_DEBUG
30214 +int cryptcompress_inode_ok(struct inode *inode)
30215 +{
30216 + if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
30217 + return 0;
30218 + if (!cluster_shift_ok(inode_cluster_shift(inode)))
30219 + return 0;
30220 + return 1;
30221 +}
30222 +
30223 +static int window_ok(struct reiser4_slide * win, struct inode *inode)
30224 +{
30225 + assert("edward-1115", win != NULL);
30226 + assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
30227 +
30228 + return (win->off != inode_cluster_size(inode)) &&
30229 + (win->off + win->count + win->delta <= inode_cluster_size(inode));
30230 +}
30231 +
30232 +static int cluster_ok(struct cluster_handle * clust, struct inode *inode)
30233 +{
30234 + assert("edward-279", clust != NULL);
30235 +
30236 + if (!clust->pages)
30237 + return 0;
30238 + return (clust->win ? window_ok(clust->win, inode) : 1);
30239 +}
30240 +#if 0
30241 +static int pages_truncate_ok(struct inode *inode, pgoff_t start)
30242 +{
30243 + int found;
30244 + struct page * page;
30245 +
30246 + found = find_get_pages(inode->i_mapping, start, 1, &page);
30247 + if (found)
30248 + put_cluster_page(page);
30249 + return !found;
30250 +}
30251 +#else
30252 +#define pages_truncate_ok(inode, start) 1
30253 +#endif
30254 +
30255 +static int jnode_truncate_ok(struct inode *inode, cloff_t index)
30256 +{
30257 + jnode *node;
30258 + node = jlookup(current_tree, get_inode_oid(inode),
30259 + clust_to_pg(index, inode));
30260 + if (likely(!node))
30261 + return 1;
30262 + jput(node);
30263 + return 0;
30264 +}
30265 +
30266 +static int find_fake_appended(struct inode *inode, cloff_t * index);
30267 +
30268 +static int body_truncate_ok(struct inode *inode, cloff_t aidx)
30269 +{
30270 + int result;
30271 + cloff_t raidx;
30272 +
30273 + result = find_fake_appended(inode, &raidx);
30274 + return !result && (aidx == raidx);
30275 +}
30276 +#endif
30277 +
30278 +/* guess next window stat */
30279 +static inline window_stat next_window_stat(struct reiser4_slide * win)
30280 +{
30281 + assert("edward-1130", win != NULL);
30282 + return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
30283 + HOLE_WINDOW : DATA_WINDOW);
30284 +}
30285 +
30286 +/* guess and set next cluster index and window params */
30287 +static void move_update_window(struct inode * inode,
30288 + struct cluster_handle * clust,
30289 + loff_t file_off, loff_t to_file)
30290 +{
30291 + struct reiser4_slide * win;
30292 +
30293 + assert("edward-185", clust != NULL);
30294 + assert("edward-438", clust->pages != NULL);
30295 + assert("edward-281", cluster_ok(clust, inode));
30296 +
30297 + win = clust->win;
30298 + if (!win)
30299 + return;
30300 +
30301 + switch (win->stat) {
30302 + case DATA_WINDOW:
30303 + /* increment */
30304 + clust->index++;
30305 + win->stat = DATA_WINDOW;
30306 + win->off = 0;
30307 + win->count = min((loff_t)inode_cluster_size(inode), to_file);
30308 + break;
30309 + case HOLE_WINDOW:
30310 + switch (next_window_stat(win)) {
30311 + case HOLE_WINDOW:
30312 + /* skip */
30313 + clust->index = off_to_clust(file_off, inode);
30314 + win->stat = HOLE_WINDOW;
30315 + win->off = 0;
30316 + win->count = off_to_cloff(file_off, inode);
30317 + win->delta = min((loff_t)(inode_cluster_size(inode) -
30318 + win->count), to_file);
30319 + break;
30320 + case DATA_WINDOW:
30321 + /* stay */
30322 + win->stat = DATA_WINDOW;
30323 + /* off+count+delta=inv */
30324 + win->off = win->off + win->count;
30325 + win->count = win->delta;
30326 + win->delta = 0;
30327 + break;
30328 + default:
30329 + impossible("edward-282", "wrong next window state");
30330 + }
30331 + break;
30332 + default:
30333 + impossible("edward-283", "wrong current window state");
30334 + }
30335 + assert("edward-1068", cluster_ok(clust, inode));
30336 +}
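+
+/* Example of the transitions above (illustrative case): appending at
+ file_off far beyond EOF first yields a HOLE_WINDOW; while the hole
+ covers whole clusters (win->delta == 0), next_window_stat() keeps
+ returning HOLE_WINDOW ("skip"); once the remaining hole and the new
+ data share one logical cluster (win->delta != 0), the next state is
+ DATA_WINDOW ("stay") */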
30337 +
30338 +static int update_sd_cryptcompress(struct inode *inode)
30339 +{
30340 + int result = 0;
30341 +
30342 + assert("edward-978", reiser4_schedulable());
30343 +
30344 + result = reiser4_grab_space_force(/* one for stat data update */
30345 + estimate_update_common(inode),
30346 + BA_CAN_COMMIT);
30347 + if (result)
30348 + return result;
30349 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
30350 + result = reiser4_update_sd(inode);
30351 +
30352 + return result;
30353 +}
30354 +
30355 +static void uncapture_cluster_jnode(jnode * node)
30356 +{
30357 + txn_atom *atom;
30358 +
30359 + assert_spin_locked(&(node->guard));
30360 +
30361 + atom = jnode_get_atom(node);
30362 + if (atom == NULL) {
30363 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
30364 + spin_unlock_jnode(node);
30365 + return;
30366 + }
30367 + reiser4_uncapture_block(node);
30368 + spin_unlock_atom(atom);
30369 + jput(node);
30370 +}
30371 +
30372 +static void put_found_pages(struct page **pages, int nr)
30373 +{
30374 + int i;
30375 + for (i = 0; i < nr; i++) {
30376 + assert("edward-1045", pages[i] != NULL);
30377 + put_cluster_page(pages[i]);
30378 + }
30379 +}
30380 +
30381 +/* Lifecycle of a logical cluster in the system.
30382 + *
30383 + *
30384 + * Logical cluster of a cryptcompress file is represented in the system by
30385 + * . page cluster (in memory, primary cache, contains plain text);
30386 + * . disk cluster (in memory, secondary cache, contains transformed text).
30387 + * Primary cache is to reduce number of transform operations (compression,
30388 + * encryption), i.e. to implement transform-caching strategy.
30389 + * Secondary cache is to reduce number of I/O operations, i.e. for usual
30390 + * write-caching strategy. Page cluster is a set of pages, i.e. mapping of
30391 + * a logical cluster to the primary cache. Disk cluster is a set of items
30392 + * of the same type defined by some reiser4 item plugin id.
30393 + *
30394 + * 1. Performing modifications
30395 + *
30396 + * Every modification of a cryptcompress file is considered as a set of
30397 + * operations performed on the file's logical clusters. Every such "atomic"
30398 + * modification truncates, appends to, and/or overwrites some bytes of a
30399 + * logical cluster in the primary cache, with subsequent
30400 + * synchronization with the secondary cache (at flush time). Disk clusters,
30401 + * which live in the secondary cache, are supposed to be synchronized with
30402 + * disk. The mechanism of synchronization of primary and secondary caches
30403 + * includes so-called checkin/checkout technique described below.
30404 + *
30405 + * 2. Submitting modifications
30406 + *
30407 + * Each page cluster has an associated jnode (a special in-memory header to
30408 + * keep track of transactions in reiser4), which is attached to its first
30409 + * page when grabbing page cluster for modifications (see grab_page_cluster).
30410 + * Submitting modifications (see checkin_logical_cluster) is done per logical
30411 + * cluster and includes:
30412 + * . checkin_cluster_size;
30413 + * . checkin_page_cluster.
30414 + * checkin_cluster_size() resolves to a file size update, which completely
30415 + * defines the new size of the logical cluster (the number of the file's
30416 + * bytes in that logical cluster).
30417 + * checkin_page_cluster() captures the jnode of a page cluster and sets the
30418 + * jnode's dirty flag (if needed) to indicate that modifications are
30419 + * successfully checked in.
30420 + *
30421 + * 3. Checking out modifications
30422 + *
30423 + * Done per logical cluster at flush time (see checkout_logical_cluster).
30424 + * This is the time of synchronizing primary and secondary caches.
30425 + * checkout_logical_cluster() includes:
30426 + * . checkout_page_cluster (retrieving checked in pages).
30427 + * . uncapture jnode (including clear dirty flag and unlock)
30428 + *
30429 + * 4. Committing modifications
30430 + *
30431 + * This proceeds as a synchronization of the primary and secondary caches.
30432 + * When checking out a page cluster (the phase above), pages are
30433 + * locked/flushed/unlocked one by one in ascending order of their indexes
30434 + * into a contiguous stream, which is then transformed (compressed,
30435 + * encrypted), chopped up into items and committed to disk as a disk cluster.
30436 + *
30437 + * 5. Managing page references
30438 + *
30439 + * Every checked-in page has a special additional "control" reference,
30440 + * which is dropped at checkout. We need this to avoid unexpected eviction
30441 + * of pages from memory before checkout. Control references are managed so
30442 + * they are not accumulated with every checkin:
30443 + *
30444 + * 0
30445 + * checkin -> 1
30446 + * 0 -> checkout
30447 + * checkin -> 1
30448 + * checkin -> 1
30449 + * checkin -> 1
30450 + * 0 -> checkout
30451 + * ...
30452 + *
30453 + * Every page cluster has its own unique "cluster lock". Update/drop
30454 + * references are serialized via this lock. The number of checked-in cluster
30455 + * pages is calculated from i_size under the cluster lock. File size is updated
30456 + * at every checkin action also under cluster lock (except cases of
30457 + * appending/truncating fake logical clusters).
30458 + *
30459 + * Proof of correctness:
30460 + *
30461 + * Since we update the file size under the cluster lock, in the case of a
30462 + * non-fake logical cluster with its lock held we do have the expected number
30463 + * of checked-in pages. On the other hand, appending/truncating fake logical
30464 + * clusters doesn't change the number of checked-in pages of any cluster.
30465 + *
30466 + * NOTE-EDWARD: As cluster lock we use guard (spinlock_t) of its jnode.
30467 + * Currently, I don't see any reason to create a special lock for those
30468 + * needs.
30469 + */
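+
+/* A compact restatement of the rule above, as a sketch (refs being
+ the per-cluster control reference count):
+
+ checkin: refs = 1; (taken at the first checkin, kept by
+ repeated checkins)
+ checkout: refs = 0; (dropped in checkout_page_cluster())
+
+ so any number of checkins between two checkouts pins the page
+ cluster exactly once */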
30470 +
30471 +static inline void lock_cluster(jnode * node)
30472 +{
30473 + spin_lock_jnode(node);
30474 +}
30475 +
30476 +static inline void unlock_cluster(jnode * node)
30477 +{
30478 + spin_unlock_jnode(node);
30479 +}
30480 +
30481 +static inline void unlock_cluster_uncapture(jnode * node)
30482 +{
30483 + uncapture_cluster_jnode(node);
30484 +}
30485 +
30486 +/* Set new file size by window. Cluster lock is required. */
30487 +static void checkin_file_size(struct cluster_handle * clust,
30488 + struct inode * inode)
30489 +{
30490 + loff_t new_size;
30491 + struct reiser4_slide * win;
30492 +
30493 + assert("edward-1181", clust != NULL);
30494 + assert("edward-1182", inode != NULL);
30495 + assert("edward-1473", clust->pages != NULL);
30496 + assert("edward-1474", clust->pages[0] != NULL);
30497 + assert("edward-1475", jprivate(clust->pages[0]) != NULL);
30498 + assert_spin_locked(&(jprivate(clust->pages[0])->guard));
30499 +
30500 +
30501 + win = clust->win;
30502 + assert("edward-1183", win != NULL);
30503 +
30504 + new_size = clust_to_off(clust->index, inode) + win->off;
30505 +
30506 + switch (clust->op) {
30507 + case LC_APPOV:
30508 + if (new_size + win->count <= i_size_read(inode))
30509 + /* overwrite only */
30510 + return;
30511 + new_size += win->count;
30512 + break;
30513 + case LC_TRUNC:
30514 + break;
30515 + default:
30516 + impossible("edward-1184", "bad page cluster option");
30517 + break;
30518 + }
30519 + inode_check_scale_nolock(inode, i_size_read(inode), new_size);
30520 + i_size_write(inode, new_size);
30521 + return;
30522 +}
30523 +
30524 +static inline void checkin_cluster_size(struct cluster_handle * clust,
30525 + struct inode * inode)
30526 +{
30527 + if (clust->win)
30528 + checkin_file_size(clust, inode);
30529 +}
30530 +
30531 +static int checkin_page_cluster(struct cluster_handle * clust,
30532 + struct inode * inode)
30533 +{
30534 + int result;
30535 + jnode * node;
30536 + int old_nrpages = clust->old_nrpages;
30537 + int new_nrpages = get_new_nrpages(clust);
30538 +
30539 + node = clust->node;
30540 +
30541 + assert("edward-221", node != NULL);
30542 + assert("edward-971", clust->reserved == 1);
30543 + assert("edward-1263",
30544 + clust->reserved_prepped == estimate_update_cluster(inode));
30545 + assert("edward-1264", clust->reserved_unprepped == 0);
30546 +
30547 + if (JF_ISSET(node, JNODE_DIRTY)) {
30548 + /*
30549 + * page cluster was checked in, but not yet
30550 + * checked out, so release related resources
30551 + */
30552 + free_reserved4cluster(inode, clust,
30553 + estimate_update_cluster(inode));
30554 + __put_page_cluster(0, clust->old_nrpages,
30555 + clust->pages, inode);
30556 + } else {
30557 + result = capture_cluster_jnode(node);
30558 + if (unlikely(result)) {
30559 + unlock_cluster(node);
30560 + return result;
30561 + }
30562 + jnode_make_dirty_locked(node);
30563 + clust->reserved = 0;
30564 + }
30565 + unlock_cluster(node);
30566 +
30567 + if (new_nrpages < old_nrpages) {
30568 + /* truncate >= 1 complete pages */
30569 + __put_page_cluster(new_nrpages,
30570 + old_nrpages - new_nrpages,
30571 + clust->pages, inode);
30572 + truncate_page_cluster_range(inode,
30573 + clust->pages, clust->index,
30574 + new_nrpages,
30575 + old_nrpages - new_nrpages,
30576 + 0);
30577 + }
30578 +#if REISER4_DEBUG
30579 + clust->reserved_prepped -= estimate_update_cluster(inode);
30580 +#endif
30581 + return 0;
30582 +}
30583 +
30584 +/* Submit modifications of a logical cluster */
30585 +static int checkin_logical_cluster(struct cluster_handle * clust,
30586 + struct inode *inode)
30587 +{
30588 + int result = 0;
30589 + jnode * node;
30590 +
30591 + node = clust->node;
30592 +
30593 + assert("edward-1035", node != NULL);
30594 + assert("edward-1029", clust != NULL);
30595 + assert("edward-1030", clust->reserved == 1);
30596 + assert("edward-1031", clust->nr_pages != 0);
30597 + assert("edward-1032", clust->pages != NULL);
30598 + assert("edward-1033", clust->pages[0] != NULL);
30599 + assert("edward-1446", jnode_is_cluster_page(node));
30600 + assert("edward-1476", node == jprivate(clust->pages[0]));
30601 +
30602 + lock_cluster(node);
30603 + checkin_cluster_size(clust, inode);
30604 + /* this will unlock cluster */
30605 + result = checkin_page_cluster(clust, inode);
30606 + jput(node);
30607 + clust->node = NULL;
30608 + return result;
30609 +}
30610 +
30611 +/*
30612 + * Retrieve size of logical cluster that was checked in at
30613 + * the latest modifying session (cluster lock is required)
30614 + */
30615 +static inline void checkout_cluster_size(struct cluster_handle * clust,
30616 + struct inode * inode)
30617 +{
30618 + struct tfm_cluster *tc = &clust->tc;
30619 +
30620 + tc->len = lbytes(clust->index, inode);
30621 + assert("edward-1478", tc->len != 0);
30622 +}
30623 +
30624 +/*
30625 + * Retrieve a page cluster with the latest submitted modifications
30626 + * and flush its pages to the previously allocated contiguous stream.
30627 + */
30628 +static void checkout_page_cluster(struct cluster_handle * clust,
30629 + jnode * node, struct inode * inode)
30630 +{
30631 + int i;
30632 + int found;
30633 + int to_put;
30634 + struct tfm_cluster *tc = &clust->tc;
30635 +
30636 + /* find and put checked-in pages: the cluster is locked,
30637 + * so we must get the expected number (to_put) of pages
30638 + */
30639 + to_put = size_in_pages(lbytes(clust->index, inode));
30640 + found = find_get_pages(inode->i_mapping,
30641 + clust_to_pg(clust->index, inode),
30642 + to_put, clust->pages);
30643 + BUG_ON(found != to_put);
30644 +
30645 + __put_page_cluster(0, to_put, clust->pages, inode);
30646 + unlock_cluster_uncapture(node);
30647 +
30648 + /* Flush found pages.
30649 + *
30650 + * Note that we don't disable modifications while flushing;
30651 + * moreover, some found pages can be truncated, as we have
30652 + * released the cluster lock.
30653 + */
30654 + for (i = 0; i < found; i++) {
30655 + int in_page;
30656 + char * data;
30657 + assert("edward-1479",
30658 + clust->pages[i]->index == clust->pages[0]->index + i);
30659 +
30660 + lock_page(clust->pages[i]);
30661 + if (!PageUptodate(clust->pages[i])) {
30662 + /* page was truncated */
30663 + assert("edward-1480",
30664 + i_size_read(inode) <= page_offset(clust->pages[i]));
30665 + assert("edward-1481",
30666 + clust->pages[i]->mapping != inode->i_mapping);
30667 + unlock_page(clust->pages[i]);
30668 + break;
30669 + }
30670 + /* Update the number of bytes in the logical cluster,
30671 + * as it could be partially truncated. Note that only
30672 + * a partial truncate is possible here (a complete truncate
30673 + * cannot go through this path, as it is performed via ->kill_hook()
30674 + * called by cut_file_items(), and the latter must
30675 + * wait for the znode locked with the parent coord).
30676 + */
30677 + checkout_cluster_size(clust, inode);
30678 +
30679 + /* this can be zero, as new file size is
30680 + checked in before truncating pages */
30681 + in_page = __mbp(tc->len, i);
30682 +
30683 + data = kmap(clust->pages[i]);
30684 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
30685 + data, in_page);
30686 + kunmap(clust->pages[i]);
30687 +
30688 + if (PageDirty(clust->pages[i]))
30689 + cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
30690 +
30691 + unlock_page(clust->pages[i]);
30692 +
30693 + if (in_page < PAGE_CACHE_SIZE)
30694 + /* end of the file */
30695 + break;
30696 + }
30697 + put_found_pages(clust->pages, found); /* find_get_pages */
30698 + tc->lsize = tc->len;
30699 + return;
30700 +}
30701 +
30702 +/* Check out modifications of a logical cluster */
30703 +int checkout_logical_cluster(struct cluster_handle * clust,
30704 + jnode * node, struct inode *inode)
30705 +{
30706 + int result;
30707 + struct tfm_cluster *tc = &clust->tc;
30708 +
30709 + assert("edward-980", node != NULL);
30710 + assert("edward-236", inode != NULL);
30711 + assert("edward-237", clust != NULL);
30712 + assert("edward-240", !clust->win);
30713 + assert("edward-241", reiser4_schedulable());
30714 + assert("edward-718", cryptcompress_inode_ok(inode));
30715 +
30716 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
30717 + if (result) {
30718 + warning("edward-1430", "alloc stream failed with ret=%d",
30719 + result);
30720 + return RETERR(-E_REPEAT);
30721 + }
30722 + lock_cluster(node);
30723 +
30724 + if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
30725 + /* race with another flush */
30726 + warning("edward-982",
30727 + "checking out logical cluster %lu of inode %llu: "
30728 + "jnode is not dirty", clust->index,
30729 + (unsigned long long)get_inode_oid(inode));
30730 + unlock_cluster(node);
30731 + return RETERR(-E_REPEAT);
30732 + }
30733 + cluster_reserved2grabbed(estimate_update_cluster(inode));
30734 +
30735 + /* this will unlock cluster */
30736 + checkout_page_cluster(clust, node, inode);
30737 + return 0;
30738 +}
30739 +
30740 +/* set hint for the cluster of the index @index */
30741 +static void set_hint_cluster(struct inode *inode, hint_t * hint,
30742 + cloff_t index, znode_lock_mode mode)
30743 +{
30744 + reiser4_key key;
30745 + assert("edward-722", cryptcompress_inode_ok(inode));
30746 + assert("edward-723",
30747 + inode_file_plugin(inode) ==
30748 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
30749 +
30750 + inode_file_plugin(inode)->key_by_inode(inode,
30751 + clust_to_off(index, inode),
30752 + &key);
30753 +
30754 + reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
30755 + hint->offset = get_key_offset(&key);
30756 + hint->mode = mode;
30757 +}
30758 +
30759 +void invalidate_hint_cluster(struct cluster_handle * clust)
30760 +{
30761 + assert("edward-1291", clust != NULL);
30762 + assert("edward-1292", clust->hint != NULL);
30763 +
30764 + done_lh(&clust->hint->lh);
30765 + hint_clr_valid(clust->hint);
30766 +}
30767 +
30768 +static void put_hint_cluster(struct cluster_handle * clust,
30769 + struct inode *inode, znode_lock_mode mode)
30770 +{
30771 + assert("edward-1286", clust != NULL);
30772 + assert("edward-1287", clust->hint != NULL);
30773 +
30774 + set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
30775 + invalidate_hint_cluster(clust);
30776 +}
30777 +
30778 +static int balance_dirty_page_cluster(struct cluster_handle * clust,
30779 + struct inode *inode, loff_t off,
30780 + loff_t to_file,
30781 + int nr_dirtied)
30782 +{
30783 + int result;
30784 + struct cryptcompress_info * info;
30785 +
30786 + assert("edward-724", inode != NULL);
30787 + assert("edward-725", cryptcompress_inode_ok(inode));
30788 + assert("edward-1547",
30789 + nr_dirtied != 0 && nr_dirtied <= cluster_nrpages(inode));
30790 +
30791 + /* set next window params */
30792 + move_update_window(inode, clust, off, to_file);
30793 +
30794 + result = update_sd_cryptcompress(inode);
30795 + if (result)
30796 + return result;
30797 + assert("edward-726", clust->hint->lh.owner == NULL);
30798 + info = cryptcompress_inode_data(inode);
30799 +
30800 + mutex_unlock(&info->checkin_mutex);
30801 + reiser4_throttle_write(inode, nr_dirtied);
30802 + mutex_lock(&info->checkin_mutex);
30803 + return 0;
30804 +}
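+/*
+ * Note on the unlock/lock pair above: reiser4_throttle_write() may
+ * kick writeback, and writepages_cryptcompress() (see below) takes
+ * the same checkin_mutex, so holding it across the throttle could
+ * presumably deadlock against that path.
+ */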
30805 +
30806 +/* Zero out the page cluster, process it, and maybe try
30807 + to capture its pages */
30808 +static int write_hole(struct inode *inode, struct cluster_handle * clust,
30809 + loff_t file_off, loff_t to_file)
30810 +{
30811 + int result = 0;
30812 + unsigned cl_off, cl_count = 0;
30813 + unsigned to_pg, pg_off;
30814 + struct reiser4_slide * win;
30815 +
30816 + assert("edward-190", clust != NULL);
30817 + assert("edward-1069", clust->win != NULL);
30818 + assert("edward-191", inode != NULL);
30819 + assert("edward-727", cryptcompress_inode_ok(inode));
30820 + assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
30821 + assert("edward-1154",
30822 + ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
30823 +
30824 + win = clust->win;
30825 +
30826 + assert("edward-1070", win != NULL);
30827 + assert("edward-201", win->stat == HOLE_WINDOW);
30828 + assert("edward-192", cluster_ok(clust, inode));
30829 +
30830 + if (win->off == 0 && win->count == inode_cluster_size(inode)) {
30831 + /* This part of the hole will be represented by a "fake"
30832 + * logical cluster, i.e. one which doesn't have a
30833 + * corresponding disk cluster until someone modifies this
30834 + * logical cluster and makes it dirty.
30835 + * So just move forward here..
30836 + */
30837 + move_update_window(inode, clust, file_off, to_file);
30838 + return 0;
30839 + }
30840 + cl_count = win->count; /* number of zeroes to write */
30841 + cl_off = win->off;
30842 + pg_off = off_to_pgoff(win->off);
30843 +
30844 + while (cl_count) {
30845 + struct page *page;
30846 + page = clust->pages[off_to_pg(cl_off)];
30847 +
30848 + assert("edward-284", page != NULL);
30849 +
30850 + to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
30851 + lock_page(page);
30852 + zero_user(page, pg_off, to_pg);
30853 + SetPageUptodate(page);
30854 + set_page_dirty_notag(page);
30855 + mark_page_accessed(page);
30856 + unlock_page(page);
30857 +
30858 + cl_off += to_pg;
30859 + cl_count -= to_pg;
30860 + pg_off = 0;
30861 + }
30862 + if (!win->delta) {
30863 + /* only zeroes in this window,
30864 + try to capture them */
30865 + result = checkin_logical_cluster(clust, inode);
30866 + if (result)
30867 + return result;
30868 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
30869 + result = balance_dirty_page_cluster(clust,
30870 + inode, file_off, to_file,
30871 + win_count_to_nrpages(win));
30872 + } else
30873 + move_update_window(inode, clust, file_off, to_file);
30874 + return result;
30875 +}
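+/*
+ * Worked example for the zeroing loop above (a sketch assuming 4K
+ * pages): with win->off == 1000 and win->count == 5000, the first
+ * iteration zeroes bytes 1000..4095 of page 0 (to_pg == 3096), the
+ * second zeroes bytes 0..1903 of page 1 (to_pg == 1904), after
+ * which cl_count reaches zero and the loop exits.
+ */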
30876 +
30877 +/*
30878 + The main disk search procedure for the cryptcompress plugin:
30879 + . scans all items of the disk cluster with lock mode @mode
30880 + . optionally reads each one (if @read is set)
30881 + . optionally makes its znode dirty (if write lock mode was specified)
30882 +
30883 + NOTE-EDWARD: Callers should handle the case when the disk cluster
30884 + is incomplete (-EIO)
30885 +*/
30886 +int find_disk_cluster(struct cluster_handle * clust,
30887 + struct inode *inode, int read, znode_lock_mode mode)
30888 +{
30889 + flow_t f;
30890 + hint_t *hint;
30891 + int result = 0;
30892 + int was_grabbed;
30893 + ra_info_t ra_info;
30894 + file_plugin *fplug;
30895 + item_plugin *iplug;
30896 + struct tfm_cluster *tc;
30897 + struct cryptcompress_info * info;
30898 +
30899 + assert("edward-138", clust != NULL);
30900 + assert("edward-728", clust->hint != NULL);
30901 + assert("edward-226", reiser4_schedulable());
30902 + assert("edward-137", inode != NULL);
30903 + assert("edward-729", cryptcompress_inode_ok(inode));
30904 +
30905 + hint = clust->hint;
30906 + fplug = inode_file_plugin(inode);
30907 + was_grabbed = get_current_context()->grabbed_blocks;
30908 + info = cryptcompress_inode_data(inode);
30909 + tc = &clust->tc;
30910 +
30911 + assert("edward-462", !tfm_cluster_is_uptodate(tc));
30912 + assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
30913 +
30914 + dclust_init_extension(hint);
30915 +
30916 + /* set key of the first disk cluster item */
30917 + fplug->flow_by_inode(inode,
30918 + (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
30919 + 0 /* kernel space */ ,
30920 + inode_scaled_cluster_size(inode),
30921 + clust_to_off(clust->index, inode), READ_OP, &f);
30922 + if (mode == ZNODE_WRITE_LOCK) {
30923 + /* reserve space for flush to make dirty all the leaf nodes
30924 + which contain the disk cluster */
30925 + result =
30926 + reiser4_grab_space_force(estimate_dirty_cluster(inode),
30927 + BA_CAN_COMMIT);
30928 + if (result)
30929 + goto out;
30930 + }
30931 +
30932 + ra_info.key_to_stop = f.key;
30933 + set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30934 +
30935 + while (f.length) {
30936 + result = find_cluster_item(hint, &f.key, mode,
30937 + NULL, FIND_EXACT,
30938 + (mode == ZNODE_WRITE_LOCK ?
30939 + CBK_FOR_INSERT : 0));
30940 + switch (result) {
30941 + case CBK_COORD_NOTFOUND:
30942 + result = 0;
30943 + if (inode_scaled_offset
30944 + (inode, clust_to_off(clust->index, inode)) ==
30945 + get_key_offset(&f.key)) {
30946 + /* first item not found; this is treated
30947 + as the disk cluster being absent */
30948 + clust->dstat = FAKE_DISK_CLUSTER;
30949 + goto out;
30950 + }
30951 + /* we are outside the cluster, stop search here */
30952 + assert("edward-146",
30953 + f.length != inode_scaled_cluster_size(inode));
30954 + goto ok;
30955 + case CBK_COORD_FOUND:
30956 + assert("edward-148",
30957 + hint->ext_coord.coord.between == AT_UNIT);
30958 + assert("edward-460",
30959 + hint->ext_coord.coord.unit_pos == 0);
30960 +
30961 + coord_clear_iplug(&hint->ext_coord.coord);
30962 + result = zload_ra(hint->ext_coord.coord.node, &ra_info);
30963 + if (unlikely(result))
30964 + goto out;
30965 + iplug = item_plugin_by_coord(&hint->ext_coord.coord);
30966 + assert("edward-147",
30967 + item_id_by_coord(&hint->ext_coord.coord) ==
30968 + CTAIL_ID);
30969 +
30970 + result = iplug->s.file.read(NULL, &f, hint);
30971 + if (result) {
30972 + zrelse(hint->ext_coord.coord.node);
30973 + goto out;
30974 + }
30975 + if (mode == ZNODE_WRITE_LOCK) {
30976 + /* Don't dirty more nodes than was
30977 + estimated (see comments before
30978 + estimate_dirty_cluster()). Missed nodes
30979 + will be read back at flush time if they
30980 + are evicted from memory */
30981 + if (dclust_get_extension_ncount(hint) <=
30982 + estimate_dirty_cluster(inode))
30983 + znode_make_dirty(hint->ext_coord.coord.node);
30984 +
30985 + znode_set_convertible(hint->ext_coord.coord.
30986 + node);
30987 + }
30988 + zrelse(hint->ext_coord.coord.node);
30989 + break;
30990 + default:
30991 + goto out;
30992 + }
30993 + }
30994 + ok:
30995 + /* at least one item was found */
30996 + /* NOTE-EDWARD: Callers should handle the case
30997 + when disk cluster is incomplete (-EIO) */
30998 + tc->len = inode_scaled_cluster_size(inode) - f.length;
30999 + tc->lsize = lbytes(clust->index, inode);
31000 + assert("edward-1196", tc->len > 0);
31001 + assert("edward-1406", tc->lsize > 0);
31002 +
31003 + if (hint_is_unprepped_dclust(clust->hint)) {
31004 + clust->dstat = UNPR_DISK_CLUSTER;
31005 + } else if (clust->index == info->trunc_index) {
31006 + clust->dstat = TRNC_DISK_CLUSTER;
31007 + } else {
31008 + clust->dstat = PREP_DISK_CLUSTER;
31009 + dclust_set_extension_dsize(clust->hint, tc->len);
31010 + }
31011 + out:
31012 + assert("edward-1339",
31013 + get_current_context()->grabbed_blocks >= was_grabbed);
31014 + grabbed2free(get_current_context(),
31015 + get_current_super_private(),
31016 + get_current_context()->grabbed_blocks - was_grabbed);
31017 + return result;
31018 +}
31019 +
31020 +int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
31021 + znode_lock_mode lock_mode)
31022 +{
31023 + reiser4_key key;
31024 + ra_info_t ra_info;
31025 +
31026 + assert("edward-730", reiser4_schedulable());
31027 + assert("edward-731", clust != NULL);
31028 + assert("edward-732", inode != NULL);
31029 +
31030 + if (hint_is_valid(clust->hint)) {
31031 + assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
31032 + assert("edward-1294",
31033 + znode_is_write_locked(clust->hint->lh.node));
31034 + /* already have a valid locked position */
31035 + return (clust->dstat ==
31036 + FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
31037 + CBK_COORD_FOUND);
31038 + }
31039 + key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
31040 + &key);
31041 + ra_info.key_to_stop = key;
31042 + set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
31043 +
31044 + return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
31045 + CBK_FOR_INSERT);
31046 +}
31047 +
31048 +/* Read the needed cluster pages before modifying.
31049 + On success, @clust->hint contains a locked position in the tree.
31050 + Also:
31051 + . find and set the disk cluster state
31052 + . make the disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
31053 +*/
31054 +static int read_some_cluster_pages(struct inode * inode,
31055 + struct cluster_handle * clust)
31056 +{
31057 + int i;
31058 + int result = 0;
31059 + item_plugin *iplug;
31060 + struct reiser4_slide * win = clust->win;
31061 + znode_lock_mode mode = ZNODE_WRITE_LOCK;
31062 +
31063 + iplug = item_plugin_by_id(CTAIL_ID);
31064 +
31065 + assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
31066 +
31067 +#if REISER4_DEBUG
31068 + if (clust->nr_pages == 0) {
31069 + /* start write hole from fake disk cluster */
31070 + assert("edward-1117", win != NULL);
31071 + assert("edward-1118", win->stat == HOLE_WINDOW);
31072 + assert("edward-1119", new_logical_cluster(clust, inode));
31073 + }
31074 +#endif
31075 + if (new_logical_cluster(clust, inode)) {
31076 + /*
31077 + a new page cluster is about to be written, nothing to read
31078 + */
31079 + assert("edward-734", reiser4_schedulable());
31080 + assert("edward-735", clust->hint->lh.owner == NULL);
31081 +
31082 + if (clust->nr_pages) {
31083 + int off;
31084 + struct page * pg;
31085 + assert("edward-1419", clust->pages != NULL);
31086 + pg = clust->pages[clust->nr_pages - 1];
31087 + assert("edward-1420", pg != NULL);
31088 + off = off_to_pgoff(win->off+win->count+win->delta);
31089 + if (off) {
31090 + lock_page(pg);
31091 + zero_user_segment(pg, off, PAGE_CACHE_SIZE);
31092 + unlock_page(pg);
31093 + }
31094 + }
31095 + clust->dstat = FAKE_DISK_CLUSTER;
31096 + return 0;
31097 + }
31098 + /*
31099 + Here we should search for the disk cluster to figure out its
31100 + real state. There is one more important reason to do the disk
31101 + search: we need to make the disk cluster _dirty_ if it exists.
31102 + */
31103 +
31104 + /* if a window is specified, read only the pages
31105 + that will be modified partially */
31106 +
31107 + for (i = 0; i < clust->nr_pages; i++) {
31108 + struct page *pg = clust->pages[i];
31109 +
31110 + lock_page(pg);
31111 + if (PageUptodate(pg)) {
31112 + unlock_page(pg);
31113 + continue;
31114 + }
31115 + unlock_page(pg);
31116 +
31117 + if (win &&
31118 + i >= size_in_pages(win->off) &&
31119 + i < off_to_pg(win->off + win->count + win->delta))
31120 + /* page will be completely overwritten */
31121 + continue;
31122 +
31123 + if (win && (i == clust->nr_pages - 1) &&
31124 + /* the last page is
31125 + partially modified,
31126 + not uptodate .. */
31127 + (size_in_pages(i_size_read(inode)) <= pg->index)) {
31128 + /* .. and appended,
31129 + so set zeroes to the rest */
31130 + int offset;
31131 + lock_page(pg);
31132 + assert("edward-1260",
31133 + size_in_pages(win->off + win->count +
31134 + win->delta) - 1 == i);
31135 +
31136 + offset =
31137 + off_to_pgoff(win->off + win->count + win->delta);
31138 + zero_user_segment(pg, offset, PAGE_CACHE_SIZE);
31139 + unlock_page(pg);
31140 + /* still not uptodate */
31141 + break;
31142 + }
31143 + lock_page(pg);
31144 + result = do_readpage_ctail(inode, clust, pg, mode);
31145 +
31146 + assert("edward-1526", ergo(!result, PageUptodate(pg)));
31147 + unlock_page(pg);
31148 + if (result) {
31149 + warning("edward-219", "do_readpage_ctail failed");
31150 + goto out;
31151 + }
31152 + }
31153 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
31154 + /* the disk cluster is unclaimed, but we need to make its znodes
31155 + * dirty so that flush will update and convert its content
31156 + */
31157 + result = find_disk_cluster(clust, inode,
31158 + 0 /* do not read items */,
31159 + mode);
31160 + }
31161 + out:
31162 + tfm_cluster_clr_uptodate(&clust->tc);
31163 + return result;
31164 +}
31165 +
31166 +static int should_create_unprepped_cluster(struct cluster_handle * clust,
31167 + struct inode * inode)
31168 +{
31169 + assert("edward-737", clust != NULL);
31170 +
31171 + switch (clust->dstat) {
31172 + case PREP_DISK_CLUSTER:
31173 + case UNPR_DISK_CLUSTER:
31174 + return 0;
31175 + case FAKE_DISK_CLUSTER:
31176 + if (clust->win &&
31177 + clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
31178 + assert("edward-1172",
31179 + new_logical_cluster(clust, inode));
31180 + return 0;
31181 + }
31182 + return 1;
31183 + default:
31184 + impossible("edward-1173", "bad disk cluster state");
31185 + return 0;
31186 + }
31187 +}
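+/*
+ * In short: existing disk clusters (PREP/UNPR) never need an
+ * unprepped insertion; a FAKE one does, unless we are merely
+ * passing over a hole (HOLE_WINDOW with no pages grabbed).
+ */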
31188 +
31189 +static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
31190 + struct inode *inode)
31191 +{
31192 + int result;
31193 +
31194 + assert("edward-1123", reiser4_schedulable());
31195 + assert("edward-737", clust != NULL);
31196 + assert("edward-738", inode != NULL);
31197 + assert("edward-739", cryptcompress_inode_ok(inode));
31198 + assert("edward-1053", clust->hint != NULL);
31199 +
31200 + if (!should_create_unprepped_cluster(clust, inode)) {
31201 + if (clust->reserved) {
31202 + cluster_reserved2free(estimate_insert_cluster(inode));
31203 +#if REISER4_DEBUG
31204 + assert("edward-1267",
31205 + clust->reserved_unprepped ==
31206 + estimate_insert_cluster(inode));
31207 + clust->reserved_unprepped -=
31208 + estimate_insert_cluster(inode);
31209 +#endif
31210 + }
31211 + return 0;
31212 + }
31213 + assert("edward-1268", clust->reserved);
31214 + cluster_reserved2grabbed(estimate_insert_cluster(inode));
31215 +#if REISER4_DEBUG
31216 + assert("edward-1441",
31217 + clust->reserved_unprepped == estimate_insert_cluster(inode));
31218 + clust->reserved_unprepped -= estimate_insert_cluster(inode);
31219 +#endif
31220 + result = ctail_insert_unprepped_cluster(clust, inode);
31221 + if (result)
31222 + return result;
31223 +
31224 + inode_add_bytes(inode, inode_cluster_size(inode));
31225 +
31226 + assert("edward-743", cryptcompress_inode_ok(inode));
31227 + assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
31228 +
31229 + clust->dstat = UNPR_DISK_CLUSTER;
31230 + return 0;
31231 +}
31232 +
31233 +/* . Grab page cluster for read, write, setattr, etc. operations;
31234 + * . Truncate its complete pages, if needed;
31235 + */
31236 +int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust,
31237 + rw_op rw)
31238 +{
31239 + assert("edward-177", inode != NULL);
31240 + assert("edward-741", cryptcompress_inode_ok(inode));
31241 + assert("edward-740", clust->pages != NULL);
31242 +
31243 + set_cluster_nrpages(clust, inode);
31244 + reset_cluster_pgset(clust, cluster_nrpages(inode));
31245 + return grab_page_cluster(inode, clust, rw);
31246 +}
31247 +
31248 +/* Truncate a complete page cluster of index @index.
31249 + * This is called by the ->kill_hook() method of the item
31250 + * plugin when deleting a disk cluster of that index.
31251 + */
31252 +void truncate_complete_page_cluster(struct inode *inode, cloff_t index,
31253 + int even_cows)
31254 +{
31255 + int found;
31256 + int nr_pages;
31257 + jnode *node;
31258 + struct page *pages[MAX_CLUSTER_NRPAGES];
31259 +
31260 + node = jlookup(current_tree, get_inode_oid(inode),
31261 + clust_to_pg(index, inode));
31262 + nr_pages = size_in_pages(lbytes(index, inode));
31263 + assert("edward-1483", nr_pages != 0);
31264 + if (!node)
31265 + goto truncate;
31266 + found = find_get_pages(inode->i_mapping,
31267 + clust_to_pg(index, inode),
31268 + cluster_nrpages(inode), pages);
31269 + if (!found) {
31270 + assert("edward-1484", jnode_truncate_ok(inode, index));
31271 + return;
31272 + }
31273 + lock_cluster(node);
31274 +
31275 + if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
31276 + && index == 0)
31277 + /* converting to unix_file is in progress */
31278 + JF_CLR(node, JNODE_CLUSTER_PAGE);
31279 + if (JF_ISSET(node, JNODE_DIRTY)) {
31280 + /*
31281 + * @nr_pages were checked in, but not yet checked out -
31282 + * we need to release them. (Also, there can be pages
31283 + * attached to the page cache by read(), etc. - don't
31284 + * take them into account.)
31285 + */
31286 + assert("edward-1198", found >= nr_pages);
31287 +
31288 + /* free disk space grabbed for disk cluster converting */
31289 + cluster_reserved2grabbed(estimate_update_cluster(inode));
31290 + grabbed2free(get_current_context(),
31291 + get_current_super_private(),
31292 + estimate_update_cluster(inode));
31293 + __put_page_cluster(0, nr_pages, pages, inode);
31294 +
31295 + /* This will clear dirty bit, uncapture and unlock jnode */
31296 + unlock_cluster_uncapture(node);
31297 + } else
31298 + unlock_cluster(node);
31299 + jput(node); /* jlookup */
31300 + put_found_pages(pages, found); /* find_get_pages */
31301 + truncate:
31302 + if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
31303 + index == 0)
31304 + return;
31305 + truncate_page_cluster_range(inode, pages, index, 0,
31306 + cluster_nrpages(inode),
31307 + even_cows);
31308 + assert("edward-1201",
31309 + ergo(!reiser4_inode_get_flag(inode,
31310 + REISER4_FILE_CONV_IN_PROGRESS),
31311 + jnode_truncate_ok(inode, index)));
31312 + return;
31313 +}
31314 +
31315 +/*
31316 + * Set up the cluster handle @clust of a logical cluster before
31317 + * modifications which are to be committed:
31318 + *
31319 + * . grab cluster pages;
31320 + * . reserve disk space;
31321 + * . maybe read pages from disk and set the disk cluster dirty;
31322 + * . maybe write hole and check in (partially zeroed) logical cluster;
31323 + * . create 'unprepped' disk cluster for new or fake logical one.
31324 + */
31325 +static int prepare_logical_cluster(struct inode *inode,
31326 + loff_t file_off, /* write position
31327 + in the file */
31328 + loff_t to_file, /* bytes of users data
31329 + to write to the file */
31330 + struct cluster_handle * clust,
31331 + logical_cluster_op op)
31332 +{
31333 + int result = 0;
31334 + struct reiser4_slide * win = clust->win;
31335 +
31336 + reset_cluster_params(clust);
31337 + cluster_set_tfm_act(&clust->tc, TFMA_READ);
31338 +#if REISER4_DEBUG
31339 + clust->ctx = get_current_context();
31340 +#endif
31341 + assert("edward-1190", op != LC_INVAL);
31342 +
31343 + clust->op = op;
31344 +
31345 + result = prepare_page_cluster(inode, clust, WRITE_OP);
31346 + if (result)
31347 + return result;
31348 + assert("edward-1447",
31349 + ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
31350 + assert("edward-1448",
31351 + ergo(clust->nr_pages != 0,
31352 + jnode_is_cluster_page(jprivate(clust->pages[0]))));
31353 +
31354 + result = reserve4cluster(inode, clust);
31355 + if (result)
31356 + goto err1;
31357 + result = read_some_cluster_pages(inode, clust);
31358 + if (result) {
31359 + free_reserved4cluster(inode,
31360 + clust,
31361 + estimate_update_cluster(inode) +
31362 + estimate_insert_cluster(inode));
31363 + goto err1;
31364 + }
31365 + assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
31366 +
31367 + result = cryptcompress_make_unprepped_cluster(clust, inode);
31368 + if (result)
31369 + goto err2;
31370 + if (win && win->stat == HOLE_WINDOW) {
31371 + result = write_hole(inode, clust, file_off, to_file);
31372 + if (result)
31373 + goto err2;
31374 + }
31375 + return 0;
31376 + err2:
31377 + free_reserved4cluster(inode, clust,
31378 + estimate_update_cluster(inode));
31379 + err1:
31380 + put_page_cluster(clust, inode, WRITE_OP);
31381 + assert("edward-1125", result == -ENOSPC);
31382 + return result;
31383 +}
31384 +
31385 +/* set window by two offsets */
31386 +static void set_window(struct cluster_handle * clust,
31387 + struct reiser4_slide * win, struct inode *inode,
31388 + loff_t o1, loff_t o2)
31389 +{
31390 + assert("edward-295", clust != NULL);
31391 + assert("edward-296", inode != NULL);
31392 + assert("edward-1071", win != NULL);
31393 + assert("edward-297", o1 <= o2);
31394 +
31395 + clust->index = off_to_clust(o1, inode);
31396 +
31397 + win->off = off_to_cloff(o1, inode);
31398 + win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
31399 + o2 - o1);
31400 + win->delta = 0;
31401 +
31402 + clust->win = win;
31403 +}
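+/*
+ * Worked example (a sketch assuming 64K logical clusters): for
+ * o1 == 70000 and o2 == 200000 we get clust->index == 1,
+ * win->off == 70000 - 65536 == 4464 and win->count ==
+ * min(65536 - 4464, 130000) == 61072, i.e. the window never
+ * extends past the logical cluster containing @o1.
+ */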
31404 +
31405 +static int set_cluster_by_window(struct inode *inode,
31406 + struct cluster_handle * clust,
31407 + struct reiser4_slide * win, size_t length,
31408 + loff_t file_off)
31409 +{
31410 + int result;
31411 +
31412 + assert("edward-197", clust != NULL);
31413 + assert("edward-1072", win != NULL);
31414 + assert("edward-198", inode != NULL);
31415 +
31416 + result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
31417 + if (result)
31418 + return result;
31419 +
31420 + if (file_off > i_size_read(inode)) {
31421 + /* Uhmm, hole in cryptcompress file... */
31422 + loff_t hole_size;
31423 + hole_size = file_off - inode->i_size;
31424 +
31425 + set_window(clust, win, inode, inode->i_size, file_off);
31426 + win->stat = HOLE_WINDOW;
31427 + if (win->off + hole_size < inode_cluster_size(inode))
31428 + /* there is also user's data to append to the hole */
31429 + win->delta = min(inode_cluster_size(inode) -
31430 + (win->off + win->count), length);
31431 + return 0;
31432 + }
31433 + set_window(clust, win, inode, file_off, file_off + length);
31434 + win->stat = DATA_WINDOW;
31435 + return 0;
31436 +}
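+/*
+ * Worked example of the hole branch above (a sketch assuming 64K
+ * logical clusters): with i_size == 70000, file_off == 72000 and
+ * length == 1000, the window gets win->off == 4464 and
+ * win->count == 2000 (the zeroes to write); since the hole ends in
+ * the same cluster, win->delta == 1000 bytes of user's data are
+ * appended right after it.
+ */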
31437 +
31438 +int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
31439 + int count)
31440 +{
31441 + int result = 0;
31442 + int (*setting_actor)(struct cluster_handle * clust, int count);
31443 +
31444 + assert("edward-1358", clust != NULL);
31445 + assert("edward-1359", page != NULL);
31446 + assert("edward-1360", page->mapping != NULL);
31447 + assert("edward-1361", page->mapping->host != NULL);
31448 +
31449 + setting_actor =
31450 + (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
31451 + result = setting_actor(clust, count);
31452 + clust->index = pg_to_clust(page->index, page->mapping->host);
31453 + return result;
31454 +}
31455 +
31456 +/* reset all the params that do not get updated */
31457 +void reset_cluster_params(struct cluster_handle * clust)
31458 +{
31459 + assert("edward-197", clust != NULL);
31460 +
31461 + clust->dstat = INVAL_DISK_CLUSTER;
31462 + clust->tc.uptodate = 0;
31463 + clust->tc.len = 0;
31464 +}
31465 +
31466 +/* the heart of write_cryptcompress */
31467 +static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
31468 + const char __user *buf, size_t to_write,
31469 + loff_t pos, struct dispatch_context *cont)
31470 +{
31471 + int i;
31472 + hint_t *hint;
31473 + int result = 0;
31474 + size_t count;
31475 + struct reiser4_slide win;
31476 + struct cluster_handle clust;
31477 + struct cryptcompress_info * info;
31478 +
31479 + assert("edward-154", buf != NULL);
31480 + assert("edward-161", reiser4_schedulable());
31481 + assert("edward-748", cryptcompress_inode_ok(inode));
31482 + assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
31483 + assert("edward-1274", get_current_context()->grabbed_blocks == 0);
31484 +
31485 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31486 + if (hint == NULL)
31487 + return RETERR(-ENOMEM);
31488 +
31489 + result = load_file_hint(file, hint);
31490 + if (result) {
31491 + kfree(hint);
31492 + return result;
31493 + }
31494 + count = to_write;
31495 +
31496 + reiser4_slide_init(&win);
31497 + cluster_init_read(&clust, &win);
31498 + clust.hint = hint;
31499 + info = cryptcompress_inode_data(inode);
31500 +
31501 + mutex_lock(&info->checkin_mutex);
31502 +
31503 + result = set_cluster_by_window(inode, &clust, &win, to_write, pos);
31504 + if (result)
31505 + goto out;
31506 +
31507 + if (next_window_stat(&win) == HOLE_WINDOW) {
31508 + /* write a hole in this iteration,
31509 + separately from the loop below */
31510 + result = write_dispatch_hook(file, inode,
31511 + pos, &clust, cont);
31512 + if (result)
31513 + goto out;
31514 + result = prepare_logical_cluster(inode, pos, count, &clust,
31515 + LC_APPOV);
31516 + if (result)
31517 + goto out;
31518 + }
31519 + do {
31520 + const char __user * src;
31521 + unsigned page_off, to_page;
31522 +
31523 + assert("edward-750", reiser4_schedulable());
31524 +
31525 + result = write_dispatch_hook(file, inode,
31526 + pos + to_write - count,
31527 + &clust, cont);
31528 + if (result)
31529 + goto out;
31530 + if (cont->state == DISPATCH_ASSIGNED_NEW)
31531 + /* done_lh was called in write_dispatch_hook */
31532 + goto out_no_longterm_lock;
31533 +
31534 + result = prepare_logical_cluster(inode, pos, count, &clust,
31535 + LC_APPOV);
31536 + if (result)
31537 + goto out;
31538 +
31539 + assert("edward-751", cryptcompress_inode_ok(inode));
31540 + assert("edward-204", win.stat == DATA_WINDOW);
31541 + assert("edward-1288", hint_is_valid(clust.hint));
31542 + assert("edward-752",
31543 + znode_is_write_locked(hint->ext_coord.coord.node));
31544 + put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
31545 +
31546 + /* set write position in page */
31547 + page_off = off_to_pgoff(win.off);
31548 +
31549 + /* copy user's data to cluster pages */
31550 + for (i = off_to_pg(win.off), src = buf;
31551 + i < size_in_pages(win.off + win.count);
31552 + i++, src += to_page) {
31553 + to_page = __mbp(win.off + win.count, i) - page_off;
31554 + assert("edward-1039",
31555 + page_off + to_page <= PAGE_CACHE_SIZE);
31556 + assert("edward-287", clust.pages[i] != NULL);
31557 +
31558 + fault_in_pages_readable(src, to_page);
31559 +
31560 + lock_page(clust.pages[i]);
31561 + result =
31562 + __copy_from_user((char *)kmap(clust.pages[i]) +
31563 + page_off, src, to_page);
31564 + kunmap(clust.pages[i]);
31565 + if (unlikely(result)) {
31566 + unlock_page(clust.pages[i]);
31567 + result = -EFAULT;
31568 + goto err2;
31569 + }
31570 + SetPageUptodate(clust.pages[i]);
31571 + set_page_dirty_notag(clust.pages[i]);
31572 + flush_dcache_page(clust.pages[i]);
31573 + mark_page_accessed(clust.pages[i]);
31574 + unlock_page(clust.pages[i]);
31575 + page_off = 0;
31576 + }
31577 + assert("edward-753", cryptcompress_inode_ok(inode));
31578 +
31579 + result = checkin_logical_cluster(&clust, inode);
31580 + if (result)
31581 + goto err2;
31582 +
31583 + buf += win.count;
31584 + count -= win.count;
31585 +
31586 + result = balance_dirty_page_cluster(&clust, inode, 0, count,
31587 + win_count_to_nrpages(&win));
31588 + if (result)
31589 + goto err1;
31590 + assert("edward-755", hint->lh.owner == NULL);
31591 + reset_cluster_params(&clust);
31592 + continue;
31593 + err2:
31594 + put_page_cluster(&clust, inode, WRITE_OP);
31595 + err1:
31596 + if (clust.reserved)
31597 + free_reserved4cluster(inode,
31598 + &clust,
31599 + estimate_update_cluster(inode));
31600 + break;
31601 + } while (count);
31602 + out:
31603 + done_lh(&hint->lh);
31604 + save_file_hint(file, hint);
31605 + out_no_longterm_lock:
31606 + mutex_unlock(&info->checkin_mutex);
31607 + kfree(hint);
31608 + put_cluster_handle(&clust);
31609 + assert("edward-195",
31610 + ergo((to_write == count),
31611 + (result < 0 || cont->state == DISPATCH_ASSIGNED_NEW)));
31612 + return (to_write - count) ? (to_write - count) : result;
31613 +}
31614 +
31615 +/**
31616 + * plugin->write()
31617 + * @file: file to write to
31618 + * @buf: address of user-space buffer
31619 + * @count: number of bytes to write
31620 + * @off: position in file to write to
31621 + */
31622 +ssize_t write_cryptcompress(struct file *file, const char __user *buf,
31623 + size_t count, loff_t *off,
31624 + struct dispatch_context *cont)
31625 +{
31626 + ssize_t result;
31627 + struct inode *inode;
31628 + reiser4_context *ctx;
31629 + loff_t pos = *off;
31630 + struct cryptcompress_info *info;
31631 +
31632 + assert("edward-1449", cont->state == DISPATCH_INVAL_STATE);
31633 +
31634 + inode = file->f_dentry->d_inode;
31635 + assert("edward-196", cryptcompress_inode_ok(inode));
31636 +
31637 + info = cryptcompress_inode_data(inode);
31638 + ctx = get_current_context();
31639 +
31640 + result = generic_write_checks(file, &pos, &count, 0);
31641 + if (unlikely(result != 0)) {
31642 + context_set_commit_async(ctx);
31643 + return result;
31644 + }
31645 + if (unlikely(count == 0))
31646 + return 0;
31647 + result = file_remove_suid(file);
31648 + if (unlikely(result != 0)) {
31649 + context_set_commit_async(ctx);
31650 + return result;
31651 + }
31652 + /* remove_suid might create a transaction */
31653 + reiser4_txn_restart(ctx);
31654 +
31655 + result = do_write_cryptcompress(file, inode, buf, count, pos, cont);
31656 +
31657 + if (unlikely(result < 0)) {
31658 + context_set_commit_async(ctx);
31659 + return result;
31660 + }
31661 + /* update position in a file */
31662 + *off = pos + result;
31663 + return result;
31664 +}
31665 +
31666 +/* plugin->readpages */
31667 +int readpages_cryptcompress(struct file *file, struct address_space *mapping,
31668 + struct list_head *pages, unsigned nr_pages)
31669 +{
31670 + reiser4_context * ctx;
31671 + int ret;
31672 +
31673 + ctx = reiser4_init_context(mapping->host->i_sb);
31674 + if (IS_ERR(ctx)) {
31675 + ret = PTR_ERR(ctx);
31676 + goto err;
31677 + }
31678 + /* cryptcompress file can be built of ctail items only */
31679 + ret = readpages_ctail(file, mapping, pages);
31680 + reiser4_txn_restart(ctx);
31681 + reiser4_exit_context(ctx);
31682 + if (ret) {
31683 +err:
31684 + put_pages_list(pages);
31685 + }
31686 + return ret;
31687 +}
31688 +
31689 +static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
31690 +{
31691 + /* reserve one block to update stat data item */
31692 + assert("edward-1193",
31693 + inode_file_plugin(inode)->estimate.update ==
31694 + estimate_update_common);
31695 + return estimate_update_common(inode);
31696 +}
31697 +
31698 +/**
31699 + * plugin->read
31700 + * @file: file to read from
31701 + * @buf: address of user-space buffer
31702 + * @size: number of bytes to read
31703 + * @off: position in file to read from
31704 + */
31705 +ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
31706 + loff_t * off)
31707 +{
31708 + ssize_t result;
31709 + struct inode *inode;
31710 + reiser4_context *ctx;
31711 + struct cryptcompress_info *info;
31712 + reiser4_block_nr needed;
31713 +
31714 + inode = file->f_dentry->d_inode;
31715 + assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
31716 +
31717 + ctx = reiser4_init_context(inode->i_sb);
31718 + if (IS_ERR(ctx))
31719 + return PTR_ERR(ctx);
31720 +
31721 + info = cryptcompress_inode_data(inode);
31722 + needed = cryptcompress_estimate_read(inode);
31723 +
31724 + result = reiser4_grab_space(needed, BA_CAN_COMMIT);
31725 + if (result != 0) {
31726 + reiser4_exit_context(ctx);
31727 + return result;
31728 + }
31729 + result = do_sync_read(file, buf, size, off);
31730 +
31731 + context_set_commit_async(ctx);
31732 + reiser4_exit_context(ctx);
31733 +
31734 + return result;
31735 +}
31736 +
31737 +/* Look for a disk cluster and keep the lookup result in @found.
31738 + * If @index > 0, then find the disk cluster of index (@index - 1);
31739 + * If @index == 0, then find the rightmost disk cluster.
31740 + * Keep the incremented index of the found disk cluster in @found.
31741 + * @found == 0 means that the disk cluster was not found (in the
31742 + * latter case (@index == 0) it means the file has no disk clusters).
31743 + */
31744 +static int lookup_disk_cluster(struct inode *inode, cloff_t * found,
31745 + cloff_t index)
31746 +{
31747 + int result;
31748 + reiser4_key key;
31749 + loff_t offset;
31750 + hint_t *hint;
31751 + lock_handle *lh;
31752 + lookup_bias bias;
31753 + coord_t *coord;
31754 + item_plugin *iplug;
31755 +
31756 + assert("edward-1131", inode != NULL);
31757 + assert("edward-95", cryptcompress_inode_ok(inode));
31758 +
31759 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31760 + if (hint == NULL)
31761 + return RETERR(-ENOMEM);
31762 + hint_init_zero(hint);
31763 + lh = &hint->lh;
31764 +
31765 + bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
31766 + offset =
31767 + (index ? clust_to_off(index, inode) -
31768 + 1 : get_key_offset(reiser4_max_key()));
31769 +
31770 + key_by_inode_cryptcompress(inode, offset, &key);
31771 +
31772 + /* find the last item of this object */
31773 + result =
31774 + find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
31775 + bias, 0);
31776 + if (cbk_errored(result)) {
31777 + done_lh(lh);
31778 + kfree(hint);
31779 + return result;
31780 + }
31781 + if (result == CBK_COORD_NOTFOUND) {
31782 + /* no real disk clusters */
31783 + done_lh(lh);
31784 + kfree(hint);
31785 + *found = 0;
31786 + return 0;
31787 + }
31788 + /* disk cluster is found */
31789 + coord = &hint->ext_coord.coord;
31790 + coord_clear_iplug(coord);
31791 + result = zload(coord->node);
31792 + if (unlikely(result)) {
31793 + done_lh(lh);
31794 + kfree(hint);
31795 + return result;
31796 + }
31797 + iplug = item_plugin_by_coord(coord);
31798 + assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
31799 + assert("edward-1202", ctail_ok(coord));
31800 +
31801 + item_key_by_coord(coord, &key);
31802 + *found = off_to_clust(get_key_offset(&key), inode) + 1;
31803 +
31804 + assert("edward-1132", ergo(index, index == *found));
31805 +
31806 + zrelse(coord->node);
31807 + done_lh(lh);
31808 + kfree(hint);
31809 + return 0;
31810 +}
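+/*
+ * Typical usage (sketch): lookup_disk_cluster(inode, &found, 0)
+ * sets found == 0 when the file has no real disk clusters and
+ * 1 + the index of the rightmost one otherwise; with @index > 0
+ * it checks for disk cluster (@index - 1) and, per the assertion
+ * above, yields found == @index when that cluster exists.
+ */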
31811 +
31812 +static int find_fake_appended(struct inode *inode, cloff_t * index)
31813 +{
31814 + return lookup_disk_cluster(inode, index,
31815 + 0 /* find last real one */ );
31816 +}
31817 +
31818 +/* Set the left coord when a unit is not found by node_lookup().
31819 + This takes into account that there can be holes in a sequence
31820 + of disk clusters */
31821 +
31822 +static void adjust_left_coord(coord_t * left_coord)
31823 +{
31824 + switch (left_coord->between) {
31825 + case AFTER_UNIT:
31826 + left_coord->between = AFTER_ITEM;
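+ /* fall through */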
31827 + case AFTER_ITEM:
31828 + case BEFORE_UNIT:
31829 + break;
31830 + default:
31831 + impossible("edward-1204", "bad left coord to cut");
31832 + }
31833 + return;
31834 +}
31835 +
31836 +#define CRC_CUT_TREE_MIN_ITERATIONS 64
31837 +
31838 +/* plugin->cut_tree_worker */
31839 +int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
31840 + const reiser4_key * to_key,
31841 + reiser4_key * smallest_removed,
31842 + struct inode *object, int truncate,
31843 + int *progress)
31844 +{
31845 + lock_handle next_node_lock;
31846 + coord_t left_coord;
31847 + int result;
31848 +
31849 + assert("edward-1158", tap->coord->node != NULL);
31850 + assert("edward-1159", znode_is_write_locked(tap->coord->node));
31851 + assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
31852 +
31853 + *progress = 0;
31854 + init_lh(&next_node_lock);
31855 +
31856 + while (1) {
31857 + znode *node; /* node from which items are cut */
31858 + node_plugin *nplug; /* node plugin for @node */
31859 +
31860 + node = tap->coord->node;
31861 +
31862 + /* Move next_node_lock to the next node on the left. */
31863 + result =
31864 + reiser4_get_left_neighbor(&next_node_lock, node,
31865 + ZNODE_WRITE_LOCK,
31866 + GN_CAN_USE_UPPER_LEVELS);
31867 + if (result != 0 && result != -E_NO_NEIGHBOR)
31868 + break;
31869 + /* FIXME-EDWARD: Check whether we can delete the node as a whole */
31870 + result = reiser4_tap_load(tap);
31871 + if (result)
31872 + return result;
31873 +
31874 + /* Prepare the second (right) point for cut_node() */
31875 + if (*progress)
31876 + coord_init_last_unit(tap->coord, node);
31877 +
31878 + else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
31879 + /* set rightmost unit for items without a lookup method */
31880 + tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
31881 +
31882 + nplug = node->nplug;
31883 +
31884 + assert("edward-1161", nplug);
31885 + assert("edward-1162", nplug->lookup);
31886 +
31887 + /* left_coord is leftmost unit cut from @node */
31888 + result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
31889 +
31890 + if (IS_CBKERR(result))
31891 + break;
31892 +
31893 + if (result == CBK_COORD_NOTFOUND)
31894 + adjust_left_coord(&left_coord);
31895 +
31896 + /* adjust coordinates so that they are set to existing units */
31897 + if (coord_set_to_right(&left_coord)
31898 + || coord_set_to_left(tap->coord)) {
31899 + result = 0;
31900 + break;
31901 + }
31902 +
31903 + if (coord_compare(&left_coord, tap->coord) ==
31904 + COORD_CMP_ON_RIGHT) {
31905 + /* keys from @from_key to @to_key are not in the tree */
31906 + result = 0;
31907 + break;
31908 + }
31909 +
31910 + /* cut data from one node */
31911 + *smallest_removed = *reiser4_min_key();
31912 + result = kill_node_content(&left_coord,
31913 + tap->coord,
31914 + from_key,
31915 + to_key,
31916 + smallest_removed,
31917 + next_node_lock.node,
31918 + object, truncate);
31919 + reiser4_tap_relse(tap);
31920 +
31921 + if (result)
31922 + break;
31923 +
31924 + ++(*progress);
31925 +
31926 + /* Check whether all items with keys >= from_key were removed
31927 + * from the tree. */
31928 + if (keyle(smallest_removed, from_key))
31929 + /* result = 0; */
31930 + break;
31931 +
31932 + if (next_node_lock.node == NULL)
31933 + break;
31934 +
31935 + result = reiser4_tap_move(tap, &next_node_lock);
31936 + done_lh(&next_node_lock);
31937 + if (result)
31938 + break;
31939 +
31940 + /* Break long cut_tree operation (deletion of a large file) if
31941 + * atom requires commit. */
31942 + if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
31943 + && current_atom_should_commit()) {
31944 + result = -E_REPEAT;
31945 + break;
31946 + }
31947 + }
31948 + done_lh(&next_node_lock);
31949 + return result;
31950 +}
31951 +
31952 +/* Append or expand a hole in two steps:
31953 + * 1) zero out the rightmost page of the rightmost non-fake
31954 + * logical cluster;
31955 + * 2) expand the hole via fake logical clusters (just increase i_size)
31956 + */
31957 +static int cryptcompress_append_hole(struct inode *inode /* with old size */,
31958 + loff_t new_size)
31959 +{
31960 + int result = 0;
31961 + hint_t *hint;
31962 + lock_handle *lh;
31963 + loff_t hole_size;
31964 + int nr_zeroes;
31965 + struct reiser4_slide win;
31966 + struct cluster_handle clust;
31967 +
31968 + assert("edward-1133", inode->i_size < new_size);
31969 + assert("edward-1134", reiser4_schedulable());
31970 + assert("edward-1135", cryptcompress_inode_ok(inode));
31971 + assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
31972 + assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
31973 +
31974 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31975 + if (hint == NULL)
31976 + return RETERR(-ENOMEM);
31977 + hint_init_zero(hint);
31978 + lh = &hint->lh;
31979 +
31980 + reiser4_slide_init(&win);
31981 + cluster_init_read(&clust, &win);
31982 + clust.hint = hint;
31983 +
31984 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31985 + if (result)
31986 + goto out;
31987 + if (off_to_cloff(inode->i_size, inode) == 0)
31988 + goto append_fake;
31989 + hole_size = new_size - inode->i_size;
31990 + nr_zeroes =
31991 + inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
31992 + if (hole_size < nr_zeroes)
31993 + nr_zeroes = hole_size;
31994 + set_window(&clust, &win, inode, inode->i_size,
31995 + inode->i_size + nr_zeroes);
31996 + win.stat = HOLE_WINDOW;
31997 +
31998 + assert("edward-1137",
31999 + clust.index == off_to_clust(inode->i_size, inode));
32000 +
32001 + result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV);
32002 +
32003 + assert("edward-1271", !result || result == -ENOSPC);
32004 + if (result)
32005 + goto out;
32006 + assert("edward-1139",
32007 + clust.dstat == PREP_DISK_CLUSTER ||
32008 + clust.dstat == UNPR_DISK_CLUSTER);
32009 +
32010 + assert("edward-1431", hole_size >= nr_zeroes);
32011 + if (hole_size == nr_zeroes)
32012 + /* nothing to append anymore */
32013 + goto out;
32014 + append_fake:
32015 + INODE_SET_SIZE(inode, new_size);
32016 + out:
32017 + done_lh(lh);
32018 + kfree(hint);
32019 + put_cluster_handle(&clust);
32020 + return result;
32021 +}
32022 +
32023 +static int update_cryptcompress_size(struct inode *inode, loff_t new_size,
32024 + int update_sd)
32025 +{
32026 + return (new_size & ((loff_t) (inode_cluster_size(inode)) - 1)
32027 + ? 0 : reiser4_update_file_size(inode, new_size, update_sd));
32028 +}
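+/*
+ * Example (a sketch assuming 64K logical clusters): for
+ * new_size == 131072 the size is cluster-aligned and
+ * reiser4_update_file_size() is called; for new_size == 70000 the
+ * function returns 0 and i_size is set later by the partial
+ * truncate path (see INODE_SET_SIZE() in prune_cryptcompress()).
+ */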
32029 +
32030 +/* Prune a cryptcompress file in two steps:
32031 + * 1) cut all nominated logical clusters except the leftmost one,
32032 + * which is to be partially truncated. Note that there can be
32033 + * "holes" represented by fake logical clusters.
32034 + * 2) zero out and capture the leftmost partially truncated logical
32035 + * cluster, if it is not fake; otherwise prune the fake logical
32036 + * cluster (just decrease i_size).
32037 + */
32038 +static int prune_cryptcompress(struct inode *inode, loff_t new_size,
32039 + int update_sd, cloff_t aidx)
32040 +{
32041 + int result = 0;
32042 + unsigned nr_zeroes;
32043 + loff_t to_prune;
32044 + loff_t old_size;
32045 + cloff_t ridx;
32046 +
32047 + hint_t *hint;
32048 + lock_handle *lh;
32049 + struct reiser4_slide win;
32050 + struct cluster_handle clust;
32051 +
32052 + assert("edward-1140", inode->i_size >= new_size);
32053 + assert("edward-1141", reiser4_schedulable());
32054 + assert("edward-1142", cryptcompress_inode_ok(inode));
32055 + assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
32056 +
32057 + old_size = inode->i_size;
32058 +
32059 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
32060 + if (hint == NULL)
32061 + return RETERR(-ENOMEM);
32062 + hint_init_zero(hint);
32063 + lh = &hint->lh;
32064 +
32065 + reiser4_slide_init(&win);
32066 + cluster_init_read(&clust, &win);
32067 + clust.hint = hint;
32068 +
32069 + /* calculate the index of the leftmost logical cluster
32070 + that will be completely truncated */
32071 + ridx = size_in_lc(new_size, inode);
32072 +
32073 + /* truncate all disk clusters starting from @ridx */
32074 + assert("edward-1174", ridx <= aidx);
32075 + old_size = inode->i_size;
32076 + if (ridx != aidx) {
32077 + struct cryptcompress_info * info;
32078 + info = cryptcompress_inode_data(inode);
32079 + result = cut_file_items(inode,
32080 + clust_to_off(ridx, inode),
32081 + update_sd,
32082 + clust_to_off(aidx, inode),
32083 + update_cryptcompress_size);
32084 + info->trunc_index = ULONG_MAX;
32085 + if (result)
32086 + goto out;
32087 + }
32088 + /*
32089 + * there can be pages of fake logical clusters, truncate them
32090 + */
32091 + truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode));
32092 + assert("edward-1524",
32093 + pages_truncate_ok(inode, clust_to_pg(ridx, inode)));
32094 + /*
32095 + * now perform partial truncate of last logical cluster
32096 + */
32097 + if (!off_to_cloff(new_size, inode)) {
32098 + /* no partial truncate is needed */
32099 + assert("edward-1145", inode->i_size == new_size);
32100 + goto truncate_fake;
32101 + }
32102 + assert("edward-1146", new_size < inode->i_size);
32103 +
32104 + to_prune = inode->i_size - new_size;
32105 +
32106 + /* check if the last logical cluster is fake */
32107 + result = lookup_disk_cluster(inode, &aidx, ridx);
32108 + if (result)
32109 + goto out;
32110 + if (!aidx)
32111 + /* yup, this is a fake one */
32112 + goto truncate_fake;
32113 +
32114 + assert("edward-1148", aidx == ridx);
32115 +
32116 + /* do partial truncate of the last page cluster,
32117 + and try to capture it */
32118 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
32119 + if (result)
32120 + goto out;
32121 + nr_zeroes = (off_to_pgoff(new_size) ?
32122 + PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
32123 + set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
32124 + win.stat = HOLE_WINDOW;
32125 +
32126 + assert("edward-1149", clust.index == ridx - 1);
32127 +
32128 + result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC);
32129 + if (result)
32130 + goto out;
32131 + assert("edward-1151",
32132 + clust.dstat == PREP_DISK_CLUSTER ||
32133 + clust.dstat == UNPR_DISK_CLUSTER);
32134 +
32135 + assert("edward-1191", inode->i_size == new_size);
32136 + assert("edward-1206", body_truncate_ok(inode, ridx));
32137 + truncate_fake:
32138 + /* drop all the pages that don't have jnodes (i.e. pages
32139 + which cannot be truncated by cut_file_items() because
32140 + of holes represented by fake disk clusters) including
32141 + the pages of the partially truncated cluster which was
32142 + released by prepare_logical_cluster() */
32143 + INODE_SET_SIZE(inode, new_size);
32144 + truncate_inode_pages(inode->i_mapping, new_size);
32145 + out:
32146 + assert("edward-1334", !result || result == -ENOSPC);
32147 + assert("edward-1497",
32148 + pages_truncate_ok(inode, size_in_pages(new_size)));
32149 +
32150 + done_lh(lh);
32151 + kfree(hint);
32152 + put_cluster_handle(&clust);
32153 + return result;
32154 +}
32155 +
32156 +/* Prepare a cryptcompress file for truncate:
32157 + * prune or append the rightmost fake logical clusters (if any)
32158 + */
32159 +static int start_truncate_fake(struct inode *inode, cloff_t aidx,
32160 + loff_t new_size, int update_sd)
32161 +{
32162 + int result = 0;
32163 + int bytes;
32164 +
32165 + if (new_size > inode->i_size) {
32166 + /* append */
32167 + if (inode->i_size < clust_to_off(aidx, inode))
32168 + /* no fake bytes */
32169 + return 0;
32170 + bytes = new_size - inode->i_size;
32171 + INODE_SET_SIZE(inode, inode->i_size + bytes);
32172 + } else {
32173 + /* prune */
32174 + if (inode->i_size <= clust_to_off(aidx, inode))
32175 + /* no fake bytes */
32176 + return 0;
32177 + bytes = inode->i_size -
32178 + max(new_size, clust_to_off(aidx, inode));
32179 + if (!bytes)
32180 + return 0;
32181 + INODE_SET_SIZE(inode, inode->i_size - bytes);
32182 + /* In the case of a fake prune we need to drop the page
32183 + cluster. There are only 2 cases for a partially
32184 + truncated page:
32185 + 1. If it is dirty, then it is anonymous
32186 + (was dirtied via mmap), and will be captured
32187 + later via ->capture().
32188 + 2. If it is clean, then it is filled with zeroes.
32189 + In both cases we don't need to dirty and
32190 + capture it here.
32191 + */
32191 + truncate_inode_pages(inode->i_mapping, inode->i_size);
32192 + }
32193 + if (update_sd)
32194 + result = update_sd_cryptcompress(inode);
32195 + return result;
32196 +}
32197 +
32198 +/**
32199 + * This is called in setattr_cryptcompress when it is used to truncate,
32200 + * and in delete_object_cryptcompress
32201 + */
32202 +static int cryptcompress_truncate(struct inode *inode, /* old size */
32203 + loff_t new_size, /* new size */
32204 + int update_sd)
32205 +{
32206 + int result;
32207 + cloff_t aidx;
32208 +
32209 + result = find_fake_appended(inode, &aidx);
32210 + if (result)
32211 + return result;
32212 + assert("edward-1208",
32213 + ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
32214 +
32215 + result = start_truncate_fake(inode, aidx, new_size, update_sd);
32216 + if (result)
32217 + return result;
32218 + if (inode->i_size == new_size)
32219 + /* nothing to truncate anymore */
32220 + return 0;
32221 + result = (inode->i_size < new_size ?
32222 + cryptcompress_append_hole(inode, new_size) :
32223 + prune_cryptcompress(inode, new_size, update_sd, aidx));
32224 + if (!result && update_sd)
32225 + result = update_sd_cryptcompress(inode);
32226 + return result;
32227 +}
32228 +
32229 +/**
32230 + * Capture a page cluster.
32231 + * @clust must be set up by the caller.
32232 + */
32233 +static int capture_page_cluster(struct cluster_handle * clust,
32234 + struct inode * inode)
32235 +{
32236 + int result;
32237 +
32238 + assert("edward-1073", clust != NULL);
32239 + assert("edward-1074", inode != NULL);
32240 + assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
32241 +
32242 + result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
32243 + if (result)
32244 + return result;
32245 +
32246 + set_cluster_pages_dirty(clust, inode);
32247 + result = checkin_logical_cluster(clust, inode);
32248 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
32249 + if (unlikely(result))
32250 + put_page_cluster(clust, inode, WRITE_OP);
32251 + return result;
32252 +}
32253 +
32254 +/* Starting from @index, find tagged pages of the same page cluster.
32255 + * Clear the tag for each of them. Return the number of found pages.
32256 + */
32257 +static int find_anon_page_cluster(struct address_space * mapping,
32258 + pgoff_t * index, struct page ** pages)
32259 +{
32260 + int i = 0;
32261 + int found;
32262 + spin_lock_irq(&mapping->tree_lock);
32263 + do {
32264 + /* looking for one page */
32265 + found = radix_tree_gang_lookup_tag(&mapping->page_tree,
32266 + (void **)&pages[i],
32267 + *index, 1,
32268 + PAGECACHE_TAG_REISER4_MOVED);
32269 + if (!found)
32270 + break;
32271 + if (!same_page_cluster(pages[0], pages[i]))
32272 + break;
32273 +
32274 + /* found */
32275 + page_cache_get(pages[i]);
32276 + *index = pages[i]->index + 1;
32277 +
32278 + radix_tree_tag_clear(&mapping->page_tree,
32279 + pages[i]->index,
32280 + PAGECACHE_TAG_REISER4_MOVED);
32281 + if (last_page_in_cluster(pages[i++]))
32282 + break;
32283 + } while (1);
32284 + spin_unlock_irq(&mapping->tree_lock);
32285 + return i;
32286 +}
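+/*
+ * Note: each page returned above was referenced via page_cache_get()
+ * and had its PAGECACHE_TAG_REISER4_MOVED tag cleared, so the caller
+ * must drop the references (see put_found_pages() in
+ * capture_anon_pages() below) even if capturing fails.
+ */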
32287 +
32288 +#define MAX_PAGES_TO_CAPTURE (1024)
32289 +
32290 +/* Capture anonymous page clusters */
32291 +static int capture_anon_pages(struct address_space * mapping, pgoff_t * index,
32292 + int to_capture)
32293 +{
32294 + int count = 0;
32295 + int found = 0;
32296 + int result = 0;
32297 + hint_t *hint;
32298 + lock_handle *lh;
32299 + struct inode * inode;
32300 + struct cluster_handle clust;
32301 + struct page * pages[MAX_CLUSTER_NRPAGES];
32302 +
32303 + assert("edward-1127", mapping != NULL);
32304 + assert("edward-1128", mapping->host != NULL);
32305 + assert("edward-1440", mapping->host->i_mapping == mapping);
32306 +
32307 + inode = mapping->host;
32308 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
32309 + if (hint == NULL)
32310 + return RETERR(-ENOMEM);
32311 + hint_init_zero(hint);
32312 + lh = &hint->lh;
32313 +
32314 + cluster_init_read(&clust, NULL);
32315 + clust.hint = hint;
32316 +
32317 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
32318 + if (result)
32319 + goto out;
32320 +
32321 + while (to_capture > 0) {
32322 + found = find_anon_page_cluster(mapping, index, pages);
32323 + if (!found) {
32324 + *index = (pgoff_t) - 1;
32325 + break;
32326 + }
32327 + move_cluster_forward(&clust, inode, pages[0]->index);
32328 + result = capture_page_cluster(&clust, inode);
32329 +
32330 + put_found_pages(pages, found); /* find_anon_page_cluster */
32331 + if (result)
32332 + break;
32333 + to_capture -= clust.nr_pages;
32334 + count += clust.nr_pages;
32335 + }
32336 + if (result) {
32337 + warning("edward-1077",
32338 + "Capture failed (inode %llu, result=%i, captured=%d)\n",
32339 + (unsigned long long)get_inode_oid(inode), result, count);
32340 + } else {
32341 + assert("edward-1078", ergo(found > 0, count > 0));
32342 + if (to_capture <= 0)
32343 + /* there may be more pages left */
32344 + __mark_inode_dirty(inode, I_DIRTY_PAGES);
32345 + result = count;
32346 + }
32347 + out:
32348 + done_lh(lh);
32349 + kfree(hint);
32350 + put_cluster_handle(&clust);
32351 + return result;
32352 +}
32353 +
32354 +/* Returns true if inode's mapping has dirty pages
32355 + which do not belong to any atom */
32356 +static int cryptcompress_inode_has_anon_pages(struct inode *inode)
32357 +{
32358 + int result;
32359 + spin_lock_irq(&inode->i_mapping->tree_lock);
32360 + result = radix_tree_tagged(&inode->i_mapping->page_tree,
32361 + PAGECACHE_TAG_REISER4_MOVED);
32362 + spin_unlock_irq(&inode->i_mapping->tree_lock);
32363 + return result;
32364 +}
32365 +
32366 +/* plugin->writepages */
32367 +int writepages_cryptcompress(struct address_space *mapping,
32368 + struct writeback_control *wbc)
32369 +{
32370 + int result = 0;
32371 + long to_capture;
32372 + pgoff_t nrpages;
32373 + pgoff_t index = 0;
32374 + struct inode *inode;
32375 + struct cryptcompress_info *info;
32376 +
32377 + inode = mapping->host;
32378 + if (!cryptcompress_inode_has_anon_pages(inode))
32379 + goto end;
32380 + info = cryptcompress_inode_data(inode);
32381 + nrpages = size_in_pages(i_size_read(inode));
32382 +
32383 + if (wbc->sync_mode != WB_SYNC_ALL)
32384 + to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE);
32385 + else
32386 + to_capture = MAX_PAGES_TO_CAPTURE;
32387 + do {
32388 + reiser4_context *ctx;
32389 +
32390 + ctx = reiser4_init_context(inode->i_sb);
32391 + if (IS_ERR(ctx)) {
32392 + result = PTR_ERR(ctx);
32393 + break;
32394 + }
32395 + /* avoid recursive calls to ->sync_inodes */
32396 + ctx->nobalance = 1;
32397 +
32398 + assert("edward-1079",
32399 + lock_stack_isclean(get_current_lock_stack()));
32400 +
32401 + reiser4_txn_restart_current();
32402 +
32403 + if (get_current_context()->entd) {
32404 + if (mutex_trylock(&info->checkin_mutex) == 0) {
32405 + /* the mutex might be occupied by
32406 + the entd caller */
32407 + result = RETERR(-EBUSY);
32408 + reiser4_exit_context(ctx);
32409 + break;
32410 + }
32411 + } else
32412 + mutex_lock(&info->checkin_mutex);
32413 +
32414 + result = capture_anon_pages(inode->i_mapping, &index,
32415 + to_capture);
32416 + mutex_unlock(&info->checkin_mutex);
32417 +
32418 + if (result < 0) {
32419 + reiser4_exit_context(ctx);
32420 + break;
32421 + }
32422 + wbc->nr_to_write -= result;
32423 + if (wbc->sync_mode != WB_SYNC_ALL) {
32424 + reiser4_exit_context(ctx);
32425 + break;
32426 + }
32427 + result = txnmgr_force_commit_all(inode->i_sb, 0);
32428 + reiser4_exit_context(ctx);
32429 + } while (result >= 0 && index < nrpages);
32430 +
32431 + end:
32432 + if (is_in_reiser4_context()) {
32433 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
32434 + /* there are already pages to flush; flush them out,
32435 + do not delay until the end of reiser4_sync_inodes */
32436 + reiser4_writeout(inode->i_sb, wbc);
32437 + get_current_context()->nr_captured = 0;
32438 + }
32439 + }
32440 + return result;
32441 +}
32442 +
32443 +/* plugin->ioctl */
32444 +int ioctl_cryptcompress(struct inode *inode, struct file *filp,
32445 + unsigned int cmd, unsigned long arg)
32446 +{
32447 + return RETERR(-ENOSYS);
32448 +}
32449 +
32450 +/* plugin->mmap */
32451 +int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
32452 +{
32453 + int result;
32454 + struct inode *inode;
32455 + reiser4_context *ctx;
32456 +
32457 + inode = file->f_dentry->d_inode;
32458 + ctx = reiser4_init_context(inode->i_sb);
32459 + if (IS_ERR(ctx))
32460 + return PTR_ERR(ctx);
32461 + /*
32462 + * generic_file_mmap will do update_atime. Grab space for stat data
32463 + * update.
32464 + */
32465 + result = reiser4_grab_space_force
32466 + (inode_file_plugin(inode)->estimate.update(inode),
32467 + BA_CAN_COMMIT);
32468 + if (result) {
32469 + reiser4_exit_context(ctx);
32470 + return result;
32471 + }
32472 + result = generic_file_mmap(file, vma);
32473 + reiser4_exit_context(ctx);
32474 + return result;
32475 +}
32476 +
32477 +/* plugin->delete_object */
32478 +int delete_object_cryptcompress(struct inode *inode)
32479 +{
32480 + int result;
32481 + struct cryptcompress_info * info;
32482 +
32483 + assert("edward-429", inode->i_nlink == 0);
32484 +
32485 + reiser4_txn_restart_current();
32486 + info = cryptcompress_inode_data(inode);
32487 +
32488 + mutex_lock(&info->checkin_mutex);
32489 + result = cryptcompress_truncate(inode, 0, 0);
32490 + mutex_unlock(&info->checkin_mutex);
32491 +
32492 + if (result) {
32493 + warning("edward-430",
32494 + "cannot truncate cryptcompress file %lli: %i",
32495 + (unsigned long long)get_inode_oid(inode),
32496 + result);
32497 + }
32498 + truncate_inode_pages(inode->i_mapping, 0);
32499 + assert("edward-1487", pages_truncate_ok(inode, 0));
32500 + /* and remove stat data */
32501 + return reiser4_delete_object_common(inode);
32502 +}
32503 +
32504 +/*
32505 + * plugin->setattr
32506 + * This implements actual truncate (see comments in reiser4/page_cache.c)
32507 + */
32508 +int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
32509 +{
32510 + int result;
32511 + struct inode *inode;
32512 + struct cryptcompress_info * info;
32513 +
32514 + inode = dentry->d_inode;
32515 + info = cryptcompress_inode_data(inode);
32516 +
32517 + if (attr->ia_valid & ATTR_SIZE) {
32518 + if (i_size_read(inode) != attr->ia_size) {
32519 + reiser4_context *ctx;
32520 + loff_t old_size;
32521 +
32522 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
32523 + if (IS_ERR(ctx))
32524 + return PTR_ERR(ctx);
32525 + result = setattr_dispatch_hook(inode);
32526 + if (result) {
32527 + context_set_commit_async(ctx);
32528 + reiser4_exit_context(ctx);
32529 + return result;
32530 + }
32531 + old_size = i_size_read(inode);
32532 + inode_check_scale(inode, old_size, attr->ia_size);
32533 +
32534 + mutex_lock(&info->checkin_mutex);
32535 + result = cryptcompress_truncate(inode,
32536 + attr->ia_size,
32537 + 1/* update sd */);
32538 + mutex_unlock(&info->checkin_mutex);
32539 + if (result) {
32540 + warning("edward-1192",
32541 + "truncate_cryptcompress failed: oid %lli, "
32542 + "old size %lld, new size %lld, retval %d",
32543 + (unsigned long long)
32544 + get_inode_oid(inode), old_size,
32545 + attr->ia_size, result);
32546 + }
32547 + context_set_commit_async(ctx);
32548 + reiser4_exit_context(ctx);
32549 + } else
32550 + result = 0;
32551 + } else
32552 + result = reiser4_setattr_common(dentry, attr);
32553 + return result;
32554 +}
32555 +
32556 +/* plugin->release */
32557 +int release_cryptcompress(struct inode *inode, struct file *file)
32558 +{
32559 + reiser4_context *ctx = reiser4_init_context(inode->i_sb);
32560 +
32561 + if (IS_ERR(ctx))
32562 + return PTR_ERR(ctx);
32563 + reiser4_free_file_fsdata(file);
32564 + reiser4_exit_context(ctx);
32565 + return 0;
32566 +}
32567 +
32568 +/* plugin->prepare_write */
32569 +int write_begin_cryptcompress(struct file *file, struct page *page,
32570 + unsigned from, unsigned to)
32571 +{
32572 + return do_prepare_write(file, page, from, to);
32573 +}
32574 +
32575 +/* plugin->commit_write */
32576 +int write_end_cryptcompress(struct file *file, struct page *page,
32577 + unsigned from, unsigned to)
32578 +{
32579 + int ret;
32580 + hint_t *hint;
32581 + lock_handle *lh;
32582 + struct inode * inode;
32583 + struct cluster_handle clust;
32584 +
32585 + unlock_page(page);
32586 +
32587 + inode = page->mapping->host;
32588 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
32589 + if (hint == NULL)
32590 + return RETERR(-ENOMEM);
32591 + hint_init_zero(hint);
32592 + lh = &hint->lh;
32593 +
32594 + cluster_init_read(&clust, NULL);
32595 + clust.hint = hint;
32596 +
32597 + ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
32598 + if (ret)
32599 + goto out;
32600 + clust.index = pg_to_clust(page->index, inode);
32601 + ret = capture_page_cluster(&clust, inode);
32602 + if (ret)
32603 + warning("edward-1557",
32604 + "Capture failed (inode %llu, result=%i)",
32605 + (unsigned long long)get_inode_oid(inode), ret);
32606 + out:
32607 + done_lh(lh);
32608 + kfree(hint);
32609 + put_cluster_handle(&clust);
32610 + return ret;
32611 +}
32612 +
32613 +/* plugin->bmap */
32614 +sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock)
32615 +{
32616 + return -EINVAL;
32617 +}
32618 +
32619 +/*
32620 + Local variables:
32621 + c-indentation-style: "K&R"
32622 + mode-name: "LC"
32623 + c-basic-offset: 8
32624 + tab-width: 8
32625 + fill-column: 80
32626 + scroll-step: 1
32627 + End:
32628 +*/
32629 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.33/fs/reiser4/plugin/file/cryptcompress.h
32630 --- linux-2.6.33.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 01:00:00.000000000 +0100
32631 +++ linux-2.6.33/fs/reiser4/plugin/file/cryptcompress.h 2010-03-04 19:33:22.000000000 +0100
32632 @@ -0,0 +1,616 @@
32633 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
32634 +/* See http://www.namesys.com/cryptcompress_design.html */
32635 +
32636 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
32637 +#define __FS_REISER4_CRYPTCOMPRESS_H__
32638 +
32639 +#include "../../page_cache.h"
32640 +#include "../compress/compress.h"
32641 +#include "../crypto/cipher.h"
32642 +
32643 +#include <linux/pagemap.h>
32644 +
32645 +#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
32646 +#define MAX_CLUSTER_SHIFT 16
32647 +#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
32648 +#define DC_CHECKSUM_SIZE 4
32649 +
32650 +#define MIN_LATTICE_FACTOR 1
32651 +#define MAX_LATTICE_FACTOR 32
32652 +
32653 +/* this mask contains all non-standard plugins that might
32654 +   be present in the reiser4-specific part of an inode managed
32655 +   by the cryptcompress file plugin */
32656 +#define cryptcompress_mask \
32657 + ((1 << PSET_FILE) | \
32658 + (1 << PSET_CLUSTER) | \
32659 + (1 << PSET_CIPHER) | \
32660 + (1 << PSET_DIGEST) | \
32661 + (1 << PSET_COMPRESSION) | \
32662 + (1 << PSET_COMPRESSION_MODE))
32663 +
32664 +#if REISER4_DEBUG
32665 +static inline int cluster_shift_ok(int shift)
32666 +{
32667 + return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
32668 +}
32669 +#endif
32670 +
32671 +#if REISER4_DEBUG
32672 +#define INODE_PGCOUNT(inode) \
32673 +({ \
32674 + assert("edward-1530", inode_file_plugin(inode) == \
32675 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
32676 + atomic_read(&cryptcompress_inode_data(inode)->pgcount); \
32677 + })
32678 +#define INODE_PGCOUNT_INC(inode) \
32679 +do { \
32680 + assert("edward-1531", inode_file_plugin(inode) == \
32681 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
32682 + atomic_inc(&cryptcompress_inode_data(inode)->pgcount); \
32683 +} while (0)
32684 +#define INODE_PGCOUNT_DEC(inode) \
32685 +do { \
32686 + if (inode_file_plugin(inode) == \
32687 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) \
32688 + atomic_dec(&cryptcompress_inode_data(inode)->pgcount); \
32689 +} while (0)
32690 +#else
32691 +#define INODE_PGCOUNT(inode) (0)
32692 +#define INODE_PGCOUNT_INC(inode)
32693 +#define INODE_PGCOUNT_DEC(inode)
32694 +#endif /* REISER4_DEBUG */
32695 +
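/* Editorial sketch -- not part of the patch. The INODE_PGCOUNT* macros
 * above follow a common pattern: accounting that exists only in debug
 * builds compiles down to no-ops otherwise. A minimal userspace model,
 * with a hypothetical MYFS_DEBUG switch standing in for REISER4_DEBUG:
 */
#include <stdatomic.h>

#define MYFS_DEBUG 1

#if MYFS_DEBUG
static atomic_int pgcount;
#define PGCOUNT_INC()	atomic_fetch_add(&pgcount, 1)
#define PGCOUNT_DEC()	atomic_fetch_sub(&pgcount, 1)
#define PGCOUNT()	atomic_load(&pgcount)
#else
#define PGCOUNT_INC()	do { } while (0)
#define PGCOUNT_DEC()	do { } while (0)
#define PGCOUNT()	(0)
#endif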
32696 +struct tfm_stream {
32697 + __u8 *data;
32698 + size_t size;
32699 +};
32700 +
32701 +typedef enum {
32702 + INPUT_STREAM,
32703 + OUTPUT_STREAM,
32704 + LAST_STREAM
32705 +} tfm_stream_id;
32706 +
32707 +typedef struct tfm_stream * tfm_unit[LAST_STREAM];
32708 +
32709 +static inline __u8 *ts_data(struct tfm_stream * stm)
32710 +{
32711 + assert("edward-928", stm != NULL);
32712 + return stm->data;
32713 +}
32714 +
32715 +static inline size_t ts_size(struct tfm_stream * stm)
32716 +{
32717 + assert("edward-929", stm != NULL);
32718 + return stm->size;
32719 +}
32720 +
32721 +static inline void set_ts_size(struct tfm_stream * stm, size_t size)
32722 +{
32723 + assert("edward-930", stm != NULL);
32724 +
32725 + stm->size = size;
32726 +}
32727 +
32728 +static inline int alloc_ts(struct tfm_stream ** stm)
32729 +{
32730 + assert("edward-931", stm);
32731 + assert("edward-932", *stm == NULL);
32732 +
32733 + *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get());
32734 + if (!*stm)
32735 + return -ENOMEM;
32736 + return 0;
32737 +}
32738 +
32739 +static inline void free_ts(struct tfm_stream * stm)
32740 +{
32741 + assert("edward-933", !ts_data(stm));
32742 + assert("edward-934", !ts_size(stm));
32743 +
32744 + kfree(stm);
32745 +}
32746 +
32747 +static inline int alloc_ts_data(struct tfm_stream * stm, size_t size)
32748 +{
32749 + assert("edward-935", !ts_data(stm));
32750 + assert("edward-936", !ts_size(stm));
32751 + assert("edward-937", size != 0);
32752 +
32753 + stm->data = reiser4_vmalloc(size);
32754 + if (!stm->data)
32755 + return -ENOMEM;
32756 + set_ts_size(stm, size);
32757 + return 0;
32758 +}
32759 +
32760 +static inline void free_ts_data(struct tfm_stream * stm)
32761 +{
32762 + assert("edward-938", equi(ts_data(stm), ts_size(stm)));
32763 +
32764 + if (ts_data(stm))
32765 + vfree(ts_data(stm));
32766 + memset(stm, 0, sizeof *stm);
32767 +}
32768 +
32769 +/* Write modes for item conversion in flush convert phase */
32770 +typedef enum {
32771 + CRC_APPEND_ITEM = 1,
32772 + CRC_OVERWRITE_ITEM = 2,
32773 + CRC_CUT_ITEM = 3
32774 +} cryptcompress_write_mode_t;
32775 +
32776 +typedef enum {
32777 + LC_INVAL = 0, /* invalid value */
32778 + LC_APPOV = 1, /* append and/or overwrite */
32779 + LC_TRUNC = 2 /* truncate */
32780 +} logical_cluster_op;
32781 +
32782 +/* Transform cluster.
32783 + * An intermediate state between page cluster and disk cluster.
32784 + * It is used for data transforms (compression/encryption).
32785 + */
32786 +struct tfm_cluster {
32787 + coa_set coa; /* compression algorithms info */
32788 + tfm_unit tun; /* plain and transformed streams */
32789 + tfm_action act;
32790 + int uptodate;
32791 + int lsize; /* number of bytes in logical cluster */
32792 + int len; /* length of the transform stream */
32793 +};
32794 +
32795 +static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32796 + tfm_action act)
32797 +{
32798 + return tc->coa[id][act];
32799 +}
32800 +
32801 +static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32802 + tfm_action act, coa_t coa)
32803 +{
32804 + tc->coa[id][act] = coa;
32805 +}
32806 +
32807 +static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32808 +{
32809 + coa_t coa;
32810 +
32811 + coa = cplug->alloc(tc->act);
32812 + if (IS_ERR(coa))
32813 + return PTR_ERR(coa);
32814 + set_coa(tc, cplug->h.id, tc->act, coa);
32815 + return 0;
32816 +}
32817 +
32818 +static inline int
32819 +grab_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32820 +{
32821 + return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
32822 + alloc_coa(tc, cplug) : 0);
32823 +}
32824 +
32825 +static inline void free_coa_set(struct tfm_cluster * tc)
32826 +{
32827 + tfm_action j;
32828 + reiser4_compression_id i;
32829 + compression_plugin *cplug;
32830 +
32831 + assert("edward-810", tc != NULL);
32832 +
32833 + for (j = 0; j < TFMA_LAST; j++)
32834 + for (i = 0; i < LAST_COMPRESSION_ID; i++) {
32835 + if (!get_coa(tc, i, j))
32836 + continue;
32837 + cplug = compression_plugin_by_id(i);
32838 + assert("edward-812", cplug->free != NULL);
32839 + cplug->free(get_coa(tc, i, j), j);
32840 + set_coa(tc, i, j, 0);
32841 + }
32842 + return;
32843 +}
32844 +
32845 +static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc,
32846 + tfm_stream_id id)
32847 +{
32848 + return tc->tun[id];
32849 +}
32850 +
32851 +static inline void set_tfm_stream(struct tfm_cluster * tc,
32852 + tfm_stream_id id, struct tfm_stream * ts)
32853 +{
32854 + tc->tun[id] = ts;
32855 +}
32856 +
32857 +static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id)
32858 +{
32859 + return ts_data(get_tfm_stream(tc, id));
32860 +}
32861 +
32862 +static inline void set_tfm_stream_data(struct tfm_cluster * tc,
32863 + tfm_stream_id id, __u8 * data)
32864 +{
32865 + get_tfm_stream(tc, id)->data = data;
32866 +}
32867 +
32868 +static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id)
32869 +{
32870 + return ts_size(get_tfm_stream(tc, id));
32871 +}
32872 +
32873 +static inline void
32874 +set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size)
32875 +{
32876 + get_tfm_stream(tc, id)->size = size;
32877 +}
32878 +
32879 +static inline int
32880 +alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32881 +{
32882 + assert("edward-939", tc != NULL);
32883 + assert("edward-940", !get_tfm_stream(tc, id));
32884 +
32885 + tc->tun[id] = kzalloc(sizeof(struct tfm_stream),
32886 + reiser4_ctx_gfp_mask_get());
32887 + if (!tc->tun[id])
32888 + return -ENOMEM;
32889 + return alloc_ts_data(get_tfm_stream(tc, id), size);
32890 +}
32891 +
32892 +static inline int
32893 +realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32894 +{
32895 + assert("edward-941", tfm_stream_size(tc, id) < size);
32896 + free_ts_data(get_tfm_stream(tc, id));
32897 + return alloc_ts_data(get_tfm_stream(tc, id), size);
32898 +}
32899 +
32900 +static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id)
32901 +{
32902 + free_ts_data(get_tfm_stream(tc, id));
32903 + free_ts(get_tfm_stream(tc, id));
32904 + set_tfm_stream(tc, id, 0);
32905 +}
32906 +
32907 +static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
32908 +{
32909 + return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
32910 +}
32911 +
32912 +static inline void free_tfm_unit(struct tfm_cluster * tc)
32913 +{
32914 + tfm_stream_id id;
32915 + for (id = 0; id < LAST_STREAM; id++) {
32916 + if (!get_tfm_stream(tc, id))
32917 + continue;
32918 + free_tfm_stream(tc, id);
32919 + }
32920 +}
32921 +
32922 +static inline void put_tfm_cluster(struct tfm_cluster * tc)
32923 +{
32924 + assert("edward-942", tc != NULL);
32925 + free_coa_set(tc);
32926 + free_tfm_unit(tc);
32927 +}
32928 +
32929 +static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc)
32930 +{
32931 + assert("edward-943", tc != NULL);
32932 + assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
32933 + return (tc->uptodate == 1);
32934 +}
32935 +
32936 +static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc)
32937 +{
32938 + assert("edward-945", tc != NULL);
32939 + assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
32940 + tc->uptodate = 1;
32941 + return;
32942 +}
32943 +
32944 +static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc)
32945 +{
32946 + assert("edward-947", tc != NULL);
32947 + assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
32948 + tc->uptodate = 0;
32949 + return;
32950 +}
32951 +
32952 +static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id)
32953 +{
32954 + return (get_tfm_stream(tc, id) &&
32955 + tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
32956 +}
32957 +
32958 +static inline int tfm_cluster_is_set(struct tfm_cluster * tc)
32959 +{
32960 + int i;
32961 + for (i = 0; i < LAST_STREAM; i++)
32962 + if (!tfm_stream_is_set(tc, i))
32963 + return 0;
32964 + return 1;
32965 +}
32966 +
32967 +static inline void alternate_streams(struct tfm_cluster * tc)
32968 +{
32969 + struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM);
32970 +
32971 + set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM));
32972 + set_tfm_stream(tc, OUTPUT_STREAM, tmp);
32973 +}
32974 +
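/* Editorial sketch -- not part of the patch. alternate_streams() above is
 * classic double buffering: after one transform fills OUTPUT_STREAM, the
 * two streams are swapped so the next transform (e.g. a cipher applied
 * after compression) reads the previous result as its input. The same
 * idea, self-contained:
 */
#include <stddef.h>

struct stream { unsigned char *data; size_t size; };

static void swap_streams(struct stream **in, struct stream **out)
{
	struct stream *tmp = *in;

	*in = *out;
	*out = tmp;
}

/* usage: compress(in, out); swap_streams(&in, &out); encrypt(in, out); */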
32975 +/* Set of states to indicate the kind of data
32976 + * that will be written to the window */
32977 +typedef enum {
32978 + DATA_WINDOW, /* user's data */
32979 +	HOLE_WINDOW  /* zeroes (this kind of data is written
32980 +		      * if we start to write from offset > i_size) */
32981 +} window_stat;
32982 +
32983 +/* Window (of logical cluster size) discretely sliding along a file.
32984 + * It is used to locate the hole region in a logical cluster to be properly
32985 + * represented on disk.
32986 + * We split a write to cryptcompress file into writes to its logical
32987 + * clusters. Before writing to a logical cluster we set a window, i.e.
32988 + * calculate values of the following fields:
32989 + */
32990 +struct reiser4_slide {
32991 + unsigned off; /* offset to write from */
32992 + unsigned count; /* number of bytes to write */
32993 + unsigned delta; /* number of bytes to append to the hole */
32994 + window_stat stat; /* what kind of data will be written starting
32995 + from @off */
32996 +};
32997 +
32998 +/* Possible states of a disk cluster */
32999 +typedef enum {
33000 + INVAL_DISK_CLUSTER, /* unknown state */
33001 + PREP_DISK_CLUSTER, /* disk cluster got converted by flush
33002 + * at least 1 time */
33003 + UNPR_DISK_CLUSTER, /* disk cluster just created and should be
33004 + * converted by flush */
33005 +	FAKE_DISK_CLUSTER,  /* disk cluster exists neither in memory
33006 + * nor on disk */
33007 + TRNC_DISK_CLUSTER /* disk cluster is partially truncated */
33008 +} disk_cluster_stat;
33009 +
33010 +/* The following structure represents various stages of the same logical
33011 + * cluster of index @index:
33012 + * . fixed slide
33013 + * . page cluster (stage in primary cache)
33014 + * . transform cluster (transition stage)
33015 + * . disk cluster (stage in secondary cache)
33016 + * This structure is used in transition and synchronizing operations, e.g.
33017 + * transform cluster is a transition state when synchronizing page cluster
33018 + * and disk cluster.
33019 + * FIXME: Encapsulate page cluster, disk cluster.
33020 + */
33021 +struct cluster_handle {
33022 + cloff_t index; /* offset in a file (unit is a cluster size) */
33023 + int index_valid; /* for validating the index above, if needed */
33024 + struct file *file; /* host file */
33025 +
33026 + /* logical cluster */
33027 + struct reiser4_slide *win; /* sliding window to locate holes */
33028 + logical_cluster_op op; /* logical cluster operation (truncate or
33029 + append/overwrite) */
33030 + /* transform cluster */
33031 + struct tfm_cluster tc; /* contains all needed info to synchronize
33032 +				    page cluster and disk cluster */
33033 + /* page cluster */
33034 + int nr_pages; /* number of pages of current checkin action */
33035 + int old_nrpages; /* number of pages of last checkin action */
33036 + struct page **pages; /* attached pages */
33037 + jnode * node; /* jnode for capture */
33038 +
33039 + /* disk cluster */
33040 + hint_t *hint; /* current position in the tree */
33041 + disk_cluster_stat dstat; /* state of the current disk cluster */
33042 + int reserved; /* is space for disk cluster reserved */
33043 +#if REISER4_DEBUG
33044 + reiser4_context *ctx;
33045 + int reserved_prepped;
33046 + int reserved_unprepped;
33047 +#endif
33048 +
33049 +};
33050 +
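/* Editorial note -- not part of the patch. The typical lifecycle of a
 * cluster_handle, as seen in write_end_cryptcompress() earlier in this
 * patch, is:
 *
 *	cluster_init_read(&clust, NULL);
 *	clust.hint = hint;			// attach tree position
 *	alloc_cluster_pgset(&clust, cluster_nrpages(inode));
 *	clust.index = pg_to_clust(page->index, inode);
 *	... operate on the page/disk cluster ...
 *	put_cluster_handle(&clust);		// frees pgset and tfm streams
 */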
33051 +static inline __u8 * tfm_input_data (struct cluster_handle * clust)
33052 +{
33053 + return tfm_stream_data(&clust->tc, INPUT_STREAM);
33054 +}
33055 +
33056 +static inline __u8 * tfm_output_data (struct cluster_handle * clust)
33057 +{
33058 + return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
33059 +}
33060 +
33061 +static inline int reset_cluster_pgset(struct cluster_handle * clust,
33062 + int nrpages)
33063 +{
33064 + assert("edward-1057", clust->pages != NULL);
33065 + memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
33066 + return 0;
33067 +}
33068 +
33069 +static inline int alloc_cluster_pgset(struct cluster_handle * clust,
33070 + int nrpages)
33071 +{
33072 + assert("edward-949", clust != NULL);
33073 + assert("edward-1362", clust->pages == NULL);
33074 + assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
33075 +
33076 + clust->pages = kzalloc(sizeof(*clust->pages) * nrpages,
33077 + reiser4_ctx_gfp_mask_get());
33078 + if (!clust->pages)
33079 + return RETERR(-ENOMEM);
33080 + return 0;
33081 +}
33082 +
33083 +static inline void move_cluster_pgset(struct cluster_handle *clust,
33084 + struct page ***pages, int * nr_pages)
33085 +{
33086 + assert("edward-1545", clust != NULL && clust->pages != NULL);
33087 + assert("edward-1546", pages != NULL && *pages == NULL);
33088 + *pages = clust->pages;
33089 + *nr_pages = clust->nr_pages;
33090 + clust->pages = NULL;
33091 +}
33092 +
33093 +static inline void free_cluster_pgset(struct cluster_handle * clust)
33094 +{
33095 + assert("edward-951", clust->pages != NULL);
33096 + kfree(clust->pages);
33097 + clust->pages = NULL;
33098 +}
33099 +
33100 +static inline void put_cluster_handle(struct cluster_handle * clust)
33101 +{
33102 + assert("edward-435", clust != NULL);
33103 +
33104 + put_tfm_cluster(&clust->tc);
33105 + if (clust->pages)
33106 + free_cluster_pgset(clust);
33107 + memset(clust, 0, sizeof *clust);
33108 +}
33109 +
33110 +static inline void inc_keyload_count(struct reiser4_crypto_info * data)
33111 +{
33112 + assert("edward-1410", data != NULL);
33113 + data->keyload_count++;
33114 +}
33115 +
33116 +static inline void dec_keyload_count(struct reiser4_crypto_info * data)
33117 +{
33118 + assert("edward-1411", data != NULL);
33119 + assert("edward-1412", data->keyload_count > 0);
33120 + data->keyload_count--;
33121 +}
33122 +
33123 +static inline int capture_cluster_jnode(jnode * node)
33124 +{
33125 + return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
33126 +}
33127 +
33128 +/* cryptcompress specific part of reiser4_inode */
33129 +struct cryptcompress_info {
33130 + struct mutex checkin_mutex; /* This is to serialize
33131 + * checkin_logical_cluster operations */
33132 + cloff_t trunc_index; /* Index of the leftmost truncated disk
33133 + * cluster (to resolve races with read) */
33134 + struct reiser4_crypto_info *crypt;
33135 + /*
33136 + * the following 2 fields are controlled by compression mode plugin
33137 + */
33138 + int compress_toggle; /* Current status of compressibility */
33139 + int lattice_factor; /* Factor of dynamic lattice. FIXME: Have
33140 + * a compression_toggle to keep the factor
33141 + */
33142 +#if REISER4_DEBUG
33143 + atomic_t pgcount; /* number of grabbed pages */
33144 +#endif
33145 +};
33146 +
33147 +static inline void set_compression_toggle (struct cryptcompress_info * info, int val)
33148 +{
33149 + info->compress_toggle = val;
33150 +}
33151 +
33152 +static inline int get_compression_toggle (struct cryptcompress_info * info)
33153 +{
33154 + return info->compress_toggle;
33155 +}
33156 +
33157 +static inline int compression_is_on(struct cryptcompress_info * info)
33158 +{
33159 + return get_compression_toggle(info) == 1;
33160 +}
33161 +
33162 +static inline void turn_on_compression(struct cryptcompress_info * info)
33163 +{
33164 + set_compression_toggle(info, 1);
33165 +}
33166 +
33167 +static inline void turn_off_compression(struct cryptcompress_info * info)
33168 +{
33169 + set_compression_toggle(info, 0);
33170 +}
33171 +
33172 +static inline void set_lattice_factor(struct cryptcompress_info * info, int val)
33173 +{
33174 + info->lattice_factor = val;
33175 +}
33176 +
33177 +static inline int get_lattice_factor(struct cryptcompress_info * info)
33178 +{
33179 + return info->lattice_factor;
33180 +}
33181 +
33182 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *);
33183 +int equal_to_rdk(znode *, const reiser4_key *);
33184 +int goto_right_neighbor(coord_t *, lock_handle *);
33185 +int cryptcompress_inode_ok(struct inode *inode);
33186 +int coord_is_unprepped_ctail(const coord_t * coord);
33187 +extern int do_readpage_ctail(struct inode *, struct cluster_handle *,
33188 + struct page * page, znode_lock_mode mode);
33189 +extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
33190 + struct inode * inode);
33191 +extern int readpages_cryptcompress(struct file*, struct address_space*,
33192 + struct list_head*, unsigned);
33193 +int bind_cryptcompress(struct inode *child, struct inode *parent);
33194 +void destroy_inode_cryptcompress(struct inode * inode);
33195 +int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
33196 + rw_op rw);
33197 +int write_dispatch_hook(struct file *file, struct inode * inode,
33198 + loff_t pos, struct cluster_handle * clust,
33199 + struct dispatch_context * cont);
33200 +int setattr_dispatch_hook(struct inode * inode);
33201 +struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
33202 +void inherit_crypto_info_common(struct inode * parent, struct inode * object,
33203 + int (*can_inherit)(struct inode * child,
33204 + struct inode * parent));
33205 +void reiser4_attach_crypto_info(struct inode * inode,
33206 + struct reiser4_crypto_info * info);
33207 +void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new);
33208 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode);
33209 +
33210 +static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info)
33211 +{
33212 + return info->cipher;
33213 +}
33214 +
33215 +static inline void info_set_cipher(struct reiser4_crypto_info * info,
33216 + struct crypto_blkcipher * tfm)
33217 +{
33218 + info->cipher = tfm;
33219 +}
33220 +
33221 +static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info)
33222 +{
33223 + return info->digest;
33224 +}
33225 +
33226 +static inline void info_set_digest(struct reiser4_crypto_info * info,
33227 + struct crypto_hash * tfm)
33228 +{
33229 + info->digest = tfm;
33230 +}
33231 +
33232 +static inline void put_cluster_page(struct page * page)
33233 +{
33234 + page_cache_release(page);
33235 +}
33236 +
33237 +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
33238 +
33239 +/* Make Linus happy.
33240 + Local variables:
33241 + c-indentation-style: "K&R"
33242 + mode-name: "LC"
33243 + c-basic-offset: 8
33244 + tab-width: 8
33245 + fill-column: 120
33246 + scroll-step: 1
33247 + End:
33248 +*/
33249 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/file.c linux-2.6.33/fs/reiser4/plugin/file/file.c
33250 --- linux-2.6.33.orig/fs/reiser4/plugin/file/file.c 1970-01-01 01:00:00.000000000 +0100
33251 +++ linux-2.6.33/fs/reiser4/plugin/file/file.c 2010-03-04 19:33:22.000000000 +0100
33252 @@ -0,0 +1,2688 @@
33253 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
33254 + * reiser4/README */
33255 +
33256 +/*
33257 + * this file contains implementations of inode/file/address_space/file plugin
33258 + * operations specific for "unix file plugin" (plugin id is
33259 + * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
33260 + * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
33261 + * no items but stat data)
33262 + */
33263 +
33264 +#include "../../inode.h"
33265 +#include "../../super.h"
33266 +#include "../../tree_walk.h"
33267 +#include "../../carry.h"
33268 +#include "../../page_cache.h"
33269 +#include "../../ioctl.h"
33270 +#include "../object.h"
33271 +#include "../cluster.h"
33272 +#include "../../safe_link.h"
33273 +
33274 +#include <linux/writeback.h>
33275 +#include <linux/pagevec.h>
33276 +#include <linux/syscalls.h>
33277 +
33278 +
33279 +static int unpack(struct file *file, struct inode *inode, int forever);
33280 +static void drop_access(struct unix_file_info *);
33281 +static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
33282 + znode_lock_mode lock_mode);
33283 +
33284 +/* Get exclusive access and make sure that the file is not partially
33285 + * converted (it may happen that another process is doing tail
33286 + * conversion; if so, wait until it completes)
33287 + */
33288 +static inline void get_exclusive_access_careful(struct unix_file_info * uf_info,
33289 + struct inode *inode)
33290 +{
33291 + do {
33292 + get_exclusive_access(uf_info);
33293 + if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
33294 + break;
33295 + drop_exclusive_access(uf_info);
33296 + schedule();
33297 + } while (1);
33298 +}
33299 +
33300 +/* get unix file plugin specific portion of inode */
33301 +struct unix_file_info *unix_file_inode_data(const struct inode *inode)
33302 +{
33303 + return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
33304 +}
33305 +
33306 +/**
33307 + * equal_to_rdk - compare key and znode's right delimiting key
33308 + * @node: node whose right delimiting key to compare with @key
33309 + * @key: key to compare with @node's right delimiting key
33310 + *
33311 + * Returns true if @key is equal to right delimiting key of @node.
33312 + */
33313 +int equal_to_rdk(znode *node, const reiser4_key *key)
33314 +{
33315 + int result;
33316 +
33317 + read_lock_dk(znode_get_tree(node));
33318 + result = keyeq(key, znode_get_rd_key(node));
33319 + read_unlock_dk(znode_get_tree(node));
33320 + return result;
33321 +}
33322 +
33323 +#if REISER4_DEBUG
33324 +
33325 +/**
33326 + * equal_to_ldk - compare key and znode's left delimiting key
33327 + * @node: node whose left delimiting key to compare with @key
33328 + * @key: key to compare with @node's left delimiting key
33329 + *
33330 + * Returns true if @key is equal to left delimiting key of @node.
33331 + */
33332 +int equal_to_ldk(znode *node, const reiser4_key *key)
33333 +{
33334 + int result;
33335 +
33336 + read_lock_dk(znode_get_tree(node));
33337 + result = keyeq(key, znode_get_ld_key(node));
33338 + read_unlock_dk(znode_get_tree(node));
33339 + return result;
33340 +}
33341 +
33342 +/**
33343 + * check_coord - check whether coord corresponds to key
33344 + * @coord: coord to check
33345 + * @key: key @coord has to correspond to
33346 + *
33347 + * Returns true if @coord is set as if it was set as result of lookup with @key
33348 + * in coord->node.
33349 + */
33350 +static int check_coord(const coord_t *coord, const reiser4_key *key)
33351 +{
33352 + coord_t twin;
33353 +
33354 + node_plugin_by_node(coord->node)->lookup(coord->node, key,
33355 + FIND_MAX_NOT_MORE_THAN, &twin);
33356 + return coords_equal(coord, &twin);
33357 +}
33358 +
33359 +#endif /* REISER4_DEBUG */
33360 +
33361 +/**
33362 + * init_uf_coord - initialize extended coord
33363 + * @uf_coord: extended coord to initialize
33364 + * @lh: lock handle to attach to the coord
33365 + *
33366 + * Zeroes the coord and its extension, and attaches @lh.
33367 + */
33368 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
33369 +{
33370 + coord_init_zero(&uf_coord->coord);
33371 + coord_clear_iplug(&uf_coord->coord);
33372 + uf_coord->lh = lh;
33373 + init_lh(lh);
33374 + memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
33375 + uf_coord->valid = 0;
33376 +}
33377 +
33378 +static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
33379 +{
33380 + assert("vs-1333", uf_coord->valid == 0);
33381 +
33382 + if (coord_is_between_items(&uf_coord->coord))
33383 + return;
33384 +
33385 + assert("vs-1348",
33386 + item_plugin_by_coord(&uf_coord->coord)->s.file.
33387 + init_coord_extension);
33388 +
33389 + item_body_by_coord(&uf_coord->coord);
33390 + item_plugin_by_coord(&uf_coord->coord)->s.file.
33391 + init_coord_extension(uf_coord, offset);
33392 +}
33393 +
33394 +/**
33395 + * goto_right_neighbor - lock right neighbor, drop current node lock
33396 + * @coord:
33397 + * @lh:
33398 + *
33399 + * Obtain lock on right neighbor and drop lock on current node.
33400 + */
33401 +int goto_right_neighbor(coord_t *coord, lock_handle *lh)
33402 +{
33403 + int result;
33404 + lock_handle lh_right;
33405 +
33406 + assert("vs-1100", znode_is_locked(coord->node));
33407 +
33408 + init_lh(&lh_right);
33409 + result = reiser4_get_right_neighbor(&lh_right, coord->node,
33410 + znode_is_wlocked(coord->node) ?
33411 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
33412 + GN_CAN_USE_UPPER_LEVELS);
33413 + if (result) {
33414 + done_lh(&lh_right);
33415 + return result;
33416 + }
33417 +
33418 + /*
33419 + * we hold two longterm locks on neighboring nodes. Unlock left of
33420 + * them
33421 + */
33422 + done_lh(lh);
33423 +
33424 + coord_init_first_unit_nocheck(coord, lh_right.node);
33425 + move_lh(lh, &lh_right);
33426 +
33427 + return 0;
33428 +
33429 +}
33430 +
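/* Editorial sketch -- not part of the patch. goto_right_neighbor() above
 * uses lock coupling ("hand-over-hand" locking): the right neighbor is
 * locked before the current node's lock is dropped, so the two nodes can
 * never be observed in an inconsistent intermediate state. A minimal
 * pthread model over a singly linked list:
 */
#include <pthread.h>
#include <stddef.h>

struct lnode {
	pthread_mutex_t lock;
	struct lnode *next;
};

/* caller holds cur->lock; returns with the next node locked, cur unlocked */
static struct lnode *step_right(struct lnode *cur)
{
	struct lnode *next = cur->next;

	if (next)
		pthread_mutex_lock(&next->lock);	/* take right lock first */
	pthread_mutex_unlock(&cur->lock);		/* only then drop current */
	return next;
}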
33431 +/**
33432 + * set_file_state
33433 + * @uf_info: unix file plugin specific part of inode
33434 + * @cbk_result: result of the tree lookup
33435 + * @level: tree level the lookup stopped at
33436 + *
33437 + * This is used by find_file_item and find_file_state to
33438 + * determine the real state of the file
33439 + */
33440 +static void set_file_state(struct unix_file_info *uf_info, int cbk_result,
33441 + tree_level level)
33442 +{
33443 + if (cbk_errored(cbk_result))
33444 + /* error happened in find_file_item */
33445 + return;
33446 +
33447 + assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
33448 +
33449 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33450 + if (cbk_result == CBK_COORD_NOTFOUND)
33451 + uf_info->container = UF_CONTAINER_EMPTY;
33452 + else if (level == LEAF_LEVEL)
33453 + uf_info->container = UF_CONTAINER_TAILS;
33454 + else
33455 + uf_info->container = UF_CONTAINER_EXTENTS;
33456 + } else {
33457 + /*
33458 + * file state is known, check whether it is set correctly if
33459 + * file is not being tail converted
33460 + */
33461 + if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
33462 + REISER4_PART_IN_CONV)) {
33463 + assert("vs-1162",
33464 + ergo(level == LEAF_LEVEL &&
33465 + cbk_result == CBK_COORD_FOUND,
33466 + uf_info->container == UF_CONTAINER_TAILS));
33467 + assert("vs-1165",
33468 + ergo(level == TWIG_LEVEL &&
33469 + cbk_result == CBK_COORD_FOUND,
33470 + uf_info->container == UF_CONTAINER_EXTENTS));
33471 + }
33472 + }
33473 +}
33474 +
33475 +int find_file_item_nohint(coord_t *coord, lock_handle *lh,
33476 + const reiser4_key *key, znode_lock_mode lock_mode,
33477 + struct inode *inode)
33478 +{
33479 + return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
33480 + FIND_MAX_NOT_MORE_THAN,
33481 + TWIG_LEVEL, LEAF_LEVEL,
33482 + (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
33483 + (CBK_UNIQUE | CBK_FOR_INSERT),
33484 + NULL /* ra_info */ );
33485 +}
33486 +
33487 +/**
33488 + * find_file_item - look for file item in the tree
33489 + * @hint: provides coordinate, lock handle, seal
33490 + * @key: key for search
33491 + * @lock_mode: mode of lock to put on returned node
33492 + * @ra_info:
33493 + * @inode:
33494 + *
33495 + * This finds position in the tree corresponding to @key. It first tries to use
33496 + * @hint's seal if it is set.
33497 + */
33498 +int find_file_item(hint_t *hint, const reiser4_key *key,
33499 + znode_lock_mode lock_mode,
33500 + struct inode *inode)
33501 +{
33502 + int result;
33503 + coord_t *coord;
33504 + lock_handle *lh;
33505 +
33506 + assert("nikita-3030", reiser4_schedulable());
33507 + assert("vs-1707", hint != NULL);
33508 + assert("vs-47", inode != NULL);
33509 +
33510 + coord = &hint->ext_coord.coord;
33511 + lh = hint->ext_coord.lh;
33512 + init_lh(lh);
33513 +
33514 + result = hint_validate(hint, key, 1 /* check key */, lock_mode);
33515 + if (!result) {
33516 + if (coord->between == AFTER_UNIT &&
33517 + equal_to_rdk(coord->node, key)) {
33518 + result = goto_right_neighbor(coord, lh);
33519 + if (result == -E_NO_NEIGHBOR)
33520 + return RETERR(-EIO);
33521 + if (result)
33522 + return result;
33523 + assert("vs-1152", equal_to_ldk(coord->node, key));
33524 + /*
33525 +			 * we moved to a different node. Invalidate the coord
33526 +			 * extension; zload is necessary to init it again
33527 + */
33528 + hint->ext_coord.valid = 0;
33529 + }
33530 +
33531 + set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
33532 + znode_get_level(coord->node));
33533 +
33534 + return CBK_COORD_FOUND;
33535 + }
33536 +
33537 + coord_init_zero(coord);
33538 + result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
33539 + set_file_state(unix_file_inode_data(inode), result,
33540 + znode_get_level(coord->node));
33541 +
33542 + /* FIXME: we might already have coord extension initialized */
33543 + hint->ext_coord.valid = 0;
33544 + return result;
33545 +}
33546 +
32547 +/* plugin->u.file.write_flow = NULL
32548 +   plugin->u.file.read_flow = NULL */
33549 +
33550 +void hint_init_zero(hint_t * hint)
33551 +{
33552 + memset(hint, 0, sizeof(*hint));
33553 + init_lh(&hint->lh);
33554 + hint->ext_coord.lh = &hint->lh;
33555 +}
33556 +
33557 +static int find_file_state(struct inode *inode, struct unix_file_info *uf_info)
33558 +{
33559 + int result;
33560 + reiser4_key key;
33561 + coord_t coord;
33562 + lock_handle lh;
33563 +
33564 + assert("vs-1628", ea_obtained(uf_info));
33565 +
33566 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33567 + key_by_inode_and_offset_common(inode, 0, &key);
33568 + init_lh(&lh);
33569 + result = find_file_item_nohint(&coord, &lh, &key,
33570 + ZNODE_READ_LOCK, inode);
33571 + set_file_state(uf_info, result, znode_get_level(coord.node));
33572 + done_lh(&lh);
33573 + if (!cbk_errored(result))
33574 + result = 0;
33575 + } else
33576 + result = 0;
33577 + assert("vs-1074",
33578 + ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
33579 + reiser4_txn_restart_current();
33580 + return result;
33581 +}
33582 +
33583 +/**
33584 + * Estimate and reserve space needed to truncate page
33585 + * which gets partially truncated: one block for page
33586 + * itself, stat-data update (estimate_one_insert_into_item)
33587 + * and one item insertion (estimate_one_insert_into_item)
33588 + * which may happen if page corresponds to hole extent and
33589 + * unallocated one will have to be created
33590 + */
33591 +static int reserve_partial_page(reiser4_tree * tree)
33592 +{
33593 + grab_space_enable();
33594 + return reiser4_grab_reserved(reiser4_get_current_sb(),
33595 + 1 +
33596 + 2 * estimate_one_insert_into_item(tree),
33597 + BA_CAN_COMMIT);
33598 +}
33599 +
33600 +/* estimate and reserve space needed to cut one item and update one stat data */
33601 +static int reserve_cut_iteration(reiser4_tree * tree)
33602 +{
33603 + __u64 estimate = estimate_one_item_removal(tree)
33604 + + estimate_one_insert_into_item(tree);
33605 +
33606 + assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
33607 +
33608 + grab_space_enable();
33609 + /* We need to double our estimate now that we can delete more than one
33610 + node. */
33611 + return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
33612 + BA_CAN_COMMIT);
33613 +}
33614 +
33615 +int reiser4_update_file_size(struct inode *inode, loff_t new_size,
33616 + int update_sd)
33617 +{
33618 + int result = 0;
33619 +
33620 + INODE_SET_SIZE(inode, new_size);
33621 + if (update_sd) {
33622 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
33623 + result = reiser4_update_sd(inode);
33624 + }
33625 + return result;
33626 +}
33627 +
33628 +/**
33629 + * Cut file items one by one starting from the last one until
33630 + * new file size (inode->i_size) is reached. Reserve space
33631 + * and update file stat data on every single cut from the tree
33632 + */
33633 +int cut_file_items(struct inode *inode, loff_t new_size,
33634 + int update_sd, loff_t cur_size,
33635 + int (*update_actor) (struct inode *, loff_t, int))
33636 +{
33637 + reiser4_key from_key, to_key;
33638 + reiser4_key smallest_removed;
33639 + file_plugin *fplug = inode_file_plugin(inode);
33640 + int result;
33641 + int progress = 0;
33642 +
33643 + assert("vs-1248",
33644 + fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
33645 + fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
33646 +
33647 + fplug->key_by_inode(inode, new_size, &from_key);
33648 + to_key = from_key;
33649 + set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
33650 + /* this loop normally runs just once */
33651 + while (1) {
33652 + result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
33653 + if (result)
33654 + break;
33655 +
33656 + result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
33657 + &smallest_removed, inode, 1,
33658 + &progress);
33659 + if (result == -E_REPEAT) {
33660 + /**
33661 + * -E_REPEAT is a signal to interrupt a long
33662 + * file truncation process
33663 + */
33664 + if (progress) {
33665 + result = update_actor(inode,
33666 + get_key_offset(&smallest_removed),
33667 + update_sd);
33668 + if (result)
33669 + break;
33670 + }
33671 + /* the below does up(sbinfo->delete_mutex).
32672 +			 * Do not get fooled */
33673 + reiser4_release_reserved(inode->i_sb);
33674 + /**
33675 +			 * reiser4_cut_tree_object() was interrupted, probably
33676 +			 * because the current atom requires commit; we have to
33677 +			 * release the transaction handle to allow the atom to commit.
33678 + */
33679 + reiser4_txn_restart_current();
33680 + continue;
33681 + }
33682 + if (result
33683 + && !(result == CBK_COORD_NOTFOUND && new_size == 0
33684 + && inode->i_size == 0))
33685 + break;
33686 +
33687 + set_key_offset(&smallest_removed, new_size);
33688 + /* Final sd update after the file gets its correct size */
33689 + result = update_actor(inode, get_key_offset(&smallest_removed),
33690 + update_sd);
33691 + break;
33692 + }
33693 +
32694 +	/* the below does up(sbinfo->delete_mutex). Do not get fooled */
33695 + reiser4_release_reserved(inode->i_sb);
33696 +
33697 + return result;
33698 +}
33699 +
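/* Editorial sketch -- not part of the patch. cut_file_items() above treats
 * -E_REPEAT as "checkpoint and retry": it records the progress made so far
 * (smallest removed key), releases its reservation, restarts the
 * transaction and loops. The generic shape of such a restartable loop,
 * self-contained and with hypothetical names:
 */
#include <stdio.h>

/* delete up to 3 units per call; returns 1 if more work remains */
static int delete_chunk(long *progress, long target)
{
	long n;

	for (n = 0; *progress < target && n < 3; n++)
		(*progress)++;
	return *progress < target;
}

int main(void)
{
	long progress = 0, target = 10;

	while (delete_chunk(&progress, target))
		printf("checkpoint at %ld, restart transaction\n", progress);
	printf("done at %ld\n", progress);
	return 0;
}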
33700 +int find_or_create_extent(struct page *page);
33701 +
33702 +/* part of truncate_file_body: it is called when truncate is used to make file
33703 + shorter */
33704 +static int shorten_file(struct inode *inode, loff_t new_size)
33705 +{
33706 + int result;
33707 + struct page *page;
33708 + int padd_from;
33709 + unsigned long index;
33710 + struct unix_file_info *uf_info;
33711 +
33712 + /*
33713 +	 * all items of an ordinary reiser4 file are grouped together. That is
33714 +	 * why we can use reiser4_cut_tree. Plan B files (for instance) cannot
33715 +	 * be truncated that simply
33716 + */
33717 + result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
33718 + get_key_offset(reiser4_max_key()),
33719 + reiser4_update_file_size);
33720 + if (result)
33721 + return result;
33722 +
33723 + uf_info = unix_file_inode_data(inode);
33724 + assert("vs-1105", new_size == inode->i_size);
33725 + if (new_size == 0) {
33726 + uf_info->container = UF_CONTAINER_EMPTY;
33727 + return 0;
33728 + }
33729 +
33730 + result = find_file_state(inode, uf_info);
33731 + if (result)
33732 + return result;
33733 + if (uf_info->container == UF_CONTAINER_TAILS)
33734 + /*
33735 + * No need to worry about zeroing last page after new file
33736 + * end
33737 + */
33738 + return 0;
33739 +
33740 + padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
33741 + if (!padd_from)
33742 + /* file is truncated to page boundary */
33743 + return 0;
33744 +
33745 + result = reserve_partial_page(reiser4_tree_by_inode(inode));
33746 + if (result) {
33747 + reiser4_release_reserved(inode->i_sb);
33748 + return result;
33749 + }
33750 +
33751 + /* last page is partially truncated - zero its content */
33752 + index = (inode->i_size >> PAGE_CACHE_SHIFT);
33753 + page = read_mapping_page(inode->i_mapping, index, NULL);
33754 + if (IS_ERR(page)) {
33755 + /*
33756 + * the below does up(sbinfo->delete_mutex). Do not get
33757 + * confused
33758 + */
33759 + reiser4_release_reserved(inode->i_sb);
33760 + if (likely(PTR_ERR(page) == -EINVAL)) {
33761 + /* looks like file is built of tail items */
33762 + return 0;
33763 + }
33764 + return PTR_ERR(page);
33765 + }
33766 + wait_on_page_locked(page);
33767 + if (!PageUptodate(page)) {
33768 + page_cache_release(page);
33769 + /*
33770 + * the below does up(sbinfo->delete_mutex). Do not get
33771 + * confused
33772 + */
33773 + reiser4_release_reserved(inode->i_sb);
33774 + return RETERR(-EIO);
33775 + }
33776 +
33777 + /*
33778 +	 * if the page corresponds to a hole extent unit - an unallocated one
33779 +	 * will be created here. This is not necessary
33780 + */
33781 + result = find_or_create_extent(page);
33782 +
33783 + /*
33784 + * FIXME: cut_file_items has already updated inode. Probably it would
33785 + * be better to update it here when file is really truncated
33786 + */
33787 + if (result) {
33788 + page_cache_release(page);
33789 + /*
33790 + * the below does up(sbinfo->delete_mutex). Do not get
33791 + * confused
33792 + */
33793 + reiser4_release_reserved(inode->i_sb);
33794 + return result;
33795 + }
33796 +
33797 + lock_page(page);
33798 + assert("vs-1066", PageLocked(page));
33799 + zero_user_segment(page, padd_from, PAGE_CACHE_SIZE);
33800 + unlock_page(page);
33801 + page_cache_release(page);
33802 + /* the below does up(sbinfo->delete_mutex). Do not get confused */
33803 + reiser4_release_reserved(inode->i_sb);
33804 + return 0;
33805 +}
33806 +
33807 +/**
33808 + * should_have_notail
33809 + * @uf_info:
33810 + * @new_size:
33811 + *
33812 + * Calls the formatting plugin to see whether a file of size @new_size has to be
33813 + * stored in unformatted nodes or in tail items. 0 is returned in the latter case.
33814 + */
33815 +static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size)
33816 +{
33817 + if (!uf_info->tplug)
33818 + return 1;
33819 + return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
33820 + new_size);
33821 +
33822 +}
33823 +
33824 +/**
33825 + * truncate_file_body - change length of file
33826 + * @inode: inode of file
33827 + * @new_size: new file length
33828 + *
33829 + * Adjusts the items that file @inode is built of to match @new_size. It may either cut
33830 + * items or add them to represent a hole at the end of file. The caller has to
33831 + * obtain exclusive access to the file.
33832 + */
33833 +static int truncate_file_body(struct inode *inode, struct iattr *attr)
33834 +{
33835 + int result;
33836 + loff_t new_size = attr->ia_size;
33837 +
33838 + if (inode->i_size < new_size) {
33839 + /* expanding truncate */
33840 + struct unix_file_info *uf_info = unix_file_inode_data(inode);
33841 +
33842 + result = find_file_state(inode, uf_info);
33843 + if (result)
33844 + return result;
33845 +
33846 + if (should_have_notail(uf_info, new_size)) {
33847 + /*
33848 + * file of size @new_size has to be built of
33849 + * extents. If it is built of tails - convert to
33850 + * extents
33851 + */
33852 + if (uf_info->container == UF_CONTAINER_TAILS) {
33853 + /*
33854 +				 * if the file is being converted by another process
33855 + * - wait until it completes
33856 + */
33857 + while (1) {
33858 + if (reiser4_inode_get_flag(inode,
33859 + REISER4_PART_IN_CONV)) {
33860 + drop_exclusive_access(uf_info);
33861 + schedule();
33862 + get_exclusive_access(uf_info);
33863 + continue;
33864 + }
33865 + break;
33866 + }
33867 +
33868 + if (uf_info->container == UF_CONTAINER_TAILS) {
33869 + result = tail2extent(uf_info);
33870 + if (result)
33871 + return result;
33872 + }
33873 + }
33874 + result = reiser4_write_extent(NULL, inode, NULL,
33875 + 0, &new_size);
33876 + if (result)
33877 + return result;
33878 + uf_info->container = UF_CONTAINER_EXTENTS;
33879 + } else {
33880 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
33881 + result = reiser4_write_extent(NULL, inode, NULL,
33882 + 0, &new_size);
33883 + if (result)
33884 + return result;
33885 + } else {
33886 + result = reiser4_write_tail(NULL, inode, NULL,
33887 + 0, &new_size);
33888 + if (result)
33889 + return result;
33890 + uf_info->container = UF_CONTAINER_TAILS;
33891 + }
33892 + }
33893 + BUG_ON(result > 0);
33894 + result = reiser4_update_file_size(inode, new_size, 1);
33895 + BUG_ON(result != 0);
33896 + } else
33897 + result = shorten_file(inode, new_size);
33898 + return result;
33899 +}
33900 +
33901 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
33902 +
33903 +/**
33904 + * load_file_hint - copy hint from struct file to local variable
33905 + * @file: file to get hint from
33906 + * @hint: structure to fill
33907 + *
33908 + * Reiser4 specific portion of struct file may contain information (hint)
33909 + * stored on exit from a previous read or write. That information includes
33910 + * the seal of a znode and the coord within that znode where the previous
33911 + * read or write stopped. This function copies that information to @hint if it
33912 + * was stored, or initializes @hint with zeros otherwise.
33913 + */
33914 +int load_file_hint(struct file *file, hint_t *hint)
33915 +{
33916 + reiser4_file_fsdata *fsdata;
33917 +
33918 + if (file) {
33919 + fsdata = reiser4_get_file_fsdata(file);
33920 + if (IS_ERR(fsdata))
33921 + return PTR_ERR(fsdata);
33922 +
33923 + spin_lock_inode(file->f_dentry->d_inode);
33924 + if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
33925 + *hint = fsdata->reg.hint;
33926 + init_lh(&hint->lh);
33927 + hint->ext_coord.lh = &hint->lh;
33928 + spin_unlock_inode(file->f_dentry->d_inode);
33929 + /*
33930 + * force re-validation of the coord on the first
33931 + * iteration of the read/write loop.
33932 + */
33933 + hint->ext_coord.valid = 0;
33934 + assert("nikita-19892", coords_equal(&hint->seal.coord1,
33935 + &hint->ext_coord.
33936 + coord));
33937 + return 0;
33938 + }
33939 + memset(&fsdata->reg.hint, 0, sizeof(hint_t));
33940 + spin_unlock_inode(file->f_dentry->d_inode);
33941 + }
33942 + hint_init_zero(hint);
33943 + return 0;
33944 +}
33945 +
33946 +/**
33947 + * save_file_hint - copy hint to reiser4 private struct file's part
33948 + * @file: file to save hint in
33949 + * @hint: hint to save
33950 + *
33951 + * This copies @hint to reiser4 private part of struct file. It can help
33952 + * speedup future accesses to the file.
33953 + */
33954 +void save_file_hint(struct file *file, const hint_t *hint)
33955 +{
33956 + reiser4_file_fsdata *fsdata;
33957 +
33958 + assert("edward-1337", hint != NULL);
33959 +
33960 + if (!file || !reiser4_seal_is_set(&hint->seal))
33961 + return;
33962 + fsdata = reiser4_get_file_fsdata(file);
33963 + assert("vs-965", !IS_ERR(fsdata));
33964 + assert("nikita-19891",
33965 + coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
33966 + assert("vs-30", hint->lh.owner == NULL);
33967 + spin_lock_inode(file->f_dentry->d_inode);
33968 + fsdata->reg.hint = *hint;
33969 + spin_unlock_inode(file->f_dentry->d_inode);
33970 + return;
33971 +}
33972 +
33973 +void reiser4_unset_hint(hint_t * hint)
33974 +{
33975 + assert("vs-1315", hint);
33976 + hint->ext_coord.valid = 0;
33977 + reiser4_seal_done(&hint->seal);
33978 + done_lh(&hint->lh);
33979 +}
33980 +
33981 +/* coord must be set properly, so that reiser4_set_hint
33982 +   has nothing to do */
33983 +void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
33984 + znode_lock_mode mode)
33985 +{
33986 + ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
33987 + assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
33988 +
33989 + reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
33990 + hint->offset = get_key_offset(key);
33991 + hint->mode = mode;
33992 + done_lh(&hint->lh);
33993 +}
33994 +
33995 +int hint_is_set(const hint_t * hint)
33996 +{
33997 + return reiser4_seal_is_set(&hint->seal);
33998 +}
33999 +
34000 +#if REISER4_DEBUG
34001 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
34002 +{
34003 + return (get_key_locality(k1) == get_key_locality(k2) &&
34004 + get_key_type(k1) == get_key_type(k2) &&
34005 + get_key_band(k1) == get_key_band(k2) &&
34006 + get_key_ordering(k1) == get_key_ordering(k2) &&
34007 + get_key_objectid(k1) == get_key_objectid(k2));
34008 +}
34009 +#endif
34010 +
34011 +static int
34012 +hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
34013 + znode_lock_mode lock_mode)
34014 +{
34015 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
34016 + /* hint either not set or set by different operation */
34017 + return RETERR(-E_REPEAT);
34018 +
34019 + assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
34020 +
34021 + if (check_key && get_key_offset(key) != hint->offset)
34022 + /* hint is set for different key */
34023 + return RETERR(-E_REPEAT);
34024 +
34025 + assert("vs-31", hint->ext_coord.lh == &hint->lh);
34026 + return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
34027 + hint->ext_coord.lh, lock_mode,
34028 + ZNODE_LOCK_LOPRI);
34029 +}
34030 +
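/* Editorial sketch -- not part of the patch. hint_validate() above is a
 * cached-cursor fast path: the coord remembered from a previous operation
 * is reused only if the recorded seal still matches reality; otherwise the
 * caller falls back to a full tree lookup (see find_file_item). The idea,
 * reduced to a pointer paired with a version stamp (hypothetical names):
 */
#include <stdbool.h>
#include <stddef.h>

struct vnode { unsigned long version; };	/* bumped on every change */

struct seal {
	struct vnode *node;
	unsigned long version;	/* node->version when the seal was taken */
};

static void seal_take(struct seal *s, struct vnode *n)
{
	s->node = n;
	s->version = n->version;
}

static bool seal_still_valid(const struct seal *s)
{
	return s->node != NULL && s->node->version == s->version;
}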
34031 +/**
34032 + * Look for place at twig level for extent corresponding to page,
34033 + * call extent's writepage method to create unallocated extent if
34034 + * it does not exist yet, initialize jnode, capture page
34035 + */
34036 +int find_or_create_extent(struct page *page)
34037 +{
34038 + int result;
34039 + struct inode *inode;
34040 + int plugged_hole;
34041 +
34042 + jnode *node;
34043 +
34044 + assert("vs-1065", page->mapping && page->mapping->host);
34045 + inode = page->mapping->host;
34046 +
34047 + lock_page(page);
34048 + node = jnode_of_page(page);
34049 + if (IS_ERR(node)) {
34050 + unlock_page(page);
34051 + return PTR_ERR(node);
34052 + }
34053 + JF_SET(node, JNODE_WRITE_PREPARED);
34054 + unlock_page(page);
34055 +
34056 + if (node->blocknr == 0) {
34057 + plugged_hole = 0;
34058 + result = reiser4_update_extent(inode, node, page_offset(page),
34059 + &plugged_hole);
34060 + if (result) {
34061 + JF_CLR(node, JNODE_WRITE_PREPARED);
34062 + jput(node);
34063 + warning("edward-1549",
34064 + "reiser4_update_extent failed: %d", result);
34065 + return result;
34066 + }
34067 + if (plugged_hole)
34068 + reiser4_update_sd(inode);
34069 + } else {
34070 + spin_lock_jnode(node);
34071 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
34072 + BUG_ON(result != 0);
34073 + jnode_make_dirty_locked(node);
34074 + spin_unlock_jnode(node);
34075 + }
34076 +
34077 + BUG_ON(node->atom == NULL);
34078 + JF_CLR(node, JNODE_WRITE_PREPARED);
34079 +
34080 + if (get_current_context()->entd) {
34081 + entd_context *ent = get_entd_context(node->tree->super);
34082 +
34083 + if (ent->cur_request->page == page)
34084 + /* the following reference will be
34085 + dropped in reiser4_writeout */
34086 + ent->cur_request->node = jref(node);
34087 + }
34088 + jput(node);
34089 + return 0;
34090 +}
34091 +
34092 +/**
34093 + * has_anonymous_pages - check whether inode has pages dirtied via mmap
34094 + * @inode: inode to check
34095 + *
34096 + * Returns true if inode's mapping has dirty pages which do not belong to any
34097 + * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
34098 + * tree or were eflushed and can be found via jnodes tagged
34099 + * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
34100 + */
34101 +static int has_anonymous_pages(struct inode *inode)
34102 +{
34103 + int result;
34104 +
34105 + spin_lock_irq(&inode->i_mapping->tree_lock);
34106 + result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
34107 + spin_unlock_irq(&inode->i_mapping->tree_lock);
34108 + return result;
34109 +}
34110 +
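/* Editorial sketch -- not part of the patch. has_anonymous_pages() above
 * answers "any work to do?" with a single tag check instead of scanning
 * pages, and capture_anonymous_pages() below gang-looks-up tagged pages
 * and clears the tag as it goes. A toy model where a flag array stands in
 * for PAGECACHE_TAG_REISER4_MOVED:
 */
#include <stdbool.h>
#include <stddef.h>

#define NR_PAGES 8
static bool moved[NR_PAGES];	/* one "tag" bit per page */

static bool has_moved_pages(void)
{
	size_t i;

	for (i = 0; i < NR_PAGES; i++)
		if (moved[i])
			return true;
	return false;
}

/* collect up to 'max' tagged pages starting at *index, clearing the tag */
static size_t gang_lookup_moved(size_t *index, size_t out[], size_t max)
{
	size_t n = 0;

	for (; *index < NR_PAGES && n < max; (*index)++) {
		if (moved[*index]) {
			moved[*index] = false;	/* clear tag on lookup */
			out[n++] = *index;
		}
	}
	return n;
}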
34111 +/**
34112 + * capture_page_and_create_extent -
34113 + * @page: page to be captured
34114 + *
34115 + * Grabs space for extent creation and stat data update and calls function to
34116 + * do actual work.
34117 + */
34118 +static int capture_page_and_create_extent(struct page *page)
34119 +{
34120 + int result;
34121 + struct inode *inode;
34122 +
34123 + assert("vs-1084", page->mapping && page->mapping->host);
34124 + inode = page->mapping->host;
34125 + assert("vs-1139",
34126 + unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
34127 + /* page belongs to file */
34128 + assert("vs-1393",
34129 + inode->i_size > page_offset(page));
34130 +
34131 + /* page capture may require extent creation (if it does not exist yet)
34132 +	   and a stat data update (the number of blocks changes on extent
34133 + creation) */
34134 + grab_space_enable();
34135 + result = reiser4_grab_space(2 * estimate_one_insert_into_item
34136 + (reiser4_tree_by_inode(inode)),
34137 + BA_CAN_COMMIT);
34138 + if (likely(!result))
34139 + result = find_or_create_extent(page);
34140 +
34141 + if (result != 0)
34142 + SetPageError(page);
34143 + return result;
34144 +}
34145 +
34146 +/* plugin->write_end() */
34147 +int write_end_unix_file(struct file *file, struct page *page,
34148 + unsigned from, unsigned to)
34149 +{
34150 + unlock_page(page);
34151 + return capture_page_and_create_extent(page);
34152 +}
34153 +
34154 +/*
34155 + * Support for "anonymous" pages and jnodes.
34156 + *
34157 + * When a file is write-accessed through mmap, pages can be dirtied from user
34158 + * level. In this case the kernel is not notified until one of the following happens:
34159 + *
34160 + * (1) msync()
34161 + *
34162 + * (2) truncate() (either explicit or through unlink)
34163 + *
34164 + * (3) VM scanner starts reclaiming mapped pages, dirtying them before
34165 + * starting write-back.
34166 + *
34167 + * As a result of (3) ->writepage may be called on a dirty page without a
34168 + * jnode. Such a page is called "anonymous" in reiser4. Certain workloads
34169 + * (iozone) generate a huge number of anonymous pages.
34170 + *
34171 + * reiser4_sync_sb() method tries to insert anonymous pages into
34172 + * tree. This is done by capture_anonymous_*() functions below.
34173 + */
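
Case (1) above is easy to reproduce from user space. The following runnable demo dirties a page purely through a shared mapping — no write() syscall is involved — and then forces writeback with msync(), which is exactly the path that creates work for the capture_anonymous_*() machinery (the file name is arbitrary):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);
	char *p;

	if (fd < 0 || ftruncate(fd, 4096) != 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	memcpy(p, "dirtied via mmap", 17);	/* no write() syscall */
	msync(p, 4096, MS_SYNC);		/* forces writeback of the page */
	munmap(p, 4096);
	close(fd);
	return 0;
}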
34174 +
34175 +/**
34176 + * capture_anonymous_page - involve page into transaction
34177 + * @page: page to deal with
34178 + *
34179 + * Takes care that @page has corresponding metadata in the tree, creates jnode
34180 + * for @page and captures it. On success 1 is returned.
34181 + */
34182 +static int capture_anonymous_page(struct page *page)
34183 +{
34184 + int result;
34185 +
34186 + if (PageWriteback(page))
34187 + /* FIXME: do nothing? */
34188 + return 0;
34189 +
34190 + result = capture_page_and_create_extent(page);
34191 + if (result == 0) {
34192 + result = 1;
34193 + } else
34194 + warning("nikita-3329",
34195 + "Cannot capture anon page: %i", result);
34196 +
34197 + return result;
34198 +}
34199 +
34200 +/**
34201 + * capture_anonymous_pages - find and capture pages dirtied via mmap
34202 + * @mapping: address space where to look for pages
34203 + * @index: start index
34204 + * @to_capture: maximum number of pages to capture
34205 + *
34206 + * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
34207 + * captures (involves into atom) them, returns number of captured pages,
34208 + * updates @index to next page after the last captured one.
34209 + */
34210 +static int
34211 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
34212 + unsigned int to_capture)
34213 +{
34214 + int result;
34215 + struct pagevec pvec;
34216 + unsigned int i, count;
34217 + int nr;
34218 +
34219 + pagevec_init(&pvec, 0);
34220 + count = min(pagevec_space(&pvec), to_capture);
34221 + nr = 0;
34222 +
34223 + /* find pages tagged MOVED */
34224 + spin_lock_irq(&mapping->tree_lock);
34225 + pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
34226 + (void **)pvec.pages, *index, count,
34227 + PAGECACHE_TAG_REISER4_MOVED);
34228 + if (pagevec_count(&pvec) == 0) {
34229 + /*
34230 + * there are no pages tagged MOVED in mapping->page_tree
34231 + * starting from *index
34232 + */
34233 + spin_unlock_irq(&mapping->tree_lock);
34234 + *index = (pgoff_t)-1;
34235 + return 0;
34236 + }
34237 +
34238 + /* clear MOVED tag for all found pages */
34239 + for (i = 0; i < pagevec_count(&pvec); i++) {
34240 + page_cache_get(pvec.pages[i]);
34241 + radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
34242 + PAGECACHE_TAG_REISER4_MOVED);
34243 + }
34244 + spin_unlock_irq(&mapping->tree_lock);
34245 +
34247 + *index = pvec.pages[i - 1]->index + 1;
34248 +
34249 + for (i = 0; i < pagevec_count(&pvec); i++) {
34250 + result = capture_anonymous_page(pvec.pages[i]);
34251 + if (result == 1)
34252 + nr++;
34253 + else {
34254 + if (result < 0) {
34255 + warning("vs-1454",
34256 + "failed to capture page: "
34257 + "result=%d, captured=%d)\n",
34258 + result, i);
34259 +
34260 + /*
34261 + * set MOVED tag on all pages that were left
34262 + * uncaptured
34263 + */
34264 + spin_lock_irq(&mapping->tree_lock);
34265 + for (; i < pagevec_count(&pvec); i ++) {
34266 + radix_tree_tag_set(&mapping->page_tree,
34267 + pvec.pages[i]->index,
34268 + PAGECACHE_TAG_REISER4_MOVED);
34269 + }
34270 + spin_unlock_irq(&mapping->tree_lock);
34271 +
34272 + pagevec_release(&pvec);
34273 + return result;
34274 + } else {
34275 + /*
34276 + * result == 0. capture_anonymous_page returns
34277 + * 0 for a page under writeback. Set MOVED tag on
34278 + * that page
34279 + */
34280 + spin_lock_irq(&mapping->tree_lock);
34281 + radix_tree_tag_set(&mapping->page_tree,
34282 + pvec.pages[i]->index,
34283 + PAGECACHE_TAG_REISER4_MOVED);
34284 + spin_unlock_irq(&mapping->tree_lock);
34285 + if (i == 0)
34286 + *index = pvec.pages[0]->index;
34287 + else
34288 + *index = pvec.pages[i - 1]->index + 1;
34289 + }
34290 + }
34291 + }
34292 + pagevec_release(&pvec);
34293 + return nr;
34294 +}
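
Note the tag discipline in capture_anonymous_pages(): tags are cleared optimistically for the whole batch under the tree lock, and re-set for whatever was not captured if an error interrupts the batch. A small self-contained sketch of that clear/rollback pattern over a plain flag array (process_page() is a hypothetical stand-in for the capture work):

#include <stdbool.h>

/* hypothetical stand-in for the per-page capture work */
static int process_page(int idx) { (void)idx; return 0; }

/* returns number captured, or a negative error */
static int capture_batch(bool moved[], int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		moved[i] = false;		/* optimistic clear (under "lock") */
	for (i = 0; i < nr; i++) {
		int err = process_page(i);

		if (err < 0) {
			int j;

			for (j = i; j < nr; j++)
				moved[j] = true; /* rollback: re-tag the rest */
			return err;
		}
	}
	return nr;
}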
34295 +
34296 +/**
34297 + * capture_anonymous_jnodes - find and capture anonymous jnodes
34298 + * @mapping: address space where to look for jnodes
34299 + * @from: start index
34300 + * @to: end index
34301 + * @to_capture: maximum number of jnodes to capture
34302 + *
34303 + * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
34304 + * the range of indexes @from-@to and captures them, returns number of captured
34305 + * jnodes, updates @from to next jnode after the last captured one.
34306 + */
34307 +static int
34308 +capture_anonymous_jnodes(struct address_space *mapping,
34309 + pgoff_t *from, pgoff_t to, int to_capture)
34310 +{
34311 + *from = to;
34312 + return 0;
34313 +}
34314 +
34315 +/*
34316 + * Commit atom of the jnode of a page.
34317 + */
34318 +static int sync_page(struct page *page)
34319 +{
34320 + int result;
34321 + do {
34322 + jnode *node;
34323 + txn_atom *atom;
34324 +
34325 + lock_page(page);
34326 + node = jprivate(page);
34327 + if (node != NULL) {
34328 + spin_lock_jnode(node);
34329 + atom = jnode_get_atom(node);
34330 + spin_unlock_jnode(node);
34331 + } else
34332 + atom = NULL;
34333 + unlock_page(page);
34334 + result = reiser4_sync_atom(atom);
34335 + } while (result == -E_REPEAT);
34336 + /*
34337 + * ZAM-FIXME-HANS: document the logic of this loop, is it just to
34338 + * handle the case where more pages get added to the atom while we are
34339 + * syncing it?
34340 + */
34341 + assert("nikita-3485", ergo(result == 0,
34342 + get_current_context()->trans->atom == NULL));
34343 + return result;
34344 +}
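
The do/while above is a retry-until-stable loop: the page-to-atom binding is re-resolved on every pass because committing one atom may migrate the page into another. Stripped of the reiser4 types, the shape is (E_REPEAT's value and try_commit() are illustrative stand-ins):

#define E_REPEAT 10001

static int try_commit(void)
{
	/* commit the atom currently bound to the object; may return
	 * -E_REPEAT if the binding changed under us */
	return 0;
}

static int sync_until_stable(void)
{
	int ret;

	do {
		/* re-resolve object -> atom on every pass */
		ret = try_commit();
	} while (ret == -E_REPEAT);
	return ret;
}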
34345 +
34346 +/*
34347 + * Commit atoms of all pages in the mapping:
34348 + * call sync_page for each page found in the mapping's page tree.
34349 + */
34350 +static int sync_page_list(struct inode *inode)
34351 +{
34352 + int result;
34353 + struct address_space *mapping;
34354 + unsigned long from; /* start index for radix_tree_gang_lookup */
34355 + unsigned int found; /* return value for radix_tree_gang_lookup */
34356 +
34357 + mapping = inode->i_mapping;
34358 + from = 0;
34359 + result = 0;
34360 + spin_lock_irq(&mapping->tree_lock);
34361 + while (result == 0) {
34362 + struct page *page;
34363 +
34364 + found =
34365 + radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
34366 + from, 1);
34367 + assert("edward-1550", found < 2);
34368 + if (found == 0)
34369 + break;
34370 + /**
34371 + * the page cannot leave the radix tree because it is protected from
34372 + * truncation by inode->i_mutex, which is held by sys_fsync
34373 + */
34374 + page_cache_get(page);
34375 + spin_unlock_irq(&mapping->tree_lock);
34376 +
34377 + from = page->index + 1;
34378 +
34379 + result = sync_page(page);
34380 +
34381 + page_cache_release(page);
34382 + spin_lock_irq(&mapping->tree_lock);
34383 + }
34384 +
34385 + spin_unlock_irq(&mapping->tree_lock);
34386 + return result;
34387 +}
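
sync_page_list() shows the standard lock/drop/resume iteration: the tree lock is held only long enough to find the next page and pin it, dropped for the blocking sync, then re-taken with the scan resuming from the saved index. A userspace sketch of the same shape, assuming a pthread mutex and a plain presence table in place of the mapping lock and page tree:

#include <pthread.h>
#include <stdbool.h>

#define NITEMS 64

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static bool present[NITEMS];

static int blocking_work(int idx) { (void)idx; return 0; }

static int walk_all(void)
{
	int idx = 0, ret = 0;

	pthread_mutex_lock(&table_lock);
	while (ret == 0) {
		/* find the next item while holding the lock */
		while (idx < NITEMS && !present[idx])
			idx++;
		if (idx >= NITEMS)
			break;
		pthread_mutex_unlock(&table_lock);
		ret = blocking_work(idx);	/* may sleep */
		idx++;				/* resume past this item */
		pthread_mutex_lock(&table_lock);
	}
	pthread_mutex_unlock(&table_lock);
	return ret;
}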
34388 +
34389 +static int commit_file_atoms(struct inode *inode)
34390 +{
34391 + int result;
34392 + struct unix_file_info *uf_info;
34393 +
34394 + uf_info = unix_file_inode_data(inode);
34395 +
34396 + get_exclusive_access(uf_info);
34397 + /*
34398 + * find what items file is made from
34399 + */
34400 + result = find_file_state(inode, uf_info);
34401 + drop_exclusive_access(uf_info);
34402 + if (result != 0)
34403 + return result;
34404 +
34405 + /*
34406 + * file state cannot change because we are under ->i_mutex
34407 + */
34408 + switch (uf_info->container) {
34409 + case UF_CONTAINER_EXTENTS:
34410 + /* find_file_state might open or join an atom */
34411 + reiser4_txn_restart_current();
34412 + result =
34413 + /*
34414 + * when we are called by
34415 + * filemap_fdatawrite->
34416 + * do_writepages()->
34417 + * reiser4_writepages()
34418 + *
34419 + * inode->i_mapping->dirty_pages are spliced into
34420 + * ->io_pages, leaving ->dirty_pages dirty.
34421 + *
34422 + * When we are called from
34423 + * reiser4_fsync()->sync_unix_file(), we have to
34424 + * commit atoms of all pages on the ->dirty_list.
34425 + *
34426 + * So for simplicity we just commit ->io_pages and
34427 + * ->dirty_pages.
34428 + */
34429 + sync_page_list(inode);
34430 + break;
34431 + case UF_CONTAINER_TAILS:
34432 + /*
34433 + * NOTE-NIKITA probably we can be smarter for tails. For now
34434 + * just commit all existing atoms.
34435 + */
34436 + result = txnmgr_force_commit_all(inode->i_sb, 0);
34437 + break;
34438 + case UF_CONTAINER_EMPTY:
34439 + result = 0;
34440 + break;
34441 + case UF_CONTAINER_UNKNOWN:
34442 + default:
34443 + result = -EIO;
34444 + break;
34445 + }
34446 +
34447 + /*
34448 + * commit current transaction: there can be captured nodes from
34449 + * find_file_state() and finish_conversion().
34450 + */
34451 + reiser4_txn_restart_current();
34452 + return result;
34453 +}
34454 +
34455 +/**
34456 + * writepages_unix_file - writepages of struct address_space_operations
34457 + * @mapping:
34458 + * @wbc:
34459 + *
34460 + * This captures anonymous pages and anonymous jnodes. Anonymous pages are
34461 + * pages which are dirtied via mmap. Anonymous jnodes are ones which were
34462 + * created by reiser4_writepage.
34463 + */
34464 +int writepages_unix_file(struct address_space *mapping,
34465 + struct writeback_control *wbc)
34466 +{
34467 + int result;
34468 + struct unix_file_info *uf_info;
34469 + pgoff_t pindex, jindex, nr_pages;
34470 + long to_capture;
34471 + struct inode *inode;
34472 +
34473 + inode = mapping->host;
34474 + if (!has_anonymous_pages(inode)) {
34475 + result = 0;
34476 + goto end;
34477 + }
34478 + jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
34479 + result = 0;
34480 + nr_pages = size_in_pages(i_size_read(inode));
34481 +
34482 + uf_info = unix_file_inode_data(inode);
34483 +
34484 + do {
34485 + reiser4_context *ctx;
34486 +
34487 + if (wbc->sync_mode != WB_SYNC_ALL)
34488 + to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
34489 + else
34490 + to_capture = CAPTURE_APAGE_BURST;
34491 +
34492 + ctx = reiser4_init_context(inode->i_sb);
34493 + if (IS_ERR(ctx)) {
34494 + result = PTR_ERR(ctx);
34495 + break;
34496 + }
34497 + /* avoid recursive calls to ->sync_inodes */
34498 + ctx->nobalance = 1;
34499 + assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
34500 + assert("edward-1551", LOCK_CNT_NIL(inode_sem_w));
34501 + assert("edward-1552", LOCK_CNT_NIL(inode_sem_r));
34502 +
34503 + reiser4_txn_restart_current();
34504 +
34505 + /* we have to get nonexclusive access to the file */
34506 + if (get_current_context()->entd) {
34507 + /*
34508 + * use nonblocking version of nonexclusive_access to
34509 + * avoid deadlock which might look like the following:
34510 + * process P1 holds NEA on file F1 and called entd to
34511 + * reclaim some memory. Entd works for P1 and is going
34512 + * to capture pages of file F2. To do that entd has to
34513 + * get NEA to F2. F2 is held by process P2 which also
34514 + * called entd. But entd is serving P1 at the moment
34515 + * and P2 has to wait. Process P3 is trying to get EA
34516 + * on file F2. The existence of a pending EA request
34517 + * for file F2 makes it impossible for entd to get NEA
34518 + * on F2. None of these processes can continue. Using
34519 + * the nonblocking version of getting NEA is supposed
34520 + * to avoid this deadlock.
34521 + */
34522 + if (try_to_get_nonexclusive_access(uf_info) == 0) {
34523 + result = RETERR(-EBUSY);
34524 + reiser4_exit_context(ctx);
34525 + break;
34526 + }
34527 + } else
34528 + get_nonexclusive_access(uf_info);
34529 +
34530 + while (to_capture > 0) {
34531 + pgoff_t start;
34532 +
34533 + assert("vs-1727", jindex <= pindex);
34534 + if (pindex == jindex) {
34535 + start = pindex;
34536 + result =
34537 + capture_anonymous_pages(inode->i_mapping,
34538 + &pindex,
34539 + to_capture);
34540 + if (result <= 0)
34541 + break;
34542 + to_capture -= result;
34543 + wbc->nr_to_write -= result;
34544 + if (start + result == pindex) {
34545 + jindex = pindex;
34546 + continue;
34547 + }
34548 + if (to_capture <= 0)
34549 + break;
34550 + }
34551 + /* deal with anonymous jnodes between jindex and pindex */
34552 + result =
34553 + capture_anonymous_jnodes(inode->i_mapping, &jindex,
34554 + pindex, to_capture);
34555 + if (result < 0)
34556 + break;
34557 + to_capture -= result;
34558 + get_current_context()->nr_captured += result;
34559 +
34560 + if (jindex == (pgoff_t) - 1) {
34561 + assert("vs-1728", pindex == (pgoff_t) - 1);
34562 + break;
34563 + }
34564 + }
34565 + if (to_capture <= 0)
34566 + /* there may be more pages left */
34567 + __mark_inode_dirty(inode, I_DIRTY_PAGES);
34568 +
34569 + drop_nonexclusive_access(uf_info);
34570 + if (result < 0) {
34571 + /* error happened */
34572 + reiser4_exit_context(ctx);
34573 + return result;
34574 + }
34575 + if (wbc->sync_mode != WB_SYNC_ALL) {
34576 + reiser4_exit_context(ctx);
34577 + return 0;
34578 + }
34579 + result = commit_file_atoms(inode);
34580 + reiser4_exit_context(ctx);
34581 + if (pindex >= nr_pages && jindex == pindex)
34582 + break;
34583 + } while (1);
34584 +
34585 + end:
34586 + if (is_in_reiser4_context()) {
34587 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34588 + /*
34589 + * there are already pages to flush, flush them out, do
34590 + * not delay until end of reiser4_sync_inodes
34591 + */
34592 + reiser4_writeout(inode->i_sb, wbc);
34593 + get_current_context()->nr_captured = 0;
34594 + }
34595 + }
34596 + return result;
34597 +}
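
Structurally, the loop above runs two cursors: pindex leads through tagged pages, and whenever a capture pass leaves a gap (start + result != pindex), jindex sweeps the skipped range for anonymous jnodes before pindex may advance again. A compressed, illustrative model of that interplay — the capture functions here are fakes that merely move the cursors:

typedef unsigned long pgoff_m;

/* pretend page capture: advances *p, possibly jumping over a gap */
static int capture_pages_m(pgoff_m *p, long budget)
{
	long got = budget / 2 + 1;

	*p += got + 1;		/* the "+ 1" models a skipped page */
	return (int)got;
}

/* pretend jnode capture: sweeps the cursor up to @upto */
static int capture_jnodes_m(pgoff_m *j, pgoff_m upto)
{
	*j = upto;
	return 0;
}

static void writepages_model(long budget)
{
	pgoff_m pindex = 0, jindex = 0;

	while (budget > 0) {
		if (pindex == jindex) {
			pgoff_m start = pindex;
			int got = capture_pages_m(&pindex, budget);

			if (got <= 0)
				break;
			budget -= got;
			if (start + (pgoff_m)got == pindex) {
				jindex = pindex;	/* no gap */
				continue;
			}
		}
		/* jindex catches up through the range pindex jumped over */
		if (capture_jnodes_m(&jindex, pindex) < 0)
			break;
	}
}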
34598 +
34599 +/**
34600 + * readpage_unix_file - readpage of struct address_space_operations
34601 + * @file:
34602 + * @page:
34603 + *
34604 + * Compose a key and search for the item containing information about
34605 + * @page's data. If the item is found, its readpage method is called.
34606 + */
34607 +int readpage_unix_file(struct file *file, struct page *page)
34608 +{
34609 + reiser4_context *ctx;
34610 + int result;
34611 + struct inode *inode;
34612 + reiser4_key key;
34613 + item_plugin *iplug;
34614 + hint_t *hint;
34615 + lock_handle *lh;
34616 + coord_t *coord;
34617 +
34618 + assert("vs-1062", PageLocked(page));
34619 + assert("vs-976", !PageUptodate(page));
34620 + assert("vs-1061", page->mapping && page->mapping->host);
34621 +
34622 + if (page->mapping->host->i_size <= page_offset(page)) {
34623 + /* page is out of file */
34624 + zero_user(page, 0, PAGE_CACHE_SIZE);
34625 + SetPageUptodate(page);
34626 + unlock_page(page);
34627 + return 0;
34628 + }
34629 +
34630 + inode = page->mapping->host;
34631 + ctx = reiser4_init_context(inode->i_sb);
34632 + if (IS_ERR(ctx)) {
34633 + unlock_page(page);
34634 + return PTR_ERR(ctx);
34635 + }
34636 +
34637 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34638 + if (hint == NULL) {
34639 + unlock_page(page);
34640 + reiser4_exit_context(ctx);
34641 + return RETERR(-ENOMEM);
34642 + }
34643 +
34644 + result = load_file_hint(file, hint);
34645 + if (result) {
34646 + kfree(hint);
34647 + unlock_page(page);
34648 + reiser4_exit_context(ctx);
34649 + return result;
34650 + }
34651 + lh = &hint->lh;
34652 +
34653 + /* get key of first byte of the page */
34654 + key_by_inode_and_offset_common(inode, page_offset(page), &key);
34655 +
34656 + /* look for file metadata corresponding to first byte of page */
34657 + page_cache_get(page);
34658 + unlock_page(page);
34659 + result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
34660 + lock_page(page);
34661 + page_cache_release(page);
34662 +
34663 + if (page->mapping == NULL) {
34664 + /*
34665 + * readpage allows truncate to run concurrently. Page was
34666 + * truncated while it was not locked
34667 + */
34668 + done_lh(lh);
34669 + kfree(hint);
34670 + unlock_page(page);
34671 + reiser4_txn_restart(ctx);
34672 + reiser4_exit_context(ctx);
34673 + return -EINVAL;
34674 + }
34675 +
34676 + if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
34677 + if (result == CBK_COORD_FOUND &&
34678 + hint->ext_coord.coord.between != AT_UNIT)
34679 + /* file is truncated */
34680 + result = -EINVAL;
34681 + done_lh(lh);
34682 + kfree(hint);
34683 + unlock_page(page);
34684 + reiser4_txn_restart(ctx);
34685 + reiser4_exit_context(ctx);
34686 + return result;
34687 + }
34688 +
34689 + /*
34690 + * The item corresponding to the page is found. It cannot be removed
34691 + * because the znode lock is held
34692 + */
34693 + if (PageUptodate(page)) {
34694 + done_lh(lh);
34695 + kfree(hint);
34696 + unlock_page(page);
34697 + reiser4_txn_restart(ctx);
34698 + reiser4_exit_context(ctx);
34699 + return 0;
34700 + }
34701 +
34702 + coord = &hint->ext_coord.coord;
34703 + result = zload(coord->node);
34704 + if (result) {
34705 + done_lh(lh);
34706 + kfree(hint);
34707 + unlock_page(page);
34708 + reiser4_txn_restart(ctx);
34709 + reiser4_exit_context(ctx);
34710 + return result;
34711 + }
34712 +
34713 + validate_extended_coord(&hint->ext_coord, page_offset(page));
34714 +
34715 + if (!coord_is_existing_unit(coord)) {
34716 + /* this indicates corruption */
34717 + warning("vs-280",
34718 + "Looking for page %lu of file %llu (size %lli). "
34719 + "No file items found (%d). File is corrupted?\n",
34720 + page->index, (unsigned long long)get_inode_oid(inode),
34721 + inode->i_size, result);
34722 + zrelse(coord->node);
34723 + done_lh(lh);
34724 + kfree(hint);
34725 + unlock_page(page);
34726 + reiser4_txn_restart(ctx);
34727 + reiser4_exit_context(ctx);
34728 + return RETERR(-EIO);
34729 + }
34730 +
34731 + /*
34732 + * get the plugin of the found item and call its readpage method,
34733 + * if it provides one
34734 + */
34735 + iplug = item_plugin_by_coord(coord);
34736 + if (iplug->s.file.readpage)
34737 + result = iplug->s.file.readpage(coord, page);
34738 + else
34739 + result = RETERR(-EINVAL);
34740 +
34741 + if (!result) {
34742 + set_key_offset(&key,
34743 + (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
34744 + /* FIXME should call reiser4_set_hint() */
34745 + reiser4_unset_hint(hint);
34746 + } else {
34747 + unlock_page(page);
34748 + reiser4_unset_hint(hint);
34749 + }
34750 + assert("vs-979",
34751 + ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
34752 + assert("vs-9791", ergo(result != 0, !PageLocked(page)));
34753 +
34754 + zrelse(coord->node);
34755 + done_lh(lh);
34756 +
34757 + save_file_hint(file, hint);
34758 + kfree(hint);
34759 +
34760 + /*
34761 + * FIXME: explain why it is needed. HINT: page allocation in write can
34762 + * not be done when atom is not NULL because reiser4_writepage can not
34763 + * kick entd and has to eflush
34764 + */
34765 + reiser4_txn_restart(ctx);
34766 + reiser4_exit_context(ctx);
34767 + return result;
34768 +}
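
Every failure path in readpage_unix_file() repeats the same done_lh/kfree/unlock/exit sequence. For comparison, here is how such paths are conventionally collapsed into a single goto ladder — a refactoring sketch with stub types and helpers, not the patch's actual code:

struct ctx;
struct hint;
struct page;

static void cleanup_hint(struct hint *h)	{ (void)h; }
static void unlock(struct page *p)		{ (void)p; }
static void exit_ctx(struct ctx *c)		{ (void)c; }

static int readpage_sketch(struct ctx *ctx, struct hint *hint,
			   struct page *page, int result)
{
	if (result)
		goto out;	/* every early failure funnels here */
	/* ... main work; any failure sets result and jumps to out ... */
out:
	cleanup_hint(hint);	/* done_lh + kfree in the real code */
	unlock(page);
	exit_ctx(ctx);		/* txn restart + context exit */
	return result;
}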
34769 +
34770 +struct uf_readpages_context {
34771 + lock_handle lh;
34772 + coord_t coord;
34773 +};
34774 +
34775 +/* A callback function for readpages_unix_file/read_cache_pages.
34776 + * If the file is built of tails, then return an error (-EIO).
34777 + *
34778 + * @data -- a pointer to reiser4_readpages_context object,
34779 + * to save the twig lock and the coord between
34780 + * read_cache_page iterations.
34781 + * @page -- page to start read.
34782 + */
34783 +static int uf_readpages_filler(void * data, struct page * page)
34784 +{
34785 + struct uf_readpages_context *rc = data;
34786 + jnode * node;
34787 + int ret = 0;
34788 + reiser4_extent *ext;
34789 + __u64 ext_index;
34790 + int cbk_done = 0;
34791 + struct address_space * mapping = page->mapping;
34792 +
34793 + if (PageUptodate(page)) {
34794 + unlock_page(page);
34795 + return 0;
34796 + }
34797 + page_cache_get(page);
34798 +
34799 + if (rc->lh.node == 0) {
34800 + /* no twig lock - have to do tree search. */
34801 + reiser4_key key;
34802 + repeat:
34803 + unlock_page(page);
34804 + key_by_inode_and_offset_common(
34805 + mapping->host, page_offset(page), &key);
34806 + ret = coord_by_key(
34807 + &get_super_private(mapping->host->i_sb)->tree,
34808 + &key, &rc->coord, &rc->lh,
34809 + ZNODE_READ_LOCK, FIND_EXACT,
34810 + TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
34811 + if (unlikely(ret))
34812 + goto exit;
34813 + lock_page(page);
34814 + if (PageUptodate(page))
34815 + goto unlock;
34816 + cbk_done = 1;
34817 + }
34818 + ret = zload(rc->coord.node);
34819 + if (unlikely(ret))
34820 + goto unlock;
34821 + if (!coord_is_existing_item(&rc->coord) ||
34822 + !item_is_extent(&rc->coord)) {
34823 + zrelse(rc->coord.node);
34824 + ret = RETERR(-EIO);
34825 + goto unlock;
34826 + }
34827 + ext = extent_by_coord(&rc->coord);
34828 + ext_index = extent_unit_index(&rc->coord);
34829 + if (page->index < ext_index ||
34830 + page->index >= ext_index + extent_get_width(ext)) {
34831 + /* the page index doesn't belong to the extent unit
34832 + which the coord points to - release the lock and
34833 + repeat with tree search. */
34834 + zrelse(rc->coord.node);
34835 + done_lh(&rc->lh);
34836 + /* we can be here after a CBK call only in case of
34837 + corruption of the tree or a bug in the tree lookup algorithm. */
34838 + if (unlikely(cbk_done)) {
34839 + ret = RETERR(-EIO);
34840 + goto unlock;
34841 + }
34842 + goto repeat;
34843 + }
34844 + node = jnode_of_page(page);
34845 + if (unlikely(IS_ERR(node))) {
34846 + zrelse(rc->coord.node);
34847 + ret = PTR_ERR(node);
34848 + goto unlock;
34849 + }
34850 + ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
34851 + jput(node);
34852 + zrelse(rc->coord.node);
34853 + if (likely(!ret))
34854 + goto exit;
34855 + unlock:
34856 + unlock_page(page);
34857 + exit:
34858 + page_cache_release(page);
34859 + return ret;
34860 +}
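
The filler's main trick is cursor reuse: the twig-level coord from the previous page is kept, and the expensive coord_by_key() search is repeated only when the cached extent no longer covers the requested index. The same memoization shape, reduced to a lookup over a small illustrative extent table:

#include <stddef.h>

struct extent_m { unsigned long start, width; };

static struct extent_m table[] = { {0, 16}, {16, 64}, {80, 8} };
static struct extent_m *cached;

static struct extent_m *lookup(unsigned long idx)
{
	size_t i;

	/* fast path: cached cursor still covers idx */
	if (cached && idx >= cached->start &&
	    idx < cached->start + cached->width)
		return cached;
	/* slow path: full search, then re-cache */
	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (idx >= table[i].start &&
		    idx < table[i].start + table[i].width)
			return cached = &table[i];
	return NULL;	/* hole: corresponds to -EIO above */
}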
34861 +
34862 +/**
34863 + * readpages_unix_file - called by the readahead code, starts reading for each
34864 + * page of the given list of pages
34865 + */
34866 +int readpages_unix_file(
34867 + struct file *file, struct address_space *mapping,
34868 + struct list_head *pages, unsigned nr_pages)
34869 +{
34870 + reiser4_context *ctx;
34871 + struct uf_readpages_context rc;
34872 + int ret;
34873 +
34874 + ctx = reiser4_init_context(mapping->host->i_sb);
34875 + if (IS_ERR(ctx)) {
34876 + put_pages_list(pages);
34877 + return PTR_ERR(ctx);
34878 + }
34879 + init_lh(&rc.lh);
34880 + ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc);
34881 + done_lh(&rc.lh);
34882 + context_set_commit_async(ctx);
34883 + /* close the transaction to protect further page allocation from deadlocks */
34884 + reiser4_txn_restart(ctx);
34885 + reiser4_exit_context(ctx);
34886 + return ret;
34887 +}
34888 +
34889 +static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
34890 + loff_t count UNUSED_ARG)
34891 +{
34892 + /* We should reserve one block for the update of the stat data
34893 + item */
34894 + assert("vs-1249",
34895 + inode_file_plugin(inode)->estimate.update ==
34896 + estimate_update_common);
34897 + return estimate_update_common(inode);
34898 +}
34899 +
34900 +/* this is called with nonexclusive access obtained, file's container can not change */
34901 +static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from */
34902 + char __user *buf, /* address of user-space buffer */
34903 + size_t count, /* number of bytes to read */
34904 + loff_t *off)
34905 +{
34906 + int result;
34907 + struct inode *inode;
34908 + flow_t flow;
34909 + int (*read_f) (struct file *, flow_t *, hint_t *);
34910 + coord_t *coord;
34911 + znode *loaded;
34912 +
34913 + inode = file->f_dentry->d_inode;
34914 +
34915 + /* build flow */
34916 + assert("vs-1250",
34917 + inode_file_plugin(inode)->flow_by_inode ==
34918 + flow_by_inode_unix_file);
34919 + result =
34920 + flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
34921 + *off, READ_OP, &flow);
34922 + if (unlikely(result))
34923 + return result;
34924 +
34925 + /* get seal and coord sealed with it from reiser4 private data
34926 + of struct file. The coord will tell us where our last read
34927 + of this file finished, and the seal will help to determine
34928 + if that location is still valid.
34929 + */
34930 + coord = &hint->ext_coord.coord;
34931 + while (flow.length && result == 0) {
34932 + result =
34933 + find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
34934 + if (cbk_errored(result))
34935 + /* error happened */
34936 + break;
34937 +
34938 + if (coord->between != AT_UNIT) {
34939 + /* there were no items corresponding to given offset */
34940 + done_lh(hint->ext_coord.lh);
34941 + break;
34942 + }
34943 +
34944 + loaded = coord->node;
34945 + result = zload(loaded);
34946 + if (unlikely(result)) {
34947 + done_lh(hint->ext_coord.lh);
34948 + break;
34949 + }
34950 +
34951 + if (hint->ext_coord.valid == 0)
34952 + validate_extended_coord(&hint->ext_coord,
34953 + get_key_offset(&flow.key));
34954 +
34955 + assert("vs-4", hint->ext_coord.valid == 1);
34956 + assert("vs-33", hint->ext_coord.lh == &hint->lh);
34957 + /* call item's read method */
34958 + read_f = item_plugin_by_coord(coord)->s.file.read;
34959 + result = read_f(file, &flow, hint);
34960 + zrelse(loaded);
34961 + done_lh(hint->ext_coord.lh);
34962 + }
34963 +
34964 + return (count - flow.length) ? (count - flow.length) : result;
34965 +}
34966 +
34967 +static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
34968 +
34969 +/**
34970 + * read_unix_file - read of struct file_operations
34971 + * @file: file to read from
34972 + * @buf: address of user-space buffer
34973 + * @read_amount: number of bytes to read
34974 + * @off: position in file to read from
34975 + *
34976 + * This is implementation of vfs's read method of struct file_operations for
34977 + * unix file plugin.
34978 + */
34979 +ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
34980 + loff_t *off)
34981 +{
34982 + reiser4_context *ctx;
34983 + ssize_t result;
34984 + struct inode *inode;
34985 + struct unix_file_info *uf_info;
34986 +
34987 + if (unlikely(read_amount == 0))
34988 + return 0;
34989 +
34990 + assert("umka-072", file != NULL);
34991 + assert("umka-074", off != NULL);
34992 + inode = file->f_dentry->d_inode;
34993 + assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34994 +
34995 + ctx = reiser4_init_context(inode->i_sb);
34996 + if (IS_ERR(ctx))
34997 + return PTR_ERR(ctx);
34998 + uf_info = unix_file_inode_data(inode);
34999 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35000 + get_exclusive_access(uf_info);
35001 + result = find_file_state(inode, uf_info);
35002 + if (unlikely(result != 0))
35003 + goto out;
35004 + } else
35005 + get_nonexclusive_access(uf_info);
35006 + result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
35007 + BA_CAN_COMMIT);
35008 + if (unlikely(result != 0))
35009 + goto out;
35010 + if (uf_info->container == UF_CONTAINER_EXTENTS){
35011 + result = do_sync_read(file, buf, read_amount, off);
35012 + } else if (uf_info->container == UF_CONTAINER_TAILS ||
35013 + reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
35014 + reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
35015 + result = read_unix_file_container_tails(file, buf, read_amount, off);
35016 + } else {
35017 + assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
35018 + result = 0;
35019 + }
35020 +out:
35021 + drop_access(uf_info);
35022 + context_set_commit_async(ctx);
35023 + reiser4_exit_context(ctx);
35024 + return result;
35025 +}
35026 +
35027 +static ssize_t read_unix_file_container_tails(
35028 + struct file *file, char __user *buf, size_t read_amount, loff_t *off)
35029 +{
35030 + int result;
35031 + struct inode *inode;
35032 + hint_t *hint;
35033 + struct unix_file_info *uf_info;
35034 + size_t count, left; ssize_t read; /* read is signed: may hold an error */
35035 + loff_t size;
35036 +
35037 + assert("umka-072", file != NULL);
35038 + assert("umka-074", off != NULL);
35039 + inode = file->f_dentry->d_inode;
35040 + assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
35041 +
35042 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
35043 + if (hint == NULL)
35044 + return RETERR(-ENOMEM);
35045 +
35046 + result = load_file_hint(file, hint);
35047 + if (result) {
35048 + kfree(hint);
35049 + return result;
35050 + }
35051 +
35052 + left = read_amount;
35053 + count = 0;
35054 + uf_info = unix_file_inode_data(inode);
35055 + while (left > 0) {
35056 + reiser4_txn_restart_current();
35057 + size = i_size_read(inode);
35058 + if (*off >= size)
35059 + /* position to read from is past the end of file */
35060 + break;
35061 + if (*off + left > size)
35062 + left = size - *off;
35063 + /* faultin user page */
35064 + result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
35065 + if (result) {
35066 + result = RETERR(-EFAULT); break; } /* exit via cleanup below, don't leak hint */
35067 +
35068 + read = read_file(hint, file, buf,
35069 + left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
35070 + off);
35071 + if (read < 0) {
35072 + result = read;
35073 + break;
35074 + }
35075 + left -= read;
35076 + buf += read;
35077 +
35078 + /* update position in a file */
35079 + *off += read;
35080 + /* total number of read bytes */
35081 + count += read;
35082 + }
35083 + done_lh(&hint->lh);
35084 + save_file_hint(file, hint);
35085 + kfree(hint);
35086 + if (count)
35087 + file_accessed(file);
35088 + /* return number of read bytes or error code if nothing is read */
35089 + return count ? count : result;
35090 +}
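
The loop above reads at most one page worth per pass, pre-faulting the user buffer so the copy cannot fault while tree locks are held, and returns bytes-read-so-far in preference to a trailing error. A runnable userspace analogue of the chunking and return convention, using pread() on an ordinary file descriptor:

#include <sys/types.h>
#include <unistd.h>

#define CHUNK 4096	/* one page worth per pass, as above */

static ssize_t read_in_chunks(int fd, char *buf, size_t count, off_t *off)
{
	size_t left = count;
	ssize_t n = 0;

	while (left > 0) {
		size_t want = left > CHUNK ? CHUNK : left;

		n = pread(fd, buf, want, *off);
		if (n <= 0)
			break;			/* EOF or error */
		left -= (size_t)n;
		buf += n;
		*off += n;
	}
	/* bytes already read win over a trailing error, like the loop above */
	return count - left ? (ssize_t)(count - left) : n;
}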
35091 +
35092 +/* This function takes care of @file's pages. First of all it checks whether
35093 + the filesystem is read-only and, if so, returns. Otherwise, it throws out
35094 + all pages of the file if it was mapped for read, is going to be mapped for
35095 + write, and consists of tails. This is done to avoid managing several copies
35096 + of the data (one in the page cache and one in the tails themselves) when
35097 + mapping files that consist of tails.
35098 +
35099 + Tail2extent conversion is also performed here if it is allowed and the file
35100 + is going to be written to or mapped for write. This function may be called
35101 + from write_unix_file() or mmap_unix_file(). */
35102 +static int check_pages_unix_file(struct file *file, struct inode *inode)
35103 +{
35104 + reiser4_invalidate_pages(inode->i_mapping, 0,
35105 + (inode->i_size + PAGE_CACHE_SIZE -
35106 + 1) >> PAGE_CACHE_SHIFT, 0);
35107 + return unpack(file, inode, 0 /* not forever */ );
35108 +}
35109 +
35110 +/**
35111 + * mmap_unix_file - mmap of struct file_operations
35112 + * @file: file to mmap
35113 + * @vma:
35114 + *
35115 + * This is implementation of vfs's mmap method of struct file_operations for
35116 + * unix file plugin. It converts the file to extents if necessary and sets
35117 + * the reiser4_inode flag REISER4_HAS_MMAP.
35118 + */
35119 +int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
35120 +{
35121 + reiser4_context *ctx;
35122 + int result;
35123 + struct inode *inode;
35124 + struct unix_file_info *uf_info;
35125 + reiser4_block_nr needed;
35126 +
35127 + inode = file->f_dentry->d_inode;
35128 + ctx = reiser4_init_context(inode->i_sb);
35129 + if (IS_ERR(ctx))
35130 + return PTR_ERR(ctx);
35131 +
35132 + uf_info = unix_file_inode_data(inode);
35133 +
35134 + get_exclusive_access_careful(uf_info, inode);
35135 +
35136 + if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
35137 + /*
35138 + * we need the file built of extent items. If it is still built of
35139 + * tail items we have to convert it. Find what items the file
35140 + * is built of
35141 + */
35142 + result = find_file_state(inode, uf_info);
35143 + if (result != 0) {
35144 + drop_exclusive_access(uf_info);
35145 + reiser4_exit_context(ctx);
35146 + return result;
35147 + }
35148 +
35149 + assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
35150 + uf_info->container == UF_CONTAINER_EXTENTS ||
35151 + uf_info->container == UF_CONTAINER_EMPTY));
35152 + if (uf_info->container == UF_CONTAINER_TAILS) {
35153 + /*
35154 + * invalidate all pages and convert file from tails to
35155 + * extents
35156 + */
35157 + result = check_pages_unix_file(file, inode);
35158 + if (result) {
35159 + drop_exclusive_access(uf_info);
35160 + reiser4_exit_context(ctx);
35161 + return result;
35162 + }
35163 + }
35164 + }
35165 +
35166 + /*
35167 + * generic_file_mmap will do update_atime. Grab space for stat data
35168 + * update.
35169 + */
35170 + needed = inode_file_plugin(inode)->estimate.update(inode);
35171 + result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
35172 + if (result) {
35173 + drop_exclusive_access(uf_info);
35174 + reiser4_exit_context(ctx);
35175 + return result;
35176 + }
35177 +
35178 + result = generic_file_mmap(file, vma);
35179 + if (result == 0) {
35180 + /* mark file as having mapping. */
35181 + reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
35182 + }
35183 +
35184 + drop_exclusive_access(uf_info);
35185 + reiser4_exit_context(ctx);
35186 + return result;
35187 +}
35188 +
35189 +/**
35190 + * find_first_item
35191 + * @inode:
35192 + *
35193 + * Finds file item which is responsible for first byte in the file.
35194 + */
35195 +static int find_first_item(struct inode *inode)
35196 +{
35197 + coord_t coord;
35198 + lock_handle lh;
35199 + reiser4_key key;
35200 + int result;
35201 +
35202 + coord_init_zero(&coord);
35203 + init_lh(&lh);
35204 + inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
35205 + result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
35206 + inode);
35207 + if (result == CBK_COORD_FOUND) {
35208 + if (coord.between == AT_UNIT) {
35209 + result = zload(coord.node);
35210 + if (result == 0) {
35211 + result = item_id_by_coord(&coord);
35212 + zrelse(coord.node);
35213 + if (result != EXTENT_POINTER_ID &&
35214 + result != FORMATTING_ID)
35215 + result = RETERR(-EIO);
35216 + }
35217 + } else
35218 + result = RETERR(-EIO);
35219 + }
35220 + done_lh(&lh);
35221 + return result;
35222 +}
35223 +
35224 +/**
35225 + * open_unix_file
35226 + * @inode:
35227 + * @file:
35228 + *
35229 + * If the filesystem is not read-only, complete an interrupted tail
35230 + * conversion if there was one
35231 + */
35232 +int open_unix_file(struct inode *inode, struct file *file)
35233 +{
35234 + int result;
35235 + reiser4_context *ctx;
35236 + struct unix_file_info *uf_info;
35237 +
35238 + if (IS_RDONLY(inode))
35239 + return 0;
35240 +
35241 + if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
35242 + return 0;
35243 +
35244 + ctx = reiser4_init_context(inode->i_sb);
35245 + if (IS_ERR(ctx))
35246 + return PTR_ERR(ctx);
35247 +
35248 + uf_info = unix_file_inode_data(inode);
35249 +
35250 + get_exclusive_access_careful(uf_info, inode);
35251 +
35252 + if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
35253 + /*
35254 + * other process completed the conversion
35255 + */
35256 + drop_exclusive_access(uf_info);
35257 + reiser4_exit_context(ctx);
35258 + return 0;
35259 + }
35260 +
35261 + /*
35262 + * The file was left in a semi-converted state after an unclean shutdown,
35263 + * or another thread is doing the conversion and dropped exclusive access
35264 + * while balancing dirty pages. Complete the conversion
35265 + */
35266 + result = find_first_item(inode);
35267 + if (result == EXTENT_POINTER_ID)
35268 + /*
35269 + * first item is extent, therefore there was incomplete
35270 + * tail2extent conversion. Complete it
35271 + */
35272 + result = tail2extent(unix_file_inode_data(inode));
35273 + else if (result == FORMATTING_ID)
35274 + /*
35275 + * first item is formatting item, therefore there was
35276 + * incomplete extent2tail conversion. Complete it
35277 + */
35278 + result = extent2tail(file, unix_file_inode_data(inode));
35279 + else
35280 + result = -EIO;
35281 +
35282 + assert("vs-1712",
35283 + ergo(result == 0,
35284 + (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
35285 + !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
35286 + drop_exclusive_access(uf_info);
35287 + reiser4_exit_context(ctx);
35288 + return result;
35289 +}
35290 +
35291 +#define NEITHER_OBTAINED 0
35292 +#define EA_OBTAINED 1
35293 +#define NEA_OBTAINED 2
35294 +
35295 +static void drop_access(struct unix_file_info *uf_info)
35296 +{
35297 + if (uf_info->exclusive_use)
35298 + drop_exclusive_access(uf_info);
35299 + else
35300 + drop_nonexclusive_access(uf_info);
35301 +}
35302 +
35303 +#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
35304 + __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
35305 +
35306 +/**
35307 + * write_unix_file - private ->write() method of unix_file plugin.
35308 + *
35309 + * @file: file to write to
35310 + * @buf: address of user-space buffer
35311 + * @count: number of bytes to write
35312 + * @pos: position in file to write to
35313 + * @cont: unused argument, as we don't perform plugin conversion when being
35314 + * managed by unix_file plugin.
35315 + */
35316 +ssize_t write_unix_file(struct file *file,
35317 + const char __user *buf,
35318 + size_t count, loff_t *pos,
35319 + struct dispatch_context *cont)
35320 +{
35321 + int result;
35322 + reiser4_context *ctx;
35323 + struct inode *inode;
35324 + struct unix_file_info *uf_info;
35325 + ssize_t written;
35326 + int try_free_space;
35327 + int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
35328 + size_t left;
35329 + ssize_t (*write_op)(struct file *, struct inode *,
35330 + const char __user *, size_t,
35331 + loff_t *pos);
35332 + int ea;
35333 + loff_t new_size;
35334 +
35335 + ctx = get_current_context();
35336 + inode = file->f_dentry->d_inode;
35337 +
35338 + assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
35339 + assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
35340 +
35341 + /* check amount of bytes to write and writing position */
35342 + result = generic_write_checks(file, pos, &count, 0);
35343 + if (result) {
35344 + context_set_commit_async(ctx);
35345 + return result;
35346 + }
35347 +
35348 + result = file_remove_suid(file);
35349 + if (result) {
35350 + context_set_commit_async(ctx);
35351 + return result;
35352 + }
35353 + /* remove_suid might create a transaction */
35354 + reiser4_txn_restart(ctx);
35355 +
35356 + uf_info = unix_file_inode_data(inode);
35357 +
35358 + current->backing_dev_info = inode->i_mapping->backing_dev_info;
35359 + written = 0;
35360 + try_free_space = 0;
35361 + left = count;
35362 + ea = NEITHER_OBTAINED;
35363 +
35364 + new_size = i_size_read(inode);
35365 + if (*pos + count > new_size)
35366 + new_size = *pos + count;
35367 +
35368 + while (left) {
35369 + if (left < to_write)
35370 + to_write = left;
35371 +
35372 + if (uf_info->container == UF_CONTAINER_EMPTY) {
35373 + get_exclusive_access(uf_info);
35374 + ea = EA_OBTAINED;
35375 + if (uf_info->container != UF_CONTAINER_EMPTY) {
35376 + /* file is made not empty by another process */
35377 + drop_exclusive_access(uf_info);
35378 + ea = NEITHER_OBTAINED;
35379 + continue;
35380 + }
35381 + } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35382 + /*
35383 + * get exclusive access directly just to not have to
35384 + * re-obtain it if file will appear empty
35385 + */
35386 + get_exclusive_access(uf_info);
35387 + ea = EA_OBTAINED;
35388 + result = find_file_state(inode, uf_info);
35389 + if (result) {
35390 + drop_exclusive_access(uf_info);
35391 + ea = NEITHER_OBTAINED;
35392 + break;
35393 + }
35394 + } else {
35395 + get_nonexclusive_access(uf_info);
35396 + ea = NEA_OBTAINED;
35397 + }
35398 +
35399 + /* either EA or NEA is obtained. Choose item write method */
35400 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
35401 + /* file is built of extent items */
35402 + write_op = reiser4_write_extent;
35403 + } else if (uf_info->container == UF_CONTAINER_EMPTY) {
35404 + /* file is empty */
35405 + if (should_have_notail(uf_info, new_size))
35406 + write_op = reiser4_write_extent;
35407 + else
35408 + write_op = reiser4_write_tail;
35409 + } else {
35410 + /* file is built of tail items */
35411 + if (should_have_notail(uf_info, new_size)) {
35412 + if (ea == NEA_OBTAINED) {
35413 + drop_nonexclusive_access(uf_info);
35414 + get_exclusive_access(uf_info);
35415 + ea = EA_OBTAINED;
35416 + }
35417 + if (uf_info->container == UF_CONTAINER_TAILS) {
35418 + /*
35419 + * if the file is being converted by another
35420 + * process, wait until it completes
35421 + */
35422 + while (1) {
35423 + if (reiser4_inode_get_flag(inode,
35424 + REISER4_PART_IN_CONV)) {
35425 + drop_exclusive_access(uf_info);
35426 + schedule();
35427 + get_exclusive_access(uf_info);
35428 + continue;
35429 + }
35430 + break;
35431 + }
35432 + if (uf_info->container == UF_CONTAINER_TAILS) {
35433 + result = tail2extent(uf_info);
35434 + if (result) {
35435 + drop_exclusive_access(uf_info);
35436 + context_set_commit_async(ctx);
35437 + break;
35438 + }
35439 + }
35440 + }
35441 + drop_exclusive_access(uf_info);
35442 + ea = NEITHER_OBTAINED;
35443 + continue;
35444 + }
35445 + write_op = reiser4_write_tail;
35446 + }
35447 +
35448 + written = write_op(file, inode, buf, to_write, pos);
35449 + if (written == -ENOSPC && try_free_space) {
35450 + drop_access(uf_info);
35451 + txnmgr_force_commit_all(inode->i_sb, 0);
35452 + try_free_space = 0;
35453 + continue;
35454 + }
35455 + if (written < 0) {
35456 + drop_access(uf_info);
35457 + result = written;
35458 + break;
35459 + }
35460 + /* something is written. */
35461 + if (uf_info->container == UF_CONTAINER_EMPTY) {
35462 + assert("edward-1553", ea == EA_OBTAINED);
35463 + uf_info->container =
35464 + (write_op == reiser4_write_extent) ?
35465 + UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
35466 + } else {
35467 + assert("edward-1554", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
35468 + write_op == reiser4_write_extent));
35469 + assert("edward-1555", ergo(uf_info->container == UF_CONTAINER_TAILS,
35470 + write_op == reiser4_write_tail));
35471 + }
35472 + if (*pos + written > inode->i_size)
35473 + INODE_SET_FIELD(inode, i_size, *pos + written);
35474 + file_update_time(file);
35475 + result = reiser4_update_sd(inode);
35476 + if (result) {
35477 + current->backing_dev_info = NULL;
35478 + drop_access(uf_info);
35479 + context_set_commit_async(ctx);
35480 + break;
35481 + }
35482 + drop_access(uf_info);
35483 + ea = NEITHER_OBTAINED;
35484 +
35485 + /*
35486 + * tell VM how many pages were dirtied. Maybe the number of pages
35487 + * which were already dirty should not be counted
35488 + */
35489 + reiser4_throttle_write(inode,
35490 + (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
35491 + left -= written;
35492 + buf += written;
35493 + *pos += written;
35494 + }
35495 + if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
35496 + reiser4_txn_restart_current();
35497 + grab_space_enable();
35498 + result = reiser4_sync_file_common(file, file->f_dentry,
35499 + 0 /* data and stat data */);
35500 + if (result)
35501 + warning("reiser4-7", "failed to sync file %llu",
35502 + (unsigned long long)get_inode_oid(inode));
35503 + }
35504 +
35505 + current->backing_dev_info = NULL;
35506 +
35507 + /*
35508 + * return number of written bytes or error code if nothing is
35509 + * written. Note that this does not work correctly when
35510 + * sync_unix_file returns an error
35511 + */
35512 + return (count - left) ? (count - left) : result;
35513 +}
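
The central synchronization point of write_unix_file() is the access upgrade: NEA cannot be atomically promoted to EA, so the code drops NEA, takes EA, and then must recheck the container state, since another writer may have converted the file in the unlocked window (hence the re-tests of uf_info->container after every acquisition). A pthread rwlock sketch of that drop-and-recheck pattern — the caller is assumed to hold the lock shared on entry, and the names are illustrative:

#include <pthread.h>

static pthread_rwlock_t latch = PTHREAD_RWLOCK_INITIALIZER;
static int container_tails = 1;	/* 1 = tails, 0 = extents */

static int convert_to_extents(void) { container_tails = 0; return 0; }

/* precondition: caller holds @latch shared (the NEA analogue);
 * returns with the lock released */
static int ensure_extents(void)
{
	int err = 0;

	pthread_rwlock_unlock(&latch);	/* drop NEA ... */
	pthread_rwlock_wrlock(&latch);	/* ... take EA; NOT atomic */
	if (container_tails)		/* recheck: window was unlocked */
		err = convert_to_extents();
	pthread_rwlock_unlock(&latch);
	return err;
}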
35514 +
35515 +/**
35516 + * release_unix_file - release of struct file_operations
35517 + * @inode: inode of released file
35518 + * @file: file to release
35519 + *
35520 + * Implementation of release method of struct file_operations for unix file
35521 + * plugin. If the last reference to the inode is released, convert all extent
35522 + * items into tail items if necessary. Frees reiser4-specific file data.
35523 + */
35524 +int release_unix_file(struct inode *inode, struct file *file)
35525 +{
35526 + reiser4_context *ctx;
35527 + struct unix_file_info *uf_info;
35528 + int result;
35529 + int in_reiser4;
35530 +
35531 + in_reiser4 = is_in_reiser4_context();
35532 +
35533 + ctx = reiser4_init_context(inode->i_sb);
35534 + if (IS_ERR(ctx))
35535 + return PTR_ERR(ctx);
35536 +
35537 + result = 0;
35538 + if (in_reiser4 == 0) {
35539 + uf_info = unix_file_inode_data(inode);
35540 +
35541 + get_exclusive_access_careful(uf_info, inode);
35542 + if (atomic_read(&file->f_dentry->d_count) == 1 &&
35543 + uf_info->container == UF_CONTAINER_EXTENTS &&
35544 + !should_have_notail(uf_info, inode->i_size) &&
35545 + !rofs_inode(inode)) {
35546 + result = extent2tail(file, uf_info);
35547 + if (result != 0) {
35548 + context_set_commit_async(ctx);
35549 + warning("nikita-3233",
35550 + "Failed (%d) to convert in %s (%llu)",
35551 + result, __FUNCTION__,
35552 + (unsigned long long)
35553 + get_inode_oid(inode));
35554 + }
35555 + }
35556 + drop_exclusive_access(uf_info);
35557 + } else {
35558 + /*
35559 + we are within reiser4 context already. How is that
35560 + possible? Simple:
35561 +
35562 + (gdb) bt
35563 + #0 get_exclusive_access ()
35564 + #2 0xc01e56d3 in release_unix_file ()
35565 + #3 0xc01c3643 in reiser4_release ()
35566 + #4 0xc014cae0 in __fput ()
35567 + #5 0xc013ffc3 in remove_vm_struct ()
35568 + #6 0xc0141786 in exit_mmap ()
35569 + #7 0xc0118480 in mmput ()
35570 + #8 0xc0133205 in oom_kill ()
35571 + #9 0xc01332d1 in out_of_memory ()
35572 + #10 0xc013bc1d in try_to_free_pages ()
35573 + #11 0xc013427b in __alloc_pages ()
35574 + #12 0xc013f058 in do_anonymous_page ()
35575 + #13 0xc013f19d in do_no_page ()
35576 + #14 0xc013f60e in handle_mm_fault ()
35577 + #15 0xc01131e5 in do_page_fault ()
35578 + #16 0xc0104935 in error_code ()
35579 + #17 0xc025c0c6 in __copy_to_user_ll ()
35580 + #18 0xc01d496f in reiser4_read_tail ()
35581 + #19 0xc01e4def in read_unix_file ()
35582 + #20 0xc01c3504 in reiser4_read ()
35583 + #21 0xc014bd4f in vfs_read ()
35584 + #22 0xc014bf66 in sys_read ()
35585 + */
35586 + warning("vs-44", "out of memory?");
35587 + }
35588 +
35589 + reiser4_free_file_fsdata(file);
35590 +
35591 + reiser4_exit_context(ctx);
35592 + return result;
35593 +}
35594 +
35595 +static void set_file_notail(struct inode *inode)
35596 +{
35597 + reiser4_inode *state;
35598 + formatting_plugin *tplug;
35599 +
35600 + state = reiser4_inode_data(inode);
35601 + tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
35602 + force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
35603 +}
35604 +
35605 +/* if file is built of tails - convert it to extents */
35606 +static int unpack(struct file *filp, struct inode *inode, int forever)
35607 +{
35608 + int result = 0;
35609 + struct unix_file_info *uf_info;
35610 +
35611 + uf_info = unix_file_inode_data(inode);
35612 + assert("vs-1628", ea_obtained(uf_info));
35613 +
35614 + result = find_file_state(inode, uf_info);
35615 + if (result)
35616 + return result;
35617 + assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
35618 +
35619 + if (uf_info->container == UF_CONTAINER_TAILS) {
35620 + /*
35621 + * if the file is being converted by another process - wait until it
35622 + * completes
35623 + */
35624 + while (1) {
35625 + if (reiser4_inode_get_flag(inode,
35626 + REISER4_PART_IN_CONV)) {
35627 + drop_exclusive_access(uf_info);
35628 + schedule();
35629 + get_exclusive_access(uf_info);
35630 + continue;
35631 + }
35632 + break;
35633 + }
35634 + if (uf_info->container == UF_CONTAINER_TAILS) {
35635 + result = tail2extent(uf_info);
35636 + if (result)
35637 + return result;
35638 + }
35639 + }
35640 + if (forever) {
35641 + /* save new formatting plugin in stat data */
35642 + __u64 tograb;
35643 +
35644 + set_file_notail(inode);
35645 +
35646 + grab_space_enable();
35647 + tograb = inode_file_plugin(inode)->estimate.update(inode);
35648 + result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
35649 + if (result == 0) result = reiser4_update_sd(inode); /* keep grab error */
35650 + }
35651 +
35652 + return result;
35653 +}
35654 +
35655 +/* implementation of vfs's ioctl method of struct file_operations for unix file
35656 + plugin
35657 +*/
35658 +int
35659 +ioctl_unix_file(struct inode *inode, struct file *filp,
35660 + unsigned int cmd, unsigned long arg UNUSED_ARG)
35661 +{
35662 + reiser4_context *ctx;
35663 + int result;
35664 +
35665 + ctx = reiser4_init_context(inode->i_sb);
35666 + if (IS_ERR(ctx))
35667 + return PTR_ERR(ctx);
35668 +
35669 + switch (cmd) {
35670 + case REISER4_IOC_UNPACK:
35671 + get_exclusive_access(unix_file_inode_data(inode));
35672 + result = unpack(filp, inode, 1 /* forever */ );
35673 + drop_exclusive_access(unix_file_inode_data(inode));
35674 + break;
35675 +
35676 + default:
35677 + result = RETERR(-ENOSYS);
35678 + break;
35679 + }
35680 + reiser4_exit_context(ctx);
35681 + return result;
35682 +}
35683 +
35684 +/* implementation of vfs's bmap method of struct address_space_operations for unix
35685 + file plugin
35686 +*/
35687 +sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
35688 +{
35689 + reiser4_context *ctx;
35690 + sector_t result;
35691 + reiser4_key key;
35692 + coord_t coord;
35693 + lock_handle lh;
35694 + struct inode *inode;
35695 + item_plugin *iplug;
35696 + sector_t block;
35697 +
35698 + inode = mapping->host;
35699 +
35700 + ctx = reiser4_init_context(inode->i_sb);
35701 + if (IS_ERR(ctx))
35702 + return PTR_ERR(ctx);
35703 + key_by_inode_and_offset_common(inode,
35704 + (loff_t) lblock * current_blocksize,
35705 + &key);
35706 +
35707 + init_lh(&lh);
35708 + result =
35709 + find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
35710 + if (cbk_errored(result)) {
35711 + done_lh(&lh);
35712 + reiser4_exit_context(ctx);
35713 + return result;
35714 + }
35715 +
35716 + result = zload(coord.node);
35717 + if (result) {
35718 + done_lh(&lh);
35719 + reiser4_exit_context(ctx);
35720 + return result;
35721 + }
35722 +
35723 + iplug = item_plugin_by_coord(&coord);
35724 + if (iplug->s.file.get_block) {
35725 + result = iplug->s.file.get_block(&coord, lblock, &block);
35726 + if (result == 0)
35727 + result = block;
35728 + } else
35729 + result = RETERR(-EINVAL);
35730 +
35731 + zrelse(coord.node);
35732 + done_lh(&lh);
35733 + reiser4_exit_context(ctx);
35734 + return result;
35735 +}
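
->bmap is what the FIBMAP ioctl ends up calling, so the path above can be probed from user space with a few lines of C (runnable, usually as root; the file must live on a filesystem that implements bmap):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	int fd, block = 0;	/* in: logical block, out: physical block */

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, FIBMAP, &block) != 0) {
		perror("FIBMAP");
		return 1;
	}
	printf("logical 0 -> physical %d\n", block);
	close(fd);
	return 0;
}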
35736 +
35737 +/**
35738 + * flow_by_inode_unix_file - initialize structure flow
35739 + * @inode: inode of file for which read or write is about to be done
35740 + * @buf: buffer to perform read to or write from
35741 + * @user: flag showing whether @buf is user space or kernel space
35742 + * @size: size of buffer @buf
35743 + * @off: start offset for read or write
35744 + * @op: READ or WRITE
35745 + * @flow:
35746 + *
35747 + * Initializes fields of @flow: key, size of data, i/o mode (read or write).
35748 + */
35749 +int flow_by_inode_unix_file(struct inode *inode,
35750 + const char __user *buf, int user,
35751 + loff_t size, loff_t off,
35752 + rw_op op, flow_t *flow)
35753 +{
35754 + assert("nikita-1100", inode != NULL);
35755 +
35756 + flow->length = size;
35757 + memcpy(&flow->data, &buf, sizeof(buf));
35758 + flow->user = user;
35759 + flow->op = op;
35760 + assert("nikita-1931", inode_file_plugin(inode) != NULL);
35761 + assert("nikita-1932",
35762 + inode_file_plugin(inode)->key_by_inode ==
35763 + key_by_inode_and_offset_common);
35764 + /* calculate key of write position and insert it into flow->key */
35765 + return key_by_inode_and_offset_common(inode, off, &flow->key);
35766 +}
35767 +
35768 +/* plugin->u.file.set_plug_in_sd = NULL
35769 + plugin->u.file.set_plug_in_inode = NULL
35770 + plugin->u.file.create_blank_sd = NULL */
35771 +/* plugin->u.file.delete */
35772 +/*
35773 + plugin->u.file.add_link = reiser4_add_link_common
35774 + plugin->u.file.rem_link = NULL */
35775 +
35776 +/* plugin->u.file.owns_item
35777 + this is common_file_owns_item with assertion */
35778 +/* Audited by: green(2002.06.15) */
35779 +int
35780 +owns_item_unix_file(const struct inode *inode /* object to check against */ ,
35781 + const coord_t * coord /* coord to check */ )
35782 +{
35783 + int result;
35784 +
35785 + result = owns_item_common(inode, coord);
35786 + if (!result)
35787 + return 0;
35788 + if (!plugin_of_group(item_plugin_by_coord(coord),
35789 + UNIX_FILE_METADATA_ITEM_TYPE))
35790 + return 0;
35791 + assert("vs-547",
35792 + item_id_by_coord(coord) == EXTENT_POINTER_ID ||
35793 + item_id_by_coord(coord) == FORMATTING_ID);
35794 + return 1;
35795 +}
35796 +
35797 +static int setattr_truncate(struct inode *inode, struct iattr *attr)
35798 +{
35799 + int result;
35800 + int s_result;
35801 + loff_t old_size;
35802 + reiser4_tree *tree;
35803 +
35804 + inode_check_scale(inode, inode->i_size, attr->ia_size);
35805 +
35806 + old_size = inode->i_size;
35807 + tree = reiser4_tree_by_inode(inode);
35808 +
35809 + result = safe_link_grab(tree, BA_CAN_COMMIT);
35810 + if (result == 0)
35811 + result = safe_link_add(inode, SAFE_TRUNCATE);
35812 + if (result == 0)
35813 + result = truncate_file_body(inode, attr);
35814 + if (result)
35815 + warning("vs-1588", "truncate_file failed: oid %lli, "
35816 + "old size %lld, new size %lld, retval %d",
35817 + (unsigned long long)get_inode_oid(inode),
35818 + old_size, attr->ia_size, result);
35819 +
35820 + s_result = safe_link_grab(tree, BA_CAN_COMMIT);
35821 + if (s_result == 0)
35822 + s_result =
35823 + safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
35824 + if (s_result != 0) {
35825 + warning("nikita-3417", "Cannot kill safelink %lli: %i",
35826 + (unsigned long long)get_inode_oid(inode), s_result);
35827 + }
35828 + safe_link_release(tree);
35829 + return result;
35830 +}
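
The safe-link calls bracket the truncate with a crash-recoverable intent record: register SAFE_TRUNCATE, do the work, delete the record, so a mount after a crash can finish the interrupted truncate. A generic write-ahead-intent sketch of the same bracket, using an ordinary file as the (purely illustrative) intent log:

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* record intent durably; a file stands in for the safe-link item */
static int log_intent(const char *op)
{
	FILE *f = fopen("intent.log", "w");

	if (f == NULL)
		return -1;
	fprintf(f, "%s\n", op);
	fclose(f);
	return 0;
}

static int truncate_with_intent(const char *path, off_t len)
{
	int err;

	if (log_intent("truncate") != 0)	/* safe_link_add analogue */
		return -1;
	err = truncate(path, len);	/* a crash here is recoverable:
					 * recovery replays the intent */
	unlink("intent.log");		/* safe_link_del analogue */
	return err;
}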
35831 +
35832 +/* plugin->u.file.setattr method */
35833 +/* This calls inode_setattr and if truncate is in effect it also takes
35834 + exclusive inode access to avoid races */
35835 +int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
35836 + struct iattr *attr /* change description */ )
35837 +{
35838 + int result;
35839 +
35840 + if (attr->ia_valid & ATTR_SIZE) {
35841 + reiser4_context *ctx;
35842 + struct unix_file_info *uf_info;
35843 +
35844 + /* truncate does reservation itself and requires exclusive
35845 + access obtained */
35846 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
35847 + if (IS_ERR(ctx))
35848 + return PTR_ERR(ctx);
35849 +
35850 + uf_info = unix_file_inode_data(dentry->d_inode);
35851 + get_exclusive_access_careful(uf_info, dentry->d_inode);
35852 + result = setattr_truncate(dentry->d_inode, attr);
35853 + drop_exclusive_access(uf_info);
35854 + context_set_commit_async(ctx);
35855 + reiser4_exit_context(ctx);
35856 + } else
35857 + result = reiser4_setattr_common(dentry, attr);
35858 +
35859 + return result;
35860 +}
35861 +
35862 +/* plugin->u.file.init_inode_data */
35863 +void
35864 +init_inode_data_unix_file(struct inode *inode,
35865 + reiser4_object_create_data * crd, int create)
35866 +{
35867 + struct unix_file_info *data;
35868 +
35869 + data = unix_file_inode_data(inode);
35870 + data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
35871 + init_rwsem(&data->latch);
35872 + data->tplug = inode_formatting_plugin(inode);
35873 + data->exclusive_use = 0;
35874 +
35875 +#if REISER4_DEBUG
35876 + data->ea_owner = NULL;
35877 + atomic_set(&data->nr_neas, 0);
35878 +#endif
35879 + init_inode_ordering(inode, crd, create);
35880 +}
35881 +
35882 +/**
35883 + * delete_object_unix_file - delete_object of file_plugin
35884 + * @inode: inode to be deleted
35885 + *
35886 + * Truncates file to length 0, removes stat data and safe link.
35887 + */
35888 +int delete_object_unix_file(struct inode *inode)
35889 +{
35890 + struct unix_file_info *uf_info;
35891 + int result;
35892 +
35893 + if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
35894 + return 0;
35895 +
35896 + /* truncate file body first */
35897 + uf_info = unix_file_inode_data(inode);
35898 + get_exclusive_access(uf_info);
35899 + result = shorten_file(inode, 0 /* size */ );
35900 + drop_exclusive_access(uf_info);
35901 +
35902 + if (result)
35903 + warning("edward-1556",
35904 + "failed to truncate file (%llu) on removal: %d",
35905 + get_inode_oid(inode), result);
35906 +
35907 + /* remove stat data and safe link */
35908 + return reiser4_delete_object_common(inode);
35909 +}
35910 +
35911 +/* plugin->write_begin() */
35912 +int write_begin_unix_file(struct file *file, struct page *page,
35913 + unsigned from, unsigned to)
35914 +{
35915 + int ret;
35916 + struct unix_file_info *info;
35917 +
35918 + info = unix_file_inode_data(file->f_dentry->d_inode);
35919 + get_exclusive_access(info);
35920 + ret = find_file_state(file->f_dentry->d_inode, info);
35921 + if (likely(ret == 0)) {
35922 + if (info->container == UF_CONTAINER_TAILS)
35923 + ret = -EINVAL;
35924 + else
35925 + ret = do_prepare_write(file, page, from, to);
35926 + }
35927 + drop_exclusive_access(info);
35928 + return ret;
35929 +}
35930 +
35931 +/*
35932 + * Local variables:
35933 + * c-indentation-style: "K&R"
35934 + * mode-name: "LC"
35935 + * c-basic-offset: 8
35936 + * tab-width: 8
35937 + * fill-column: 79
35938 + * scroll-step: 1
35939 + * End:
35940 + */
35941 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.33/fs/reiser4/plugin/file/file_conversion.c
35942 --- linux-2.6.33.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 01:00:00.000000000 +0100
35943 +++ linux-2.6.33/fs/reiser4/plugin/file/file_conversion.c 2010-03-04 19:33:22.000000000 +0100
35944 @@ -0,0 +1,747 @@
35945 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
35946 + licensing governed by reiser4/README */
35947 +
35948 +/**
35949 + * This file contains dispatching hooks, and conversion methods, which
35950 + * implement transitions in the FILE interface.
35951 + *
35952 + * A dispatching hook makes a decision (at the dispatching point) about
35953 + * the most reasonable plugin. The decision is made in accordance with
35954 + * an O(1)-heuristic.
35955 + *
35956 + * We implement a transition CRYPTCOMPRESS -> UNIX_FILE for files with
35957 + * incompressible data. The current heuristic to estimate compressibility
35958 + * is very simple: if the first complete logical cluster (64K by default)
35959 + * of a file is incompressible, then we decide that the whole file is
35960 + * incompressible.
35961 + *
35962 + * To enable dispatching we install a special "magic" compression mode
35963 + * plugin CONVX_COMPRESSION_MODE_ID at file creation time.
35964 + *
35965 + * Note that we don't perform the back conversion
35966 + * (UNIX_FILE -> CRYPTCOMPRESS) for compatibility reasons.
35967 + *
35968 + * At conversion time we protect CS, the conversion set (the file's
35969 + * (meta)data and its plugin table (pset)), via a special per-inode
35970 + * rw-semaphore (conv_sem). The methods which implement conversion are
35971 + * CS writers. The methods of the FS interface (file_operations,
35972 + * inode_operations, address_space_operations) are CS readers.
35973 + */
35974 +
35975 +#include "../../inode.h"
35976 +#include "../cluster.h"
35977 +#include "file.h"
35978 +
35979 +#define conversion_enabled(inode) \
35980 + (inode_compression_mode_plugin(inode) == \
35981 + compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
35982 +
35983 +/**
35984 + * Locked sections (readers and writers of @pset) are not permanently
35985 + * critical: a cryptcompress file can be converted only if the conversion
35986 + * is enabled (see the macro above). Also we don't perform the back
35987 + * conversion. The following helper macro is a sanity check to decide
35988 + * whether we need the protection (locks are always additional overhead).
35989 + */
35990 +#define should_protect(inode) \
35991 + (inode_file_plugin(inode) == \
35992 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
35993 + conversion_enabled(inode))
35994 +/**
35995 + * To avoid confusion with read/write file operations, we'll speak about
35996 + * "passive" protection for CS readers and "active" protection for CS
35997 + * writers. All methods with active or passive protection have the
35998 + * suffix "careful".
35999 + */
36000 +/**
36001 + * Macros for passive protection.
36002 + *
36003 + * Construct an invariant operation to be supplied to the VFS.
36004 + * The macro accepts the following arguments:
36005 + * @type - type of the value represented by the compound statement;
36006 + * @method - name of an operation to be supplied to VFS (reiser4 file
36007 + * plugin also should contain a method with such name).
36008 + */
36009 +#define PROT_PASSIVE(type, method, args) \
36010 +({ \
36011 + type _result; \
36012 + struct rw_semaphore * guard = \
36013 + &reiser4_inode_data(inode)->conv_sem; \
36014 + \
36015 + if (should_protect(inode)) { \
36016 + down_read(guard); \
36017 + if (!should_protect(inode)) \
36018 + up_read(guard); \
36019 + } \
36020 + _result = inode_file_plugin(inode)->method args; \
36021 + if (should_protect(inode)) \
36022 + up_read(guard); \
36023 + _result; \
36024 +})
36025 +
36026 +#define PROT_PASSIVE_VOID(method, args) \
36027 +({ \
36028 + struct rw_semaphore * guard = \
36029 + &reiser4_inode_data(inode)->conv_sem; \
36030 + \
36031 + if (should_protect(inode)) { \
36032 + down_read(guard); \
36033 + if (!should_protect(inode)) \
36034 + up_read(guard); \
36035 + } \
36036 + inode_file_plugin(inode)->method args; \
36037 + \
36038 + if (should_protect(inode)) \
36039 + up_read(guard); \
36040 +})
36041 +
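The statement-expression macros above are easier to follow in a plain user-space model. Below is a minimal sketch of the passive-protection idea using a POSIX rwlock (build with cc -pthread); conv_sem, should_protect() and plugin_method() are illustrative stand-ins, and an explicit "locked" flag replaces the macro's final should_protect() re-check, which is safe in the kernel only because the conversion is one-way:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t conv_sem = PTHREAD_RWLOCK_INITIALIZER;
static int conversion_pending = 1;      /* models should_protect(inode) */

static int should_protect(void) { return conversion_pending; }
static int plugin_method(void) { return 42; }   /* models ->method args */

static int prot_passive_call(void)
{
        int locked = 0;
        int result;

        if (should_protect()) {
                pthread_rwlock_rdlock(&conv_sem);
                if (!should_protect())  /* converted while we waited */
                        pthread_rwlock_unlock(&conv_sem);
                else
                        locked = 1;
        }
        result = plugin_method();       /* runs read-locked iff the
                                         * protection is still needed */
        if (locked)
                pthread_rwlock_unlock(&conv_sem);
        return result;
}

int main(void)
{
        printf("%d\n", prot_passive_call());
        return 0;
}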
36042 +/* Pass management to the unix-file plugin with "notail" policy */
36043 +static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
36044 +{
36045 + int result;
36046 + reiser4_inode *info;
36047 + struct unix_file_info * uf;
36048 + info = reiser4_inode_data(inode);
36049 +
36050 + result = aset_set_unsafe(&info->pset,
36051 + PSET_FILE,
36052 + (reiser4_plugin *)
36053 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
36054 + if (result)
36055 + return result;
36056 + result = aset_set_unsafe(&info->pset,
36057 + PSET_FORMATTING,
36058 + (reiser4_plugin *)
36059 + formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
36060 + if (result)
36061 + return result;
36062 + /* get rid of non-standard plugins */
36063 + info->plugin_mask &= ~cryptcompress_mask;
36064 + /* get rid of plugin stat-data extension */
36065 + info->extmask &= ~(1 << PLUGIN_STAT);
36066 +
36067 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
36068 +
36069 + /* FIXME use init_inode_data_unix_file() instead,
36070 + but avoid init_inode_ordering() */
36071 + /* Init unix-file specific part of inode */
36072 + uf = unix_file_inode_data(inode);
36073 + uf->container = UF_CONTAINER_UNKNOWN;
36074 + init_rwsem(&uf->latch);
36075 + uf->tplug = inode_formatting_plugin(inode);
36076 + uf->exclusive_use = 0;
36077 +#if REISER4_DEBUG
36078 + uf->ea_owner = NULL;
36079 + atomic_set(&uf->nr_neas, 0);
36080 +#endif
36081 + /**
36082 + * we were careful to keep file_ops, inode_ops and as_ops
36083 + * invariant across plugin conversion, so there is
36084 + * no need to update the ones already installed
36085 + * in the VFS.
36086 + */
36087 + return 0;
36088 +}
36089 +
36090 +#if REISER4_DEBUG
36091 +static int disabled_conversion_inode_ok(struct inode * inode)
36092 +{
36093 + __u64 extmask = reiser4_inode_data(inode)->extmask;
36094 + __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
36095 +
36096 + return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
36097 + (extmask & (1 << UNIX_STAT)) &&
36098 + (extmask & (1 << LARGE_TIMES_STAT)) &&
36099 + (extmask & (1 << PLUGIN_STAT)) &&
36100 + (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
36101 +}
36102 +#endif
36103 +
36104 +/**
36105 + * Disable future attempts to schedule/convert file plugin.
36106 + * This function is called by plugin schedule hooks.
36107 + *
36108 + * To disable conversion we assign any compression mode plugin id
36109 + * different from CONVX_COMPRESSION_MODE_ID.
36110 + */
36111 +static int disable_conversion(struct inode * inode)
36112 +{
36113 + int result;
36114 + result =
36115 + force_plugin_pset(inode,
36116 + PSET_COMPRESSION_MODE,
36117 + (reiser4_plugin *)compression_mode_plugin_by_id
36118 + (LATTD_COMPRESSION_MODE_ID));
36119 + assert("edward-1500",
36120 + ergo(!result, disabled_conversion_inode_ok(inode)));
36121 + return result;
36122 +}
36123 +
36124 +/**
36125 + * Check if we really have achieved plugin scheduling point
36126 + */
36127 +static int check_dispatch_point(struct inode * inode,
36128 + loff_t pos /* position in the
36129 + file to write from */,
36130 + struct cluster_handle * clust,
36131 + struct dispatch_context * cont)
36132 +{
36133 + assert("edward-1505", conversion_enabled(inode));
36134 + /*
36135 + * if the file size exceeds the cluster size, then the compressibility
36136 + * status must already have been figured out (i.e. compression was
36137 + * disabled, or the file plugin was converted to unix_file)
36138 + */
36139 + assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
36140 +
36141 + if (pos > inode->i_size)
36142 + /* first logical cluster will contain a (partial) hole */
36143 + return disable_conversion(inode);
36144 + if (pos < inode_cluster_size(inode))
36145 + /* writing to the first logical cluster */
36146 + return 0;
36147 + /*
36148 + * here we have:
36149 + * cluster_size <= pos <= i_size <= cluster_size,
36150 + * and, hence, pos == i_size == cluster_size
36151 + */
36152 + assert("edward-1498",
36153 + pos == inode->i_size &&
36154 + pos == inode_cluster_size(inode));
36155 + assert("edward-1539", cont != NULL);
36156 + assert("edward-1540", cont->state == DISPATCH_INVAL_STATE);
36157 +
36158 + cont->state = DISPATCH_POINT;
36159 + return 0;
36160 +}
36161 +
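To make this case analysis concrete, here is a toy user-space model of the three outcomes; the function name and the -1/0/1 return-code convention are invented for illustration:

#include <assert.h>
#include <stdio.h>

/* Toy model of check_dispatch_point(): -1 disable, 0 too early, 1 dispatch. */
static int classify(long pos, long i_size, long cluster)
{
        assert(i_size <= cluster);      /* invariant asserted by the caller */
        if (pos > i_size)
                return -1;              /* write would create a hole: disable */
        if (pos < cluster)
                return 0;               /* still filling the first cluster */
        /* cluster <= pos <= i_size <= cluster  =>  all three are equal */
        assert(pos == i_size && pos == cluster);
        return 1;                       /* first cluster complete: decide now */
}

int main(void)
{
        printf("%d %d %d\n",
               classify(100, 50, 65536),        /* hole */
               classify(10, 50, 65536),         /* too early */
               classify(65536, 65536, 65536));  /* dispatch point */
        return 0;
}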
36162 +static void start_check_compressibility(struct inode * inode,
36163 + struct cluster_handle * clust,
36164 + hint_t * hint)
36165 +{
36166 + assert("edward-1507", clust->index == 1);
36167 + assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
36168 + assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
36169 +
36170 + hint_init_zero(hint);
36171 + clust->hint = hint;
36172 + clust->index--;
36173 + clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
36174 +
36175 + /* first logical cluster (of index #0) must be complete */
36176 + assert("edward-1510", lbytes(clust->index, inode) ==
36177 + inode_cluster_size(inode));
36178 +}
36179 +
36180 +static void finish_check_compressibility(struct inode * inode,
36181 + struct cluster_handle * clust,
36182 + hint_t * hint)
36183 +{
36184 + reiser4_unset_hint(clust->hint);
36185 + clust->hint = hint;
36186 + clust->index++;
36187 +}
36188 +
36189 +#if REISER4_DEBUG
36190 +static int prepped_dclust_ok(hint_t * hint)
36191 +{
36192 + reiser4_key key;
36193 + coord_t * coord = &hint->ext_coord.coord;
36194 +
36195 + item_key_by_coord(coord, &key);
36196 + return (item_id_by_coord(coord) == CTAIL_ID &&
36197 + !coord_is_unprepped_ctail(coord) &&
36198 + (get_key_offset(&key) + nr_units_ctail(coord) ==
36199 + dclust_get_extension_dsize(hint)));
36200 +}
36201 +#endif
36202 +
36203 +#define fifty_percent(size) ((size) >> 1)
36204 +/* evaluation of data compressibility */
36205 +#define data_is_compressible(osize, isize) \
36206 + ((osize) < fifty_percent(isize))
36207 +
36208 +/**
36209 + * A simple O(1)-heuristic for compressibility.
36210 + * This is called no more than once in a file's lifetime.
36211 + * Read first logical cluster (of index #0) and estimate its compressibility.
36212 + * Save estimation result in @cont.
36213 + */
36214 +static int read_check_compressibility(struct inode * inode,
36215 + struct cluster_handle * clust,
36216 + struct dispatch_context * cont)
36217 +{
36218 + int i;
36219 + int result;
36220 + size_t dst_len;
36221 + hint_t tmp_hint;
36222 + hint_t * cur_hint = clust->hint;
36223 + assert("edward-1541", cont->state == DISPATCH_POINT);
36224 +
36225 + start_check_compressibility(inode, clust, &tmp_hint);
36226 +
36227 + reset_cluster_pgset(clust, cluster_nrpages(inode));
36228 + result = grab_page_cluster(inode, clust, READ_OP);
36229 + if (result)
36230 + return result;
36231 + /* Read page cluster here */
36232 + for (i = 0; i < clust->nr_pages; i++) {
36233 + struct page *page = clust->pages[i];
36234 + lock_page(page);
36235 + result = do_readpage_ctail(inode, clust, page,
36236 + ZNODE_READ_LOCK);
36237 + unlock_page(page);
36238 + if (result)
36239 + goto error;
36240 + }
36241 + tfm_cluster_clr_uptodate(&clust->tc);
36242 +
36243 + cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
36244 +
36245 + if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
36246 + /* length of compressed data is known, no need to compress */
36247 + assert("edward-1511",
36248 + znode_is_any_locked(tmp_hint.lh.node));
36249 + assert("edward-1512",
36250 + WITH_DATA(tmp_hint.ext_coord.coord.node,
36251 + prepped_dclust_ok(&tmp_hint)));
36252 + dst_len = dclust_get_extension_dsize(&tmp_hint);
36253 + }
36254 + else {
36255 + struct tfm_cluster * tc = &clust->tc;
36256 + compression_plugin * cplug = inode_compression_plugin(inode);
36257 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
36258 + if (result)
36259 + goto error;
36260 + for (i = 0; i < clust->nr_pages; i++) {
36261 + char *data;
36262 + lock_page(clust->pages[i]);
36263 + BUG_ON(!PageUptodate(clust->pages[i]));
36264 + data = kmap(clust->pages[i]);
36265 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
36266 + data, PAGE_CACHE_SIZE);
36267 + kunmap(clust->pages[i]);
36268 + unlock_page(clust->pages[i]);
36269 + }
36270 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
36271 + if (result)
36272 + goto error;
36273 + result = grab_coa(tc, cplug);
36274 + if (result)
36275 + goto error;
36276 + tc->len = tc->lsize = lbytes(clust->index, inode);
36277 + assert("edward-1513", tc->len == inode_cluster_size(inode));
36278 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
36279 + cplug->compress(get_coa(tc, cplug->h.id, tc->act),
36280 + tfm_input_data(clust), tc->len,
36281 + tfm_output_data(clust), &dst_len);
36282 + assert("edward-1514",
36283 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
36284 + }
36285 + finish_check_compressibility(inode, clust, cur_hint);
36286 + cont->state =
36287 + (data_is_compressible(dst_len, inode_cluster_size(inode)) ?
36288 + DISPATCH_REMAINS_OLD :
36289 + DISPATCH_ASSIGNED_NEW);
36290 + return 0;
36291 + error:
36292 + put_page_cluster(clust, inode, READ_OP);
36293 + return result;
36294 +}
36295 +
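A rough user-space analogue of this heuristic can be written with zlib (link with -lz). Reiser4 of course uses its own compression plugin and transform streams, so this only sketches the 50% test on the first complete cluster; fifty_percent mirrors the macro above:

#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>

#define CLUSTER_SIZE (64 * 1024)        /* default logical cluster size */
#define fifty_percent(size) ((size) >> 1)

/* Return 1 if the first complete cluster of @f compresses below 50%. */
static int first_cluster_is_compressible(FILE *f)
{
        static unsigned char in[CLUSTER_SIZE];
        uLongf out_len = compressBound(CLUSTER_SIZE);
        unsigned char *out = malloc(out_len);
        size_t got = fread(in, 1, CLUSTER_SIZE, f);
        int ok = 0;

        if (out && got == CLUSTER_SIZE &&
            compress(out, &out_len, in, CLUSTER_SIZE) == Z_OK)
                ok = out_len < fifty_percent(CLUSTER_SIZE);
        free(out);
        return ok;
}

int main(int argc, char **argv)
{
        FILE *f = argc > 1 ? fopen(argv[1], "rb") : NULL;

        if (!f)
                return 1;
        printf("%s\n", first_cluster_is_compressible(f) ?
               "keep cryptcompress" : "convert to unix_file");
        fclose(f);
        return 0;
}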
36296 +/* Cut disk cluster of index @idx */
36297 +static int cut_disk_cluster(struct inode * inode, cloff_t idx)
36298 +{
36299 + reiser4_key from, to;
36300 + assert("edward-1515", inode_file_plugin(inode) ==
36301 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
36302 + key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
36303 + to = from;
36304 + set_key_offset(&to,
36305 + get_key_offset(&from) + inode_cluster_size(inode) - 1);
36306 + return reiser4_cut_tree(reiser4_tree_by_inode(inode),
36307 + &from, &to, inode, 0);
36308 +}
36309 +
36310 +static int reserve_cryptcompress2unixfile(struct inode *inode)
36311 +{
36312 + reiser4_block_nr unformatted_nodes;
36313 + reiser4_tree *tree;
36314 +
36315 + tree = reiser4_tree_by_inode(inode);
36316 +
36317 + /* number of unformatted nodes which will be created */
36318 + unformatted_nodes = cluster_nrpages(inode); /* N */
36319 +
36320 + /*
36321 + * space required for one iteration of cryptcompress->unixfile conversion:
36322 + *
36323 + * 1. kill ctail items
36324 + *
36325 + * 2. insert N unformatted nodes
36326 + *
36327 + * 3. insert N (worst-case single-block
36328 + * extents) extent units.
36329 + *
36330 + * 4. drilling to the leaf level by coord_by_key()
36331 + *
36332 + * 5. possible update of stat-data
36333 + *
36334 + */
36335 + grab_space_enable();
36336 + return reiser4_grab_space
36337 + (2 * tree->height +
36338 + unformatted_nodes +
36339 + unformatted_nodes * estimate_one_insert_into_item(tree) +
36340 + 1 + estimate_one_insert_item(tree) +
36341 + inode_file_plugin(inode)->estimate.update(inode),
36342 + BA_CAN_COMMIT);
36343 +}
36344 +
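This reservation is plain arithmetic over per-operation estimates. A back-of-envelope user-space model follows, with hypothetical per-insert costs rather than the real reiser4 estimate_* formulas:

#include <stdio.h>

static unsigned long reserve_estimate(unsigned tree_height,
                                      unsigned long nr_unformatted,
                                      unsigned long ins_into_item,
                                      unsigned long ins_item,
                                      unsigned long sd_update)
{
        return 2UL * tree_height                /* twig/leaf neighbourhood */
             + nr_unformatted                   /* new unformatted nodes */
             + nr_unformatted * ins_into_item   /* 1-block extent units */
             + 1 + ins_item                     /* drilling + item insertion */
             + sd_update;                       /* possible stat-data update */
}

int main(void)
{
        /* e.g. a height-4 tree, a 16-page cluster, toy per-insert costs */
        printf("%lu blocks\n", reserve_estimate(4, 16, 5, 6, 1));
        return 0;
}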
36345 +/**
36346 + * Convert cryptcompress file plugin to unix_file plugin.
36347 + */
36348 +static int cryptcompress2unixfile(struct file *file, struct inode *inode,
36349 + struct dispatch_context *cont)
36350 +{
36351 + int i;
36352 + int result = 0;
36353 + struct cryptcompress_info *cr_info;
36354 + struct unix_file_info *uf_info;
36355 + assert("edward-1516", cont->pages[0]->index == 0);
36356 +
36357 + /* release all cryptcompress-specific resources */
36358 + cr_info = cryptcompress_inode_data(inode);
36359 + result = reserve_cryptcompress2unixfile(inode);
36360 + if (result)
36361 + goto out;
36362 + /* tell kill_hook to not truncate pages */
36363 + reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
36364 + result = cut_disk_cluster(inode, 0);
36365 + if (result)
36366 + goto out;
36367 + /* captured jnode of cluster and associated resources (pages,
36368 + reserved disk space) were released by ->kill_hook() method
36369 + of the item plugin */
36370 +
36371 + result = __cryptcompress2unixfile(file, inode);
36372 + if (result)
36373 + goto out;
36374 + /* At this point file is managed by unix file plugin */
36375 +
36376 + uf_info = unix_file_inode_data(inode);
36377 +
36378 + assert("edward-1518",
36379 + ergo(jprivate(cont->pages[0]),
36380 + !jnode_is_cluster_page(jprivate(cont->pages[0]))));
36381 + for(i = 0; i < cont->nr_pages; i++) {
36382 + assert("edward-1519", cont->pages[i]);
36383 + assert("edward-1520", PageUptodate(cont->pages[i]));
36384 +
36385 + result = find_or_create_extent(cont->pages[i]);
36386 + if (result)
36387 + break;
36388 + }
36389 + if (unlikely(result))
36390 + goto out;
36391 + uf_info->container = UF_CONTAINER_EXTENTS;
36392 + result = reiser4_update_sd(inode);
36393 + out:
36394 + all_grabbed2free();
36395 + return result;
36396 +}
36397 +
36398 +#define convert_file_plugin cryptcompress2unixfile
36399 +
36400 +/**
36401 + * This is called by ->write() method of a cryptcompress file plugin.
36402 + * Make a decision about the most reasonable file plugin id to manage
36403 + * the file.
36404 + */
36405 +int write_dispatch_hook(struct file *file, struct inode *inode,
36406 + loff_t pos, struct cluster_handle *clust,
36407 + struct dispatch_context *cont)
36408 +{
36409 + int result;
36410 + if (!conversion_enabled(inode))
36411 + return 0;
36412 + result = check_dispatch_point(inode, pos, clust, cont);
36413 + if (result || cont->state != DISPATCH_POINT)
36414 + return result;
36415 + result = read_check_compressibility(inode, clust, cont);
36416 + if (result)
36417 + return result;
36418 + if (cont->state == DISPATCH_REMAINS_OLD) {
36419 + put_page_cluster(clust, inode, READ_OP);
36420 + return disable_conversion(inode);
36421 + }
36422 + assert("edward-1543", cont->state == DISPATCH_ASSIGNED_NEW);
36423 + /*
36424 + * page cluster is grabbed and uptodate. It will be
36425 + * released with a pgset after plugin conversion is
36426 + * finished, see put_dispatch_context().
36427 + */
36428 + reiser4_unset_hint(clust->hint);
36429 + move_cluster_pgset(clust, &cont->pages, &cont->nr_pages);
36430 + return 0;
36431 +}
36432 +
36433 +/**
36434 + * This is called by ->setattr() method of cryptcompress file plugin.
36435 + */
36436 +int setattr_dispatch_hook(struct inode * inode)
36437 +{
36438 + if (conversion_enabled(inode))
36439 + return disable_conversion(inode);
36440 + return 0;
36441 +}
36442 +
36443 +static inline void init_dispatch_context(struct dispatch_context * cont)
36444 +{
36445 + memset(cont, 0, sizeof(*cont));
36446 +}
36447 +
36448 +static inline void done_dispatch_context(struct dispatch_context * cont,
36449 + struct inode * inode)
36450 +{
36451 + if (cont->pages) {
36452 + __put_page_cluster(0, cont->nr_pages, cont->pages, inode);
36453 + kfree(cont->pages);
36454 + }
36455 +}
36456 +/**
36457 + * Here are wrappers with "protection", aka Reiser4 "careful" methods.
36458 + * They are used by vfs (as methods of file_ops, inode_ops or as_ops),
36459 + * which is not aware of plugin conversion performed by Reiser4.
36460 + */
36461 +
36462 +/*
36463 + * Wrappers with active protection for:
36464 + *
36465 + * ->write();
36466 + */
36467 +
36468 +/*
36469 + * ->write() file operation supplied to VFS.
36470 + * Write a file in 3 steps (some of them can be optional).
36471 + */
36472 +ssize_t reiser4_write_careful(struct file *file, const char __user *buf,
36473 + size_t count, loff_t *off)
36474 +{
36475 + int result;
36476 + reiser4_context *ctx;
36477 + ssize_t written_old = 0; /* bytes written with initial plugin */
36478 + ssize_t written_new = 0; /* bytes written with new plugin */
36479 + struct dispatch_context cont;
36480 + struct inode * inode = file->f_dentry->d_inode;
36481 +
36482 + ctx = reiser4_init_context(inode->i_sb);
36483 + if (IS_ERR(ctx))
36484 + return PTR_ERR(ctx);
36485 + init_dispatch_context(&cont);
36486 + mutex_lock(&inode->i_mutex);
36487 + /**
36488 + * First step.
36489 + * Start write with initial file plugin.
36490 + * Keep a plugin schedule status at @cont (if any).
36491 + */
36492 + written_old = inode_file_plugin(inode)->write(file,
36493 + buf,
36494 + count,
36495 + off,
36496 + &cont);
36497 + if (cont.state != DISPATCH_ASSIGNED_NEW || written_old < 0)
36498 + goto exit;
36499 + /**
36500 + * Second step.
36501 + * New file plugin has been scheduled.
36502 + * Perform conversion to the new plugin.
36503 + */
36504 + down_read(&reiser4_inode_data(inode)->conv_sem);
36505 + result = convert_file_plugin(file, inode, &cont);
36506 + up_read(&reiser4_inode_data(inode)->conv_sem);
36507 + if (result) {
36508 + warning("edward-1544",
36509 + "Inode %llu: file plugin conversion failed (%d)",
36510 + (unsigned long long)get_inode_oid(inode),
36511 + result);
36512 + context_set_commit_async(ctx);
36513 + goto exit;
36514 + }
36515 + reiser4_txn_restart(ctx);
36516 + /**
36517 + * Third step:
36518 + * Finish write with the new file plugin.
36519 + */
36520 + assert("edward-1536",
36521 + inode_file_plugin(inode) ==
36522 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
36523 +
36524 + written_new = inode_file_plugin(inode)->write(file,
36525 + buf + written_old,
36526 + count - written_old,
36527 + off,
36528 + NULL);
36529 + exit:
36530 + mutex_unlock(&inode->i_mutex);
36531 + done_dispatch_context(&cont, inode);
36532 + reiser4_exit_context(ctx);
36533 +
36534 + return written_old + (written_new < 0 ? 0 : written_new);
36535 +}
36536 +
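The return-value convention of the three-step write above (report what the old plugin wrote plus whatever the new plugin managed, never letting a failed third step hide the bytes already written) is the subtle part. A stripped-down user-space model of that control flow, with all names invented:

#include <stdio.h>
#include <string.h>

struct ctx { int use_new; };

static long write_old(struct ctx *c, const char *buf, long n, int *switch_me)
{
        (void)c; (void)buf;
        *switch_me = 1;         /* pretend we hit the dispatch point */
        return n / 2;           /* wrote half with the old plugin */
}

static long write_new(struct ctx *c, const char *buf, long n)
{
        (void)c; (void)buf;
        return n;               /* the new plugin finishes the rest */
}

static long dispatching_write(struct ctx *c, const char *buf, long n)
{
        int switch_me = 0;
        long done_old, done_new;

        done_old = write_old(c, buf, n, &switch_me);    /* step 1 */
        if (done_old < 0 || !switch_me)
                return done_old;
        c->use_new = 1;                                 /* step 2: convert */
        done_new = write_new(c, buf + done_old, n - done_old);  /* step 3 */
        return done_old + (done_new < 0 ? 0 : done_new);
}

int main(void)
{
        struct ctx c = { 0 };
        const char *msg = "hello, reiser4";
        long n = (long)strlen(msg);

        printf("%ld of %ld bytes\n", dispatching_write(&c, msg, n), n);
        return 0;
}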
36537 +/* Wrappers with passive protection for:
36538 + *
36539 + * ->open();
36540 + * ->read();
36541 + * ->ioctl();
36542 + * ->mmap();
36543 + * ->release();
36544 + * ->bmap().
36545 + */
36546 +
36547 +int reiser4_open_careful(struct inode *inode, struct file *file)
36548 +{
36549 + return PROT_PASSIVE(int, open, (inode, file));
36550 +}
36551 +
36552 +ssize_t reiser4_read_careful(struct file * file, char __user * buf,
36553 + size_t size, loff_t * off)
36554 +{
36555 + struct inode * inode = file->f_dentry->d_inode;
36556 + return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
36557 +}
36558 +
36559 +int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36560 + unsigned int cmd, unsigned long arg)
36561 +{
36562 + return PROT_PASSIVE(int, ioctl, (inode, filp, cmd, arg));
36563 +}
36564 +
36565 +int reiser4_mmap_careful(struct file *file, struct vm_area_struct *vma)
36566 +{
36567 + struct inode *inode = file->f_dentry->d_inode;
36568 + return PROT_PASSIVE(int, mmap, (file, vma));
36569 +}
36570 +
36571 +int reiser4_release_careful(struct inode *inode, struct file *file)
36572 +{
36573 + return PROT_PASSIVE(int, release, (inode, file));
36574 +}
36575 +
36576 +sector_t reiser4_bmap_careful(struct address_space * mapping, sector_t lblock)
36577 +{
36578 + struct inode *inode = mapping->host;
36579 + return PROT_PASSIVE(sector_t, bmap, (mapping, lblock));
36580 +}
36581 +
36582 +/**
36583 + * NOTE: The following two methods are
36584 + * used only for loopback functionality.
36585 + * reiser4_write_end() cannot cope with
36586 + * short writes for now.
36587 + */
36588 +int reiser4_write_begin_careful(struct file *file,
36589 + struct address_space *mapping,
36590 + loff_t pos,
36591 + unsigned len,
36592 + unsigned flags,
36593 + struct page **pagep,
36594 + void **fsdata)
36595 +{
36596 + int ret = 0;
36597 + unsigned start, end;
36598 + struct page *page;
36599 + pgoff_t index;
36600 + reiser4_context *ctx;
36601 + struct inode * inode = file->f_dentry->d_inode;
36602 +
36603 + index = pos >> PAGE_CACHE_SHIFT;
36604 + start = pos & (PAGE_CACHE_SIZE - 1);
36605 + end = start + len;
36606 +
36607 + page = grab_cache_page_write_begin(mapping, index,
36608 + flags & AOP_FLAG_NOFS);
36609 + *pagep = page;
36610 + if (!page)
36611 + return -ENOMEM;
36612 +
36613 + ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
36614 + if (IS_ERR(ctx)) {
36615 + ret = PTR_ERR(ctx);
36616 + goto out;
36617 + }
36618 + ret = PROT_PASSIVE(int, write_begin, (file, page, start, end));
36619 +
36620 + /* don't commit transaction under inode semaphore */
36621 + context_set_commit_async(ctx);
36622 + reiser4_exit_context(ctx);
36623 + out:
36624 + if (unlikely(ret)) {
36625 + unlock_page(page);
36626 + page_cache_release(page);
36627 + }
36628 + return ret;
36629 +}
36630 +
36631 +int reiser4_write_end_careful(struct file *file,
36632 + struct address_space *mapping,
36633 + loff_t pos,
36634 + unsigned len,
36635 + unsigned copied,
36636 + struct page *page,
36637 + void *fsdata)
36638 +{
36639 + int ret;
36640 + reiser4_context *ctx;
36641 + unsigned start, end;
36642 + struct inode *inode = page->mapping->host;
36643 +
36644 + assert("umka-3101", file != NULL);
36645 + assert("umka-3102", page != NULL);
36646 + assert("umka-3093", PageLocked(page));
36647 +
36648 + start = pos & (PAGE_CACHE_SIZE - 1);
36649 + end = start + len;
36650 +
36651 + flush_dcache_page(page);
36652 + SetPageUptodate(page);
36653 +
36654 + ctx = reiser4_init_context(page->mapping->host->i_sb);
36655 + if (IS_ERR(ctx)){
36656 + unlock_page(page);
36657 + ret = PTR_ERR(ctx);
36658 + goto out;
36659 + }
36660 + ret = PROT_PASSIVE(int, write_end, (file, page, start, end));
36661 +
36662 + /* don't commit transaction under inode semaphore */
36663 + context_set_commit_async(ctx);
36664 + reiser4_exit_context(ctx);
36665 + out:
36666 + page_cache_release(page);
36667 + if (!ret)
36668 + ret = copied;
36669 + return ret;
36670 +}
36671 +
36672 +/*
36673 + * Wrappers without protection for:
36674 + *
36675 + * ->setattr()
36676 + */
36677 +int reiser4_setattr(struct dentry *dentry, struct iattr *attr)
36678 +{
36679 + return inode_file_plugin(dentry->d_inode)->setattr(dentry, attr);
36680 +}
36681 +
36682 +/*
36683 + Local variables:
36684 + c-indentation-style: "K&R"
36685 + mode-name: "LC"
36686 + c-basic-offset: 8
36687 + tab-width: 8
36688 + fill-column: 80
36689 + scroll-step: 1
36690 + End:
36691 +*/
36692 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/file.h linux-2.6.33/fs/reiser4/plugin/file/file.h
36693 --- linux-2.6.33.orig/fs/reiser4/plugin/file/file.h 1970-01-01 01:00:00.000000000 +0100
36694 +++ linux-2.6.33/fs/reiser4/plugin/file/file.h 2010-03-04 19:33:22.000000000 +0100
36695 @@ -0,0 +1,336 @@
36696 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
36697 + * reiser4/README */
36698 +
36699 +/* this file contains declarations of methods implementing
36700 + file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
36701 + and SYMLINK_FILE_PLUGIN_ID) */
36702 +
36703 +#if !defined( __REISER4_FILE_H__ )
36704 +#define __REISER4_FILE_H__
36705 +
36706 +/* possible states in dispatching process */
36707 +typedef enum {
36708 + DISPATCH_INVAL_STATE, /* invalid state */
36709 + DISPATCH_POINT, /* dispatching point has been achieved */
36710 + DISPATCH_REMAINS_OLD, /* made a decision to manage by old plugin */
36711 + DISPATCH_ASSIGNED_NEW /* a new plugin has been assigned */
36712 +} dispatch_state;
36713 +
36714 +struct dispatch_context {
36715 + int nr_pages;
36716 + struct page **pages;
36717 + dispatch_state state;
36718 +};
36719 +
36720 +/**
36721 + * Declarations of common/careful/generic methods.
36722 + * Suppose ->foo() is a vfs method (of f_ops, i_ops, or a_ops);
36723 + * then the common reiser4 method for foo looks like reiser4_foo_common;
36724 + * the careful method looks like reiser4_foo_careful;
36725 + * the generic method looks like reiser4_foo.
36726 + *
36727 + * A common method is a simple implementation suitable for more
36728 + * than one plugin id.
36729 + *
36730 + * Generic method looks at the plugin installed in inode's
36731 + * plugin set and calls its appropriate method.
36732 + *
36733 + * Careful method looks like generic method with protected pset
36734 + * (see plugin/file/file_conversion.c for details).
36735 + */
36736 +
36737 +/* inode operations */
36738 +int reiser4_setattr(struct dentry *, struct iattr *);
36739 +
36740 +/* file operations */
36741 +ssize_t reiser4_read_careful(struct file *, char __user *buf,
36742 + size_t count, loff_t *off);
36743 +ssize_t reiser4_write_careful(struct file *, const char __user *buf,
36744 + size_t count, loff_t * off);
36745 +int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36746 + unsigned int cmd, unsigned long arg);
36747 +int reiser4_mmap_careful(struct file *, struct vm_area_struct *);
36748 +int reiser4_open_careful(struct inode *inode, struct file *file);
36749 +int reiser4_release_careful(struct inode *, struct file *);
36750 +int reiser4_sync_file_common(struct file *, struct dentry *, int datasync);
36751 +
36752 +/* address space operations */
36753 +int reiser4_readpage(struct file *, struct page *);
36754 +int reiser4_readpages(struct file*, struct address_space*, struct list_head*,
36755 + unsigned);
36756 +int reiser4_writepages(struct address_space *, struct writeback_control *);
36757 +int reiser4_write_begin_careful(struct file *file,
36758 + struct address_space *mapping,
36759 + loff_t pos, unsigned len, unsigned flags,
36760 + struct page **pagep, void **fsdata);
36761 +int reiser4_write_end_careful(struct file *file,
36762 + struct address_space *mapping,
36763 + loff_t pos, unsigned len, unsigned copied,
36764 + struct page *page, void *fsdata);
36765 +sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock);
36766 +
36767 +/*
36768 + * Private methods of unix-file plugin
36769 + * (UNIX_FILE_PLUGIN_ID)
36770 + */
36771 +
36772 +/* private inode operations */
36773 +int setattr_unix_file(struct dentry *, struct iattr *);
36774 +
36775 +/* private file operations */
36776 +
36777 +ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
36778 + loff_t *off);
36779 +ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
36780 + loff_t * off, struct dispatch_context * cont);
36781 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
36782 + unsigned long arg);
36783 +int mmap_unix_file(struct file *, struct vm_area_struct *);
36784 +int open_unix_file(struct inode *, struct file *);
36785 +int release_unix_file(struct inode *, struct file *);
36786 +
36787 +/* private address space operations */
36788 +int readpage_unix_file(struct file *, struct page *);
36789 +int readpages_unix_file(struct file*, struct address_space*, struct list_head*,
36790 + unsigned);
36791 +int writepages_unix_file(struct address_space *, struct writeback_control *);
36792 +int write_begin_unix_file(struct file *file, struct page *page,
36793 + unsigned from, unsigned to);
36794 +int write_end_unix_file(struct file *file, struct page *page,
36795 + unsigned from, unsigned to);
36796 +sector_t bmap_unix_file(struct address_space *, sector_t lblock);
36797 +
36798 +/* other private methods */
36799 +int delete_object_unix_file(struct inode *);
36800 +int flow_by_inode_unix_file(struct inode *, const char __user *buf,
36801 + int user, loff_t, loff_t, rw_op, flow_t *);
36802 +int owns_item_unix_file(const struct inode *, const coord_t *);
36803 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
36804 + int create);
36805 +
36806 +/*
36807 + * Private methods of cryptcompress file plugin
36808 + * (CRYPTCOMPRESS_FILE_PLUGIN_ID)
36809 + */
36810 +
36811 +/* private inode operations */
36812 +int setattr_cryptcompress(struct dentry *, struct iattr *);
36813 +
36814 +/* private file operations */
36815 +ssize_t read_cryptcompress(struct file *, char __user *buf,
36816 + size_t count, loff_t *off);
36817 +ssize_t write_cryptcompress(struct file *, const char __user *buf,
36818 + size_t count, loff_t * off,
36819 + struct dispatch_context *cont);
36820 +int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd,
36821 + unsigned long arg);
36822 +int mmap_cryptcompress(struct file *, struct vm_area_struct *);
36823 +int open_cryptcompress(struct inode *, struct file *);
36824 +int release_cryptcompress(struct inode *, struct file *);
36825 +
36826 +/* private address space operations */
36827 +int readpage_cryptcompress(struct file *, struct page *);
36828 +int readpages_cryptcompress(struct file*, struct address_space*,
36829 + struct list_head*, unsigned);
36830 +int writepages_cryptcompress(struct address_space *,
36831 + struct writeback_control *);
36832 +int write_begin_cryptcompress(struct file *file, struct page *page,
36833 + unsigned from, unsigned to);
36834 +int write_end_cryptcompress(struct file *file, struct page *page,
36835 + unsigned from, unsigned to);
36836 +sector_t bmap_cryptcompress(struct address_space *, sector_t lblock);
36837 +
36838 +/* other private methods */
36839 +int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
36840 + int user, loff_t, loff_t, rw_op, flow_t *);
36841 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
36842 +int create_object_cryptcompress(struct inode *, struct inode *,
36843 + reiser4_object_create_data *);
36844 +int delete_object_cryptcompress(struct inode *);
36845 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
36846 + int create);
36847 +int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
36848 + const reiser4_key * to_key,
36849 + reiser4_key * smallest_removed,
36850 + struct inode *object, int truncate,
36851 + int *progress);
36852 +void destroy_inode_cryptcompress(struct inode *);
36853 +
36854 +/*
36855 + * Private methods of symlink file plugin
36856 + * (SYMLINK_FILE_PLUGIN_ID)
36857 + */
36858 +int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
36859 + reiser4_object_create_data *);
36860 +void destroy_inode_symlink(struct inode *);
36861 +
36862 +/*
36863 + * all writing to a unix file is performed by the item write method. The
36864 + * write method of the unix file plugin only decides which item plugin
36865 + * (extent or tail) to call and in which mode (one from the enum below)
36866 + */
36867 +typedef enum {
36868 + FIRST_ITEM = 1,
36869 + APPEND_ITEM = 2,
36870 + OVERWRITE_ITEM = 3
36871 +} write_mode_t;
36872 +
36873 +/* a unix file may be in one of the following states */
36874 +typedef enum {
36875 + UF_CONTAINER_UNKNOWN = 0,
36876 + UF_CONTAINER_TAILS = 1,
36877 + UF_CONTAINER_EXTENTS = 2,
36878 + UF_CONTAINER_EMPTY = 3
36879 +} file_container_t;
36880 +
36881 +struct formatting_plugin;
36882 +struct inode;
36883 +
36884 +/* unix file plugin specific part of reiser4 inode */
36885 +struct unix_file_info {
36886 + /*
36887 + * this read-write lock protects file containerization change. Accesses
36888 + * which do not change file containerization (see file_container_t)
36889 + * (read, readpage, writepage, write (until tail conversion is
36890 + * involved)) take read-lock. Accesses which modify file
36891 + * containerization (truncate, conversion from tail to extent and back)
36892 + * take write-lock.
36893 + */
36894 + struct rw_semaphore latch;
36895 + /* this enum specifies which items are used to build the file */
36896 + file_container_t container;
36897 + /*
36898 + * plugin which controls when file is to be converted to extents and
36899 + * back to tail
36900 + */
36901 + struct formatting_plugin *tplug;
36902 + /* if this is set, file is in exclusive use */
36903 + int exclusive_use;
36904 +#if REISER4_DEBUG
36905 + /* pointer to task struct of thread owning exclusive access to file */
36906 + void *ea_owner;
36907 + atomic_t nr_neas;
36908 + void *last_reader;
36909 +#endif
36910 +};
36911 +
36912 +struct unix_file_info *unix_file_inode_data(const struct inode *inode);
36913 +void get_exclusive_access(struct unix_file_info *);
36914 +void drop_exclusive_access(struct unix_file_info *);
36915 +void get_nonexclusive_access(struct unix_file_info *);
36916 +void drop_nonexclusive_access(struct unix_file_info *);
36917 +int try_to_get_nonexclusive_access(struct unix_file_info *);
36918 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
36919 + struct inode *);
36920 +int find_file_item_nohint(coord_t *, lock_handle *,
36921 + const reiser4_key *, znode_lock_mode,
36922 + struct inode *);
36923 +
36924 +int load_file_hint(struct file *, hint_t *);
36925 +void save_file_hint(struct file *, const hint_t *);
36926 +
36927 +#include "../item/extent.h"
36928 +#include "../item/tail.h"
36929 +#include "../item/ctail.h"
36930 +
36931 +struct uf_coord {
36932 + coord_t coord;
36933 + lock_handle *lh;
36934 + int valid;
36935 + union {
36936 + struct extent_coord_extension extent;
36937 + struct tail_coord_extension tail;
36938 + struct ctail_coord_extension ctail;
36939 + } extension;
36940 +};
36941 +
36942 +#include "../../forward.h"
36943 +#include "../../seal.h"
36944 +#include "../../lock.h"
36945 +
36946 +/*
36947 + * This structure is used to speed up file operations (reads and writes). A
36948 + * hint is a suggestion about where a key resolved to last time. A seal
36949 + * indicates whether a node has been modified since a hint was last recorded.
36950 + * You check the seal, and if the seal is still valid, you can use the hint
36951 + * without traversing the tree again.
36952 + */
36953 +struct hint {
36954 + seal_t seal; /* a seal over last file item accessed */
36955 + uf_coord_t ext_coord;
36956 + loff_t offset;
36957 + znode_lock_mode mode;
36958 + lock_handle lh;
36959 +};
36960 +
36961 +static inline int hint_is_valid(hint_t * hint)
36962 +{
36963 + return hint->ext_coord.valid;
36964 +}
36965 +
36966 +static inline void hint_set_valid(hint_t * hint)
36967 +{
36968 + hint->ext_coord.valid = 1;
36969 +}
36970 +
36971 +static inline void hint_clr_valid(hint_t * hint)
36972 +{
36973 + hint->ext_coord.valid = 0;
36974 +}
36975 +
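The seal/hint pattern described above is essentially version-counter cache validation. A minimal user-space model follows; the struct fields, slow_lookup() and the version counter are illustrative stand-ins, not the reiser4 seal_t machinery:

#include <stdio.h>

struct node {
        unsigned long version;  /* bumped on every modification */
        int payload;
};

struct hint {
        struct node *node;      /* where the key resolved last time */
        unsigned long seal;     /* node->version when hint was recorded */
};

static int slow_lookup(struct node *n) { return n->payload; }

static int lookup_with_hint(struct node *n, struct hint *h)
{
        if (h->node == n && h->seal == n->version)
                return n->payload;      /* seal valid: reuse the hint */
        h->node = n;                    /* seal broken: redo the lookup, */
        h->seal = n->version;           /* then re-record the hint */
        return slow_lookup(n);
}

int main(void)
{
        struct node n = { .version = 1, .payload = 7 };
        struct hint h = { 0 };

        printf("%d\n", lookup_with_hint(&n, &h));       /* slow path */
        printf("%d\n", lookup_with_hint(&n, &h));       /* fast path */
        n.version++;                                    /* node modified */
        printf("%d\n", lookup_with_hint(&n, &h));       /* slow path again */
        return 0;
}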
36976 +int load_file_hint(struct file *, hint_t *);
36977 +void save_file_hint(struct file *, const hint_t *);
36978 +void hint_init_zero(hint_t *);
36979 +void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
36980 +int hint_is_set(const hint_t *);
36981 +void reiser4_unset_hint(hint_t *);
36982 +
36983 +int reiser4_update_file_size(struct inode *, loff_t, int update_sd);
36984 +int cut_file_items(struct inode *, loff_t new_size,
36985 + int update_sd, loff_t cur_size,
36986 + int (*update_actor) (struct inode *, loff_t, int));
36987 +#if REISER4_DEBUG
36988 +
36989 +/* return 1 if exclusive access is obtained, 0 otherwise */
36990 +static inline int ea_obtained(struct unix_file_info * uf_info)
36991 +{
36992 + int ret;
36993 +
36994 + ret = down_read_trylock(&uf_info->latch);
36995 + if (ret)
36996 + up_read(&uf_info->latch);
36997 + return !ret;
36998 +}
36999 +
37000 +#endif
37001 +
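The trylock probe in ea_obtained() generalizes beyond kernel rw-semaphores: if a read trylock fails, some writer (exclusive owner) holds the lock. A user-space sketch with a POSIX rwlock (build with cc -pthread); as in the kernel, this is a debugging heuristic only, since the answer can be stale the moment it is returned:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t latch = PTHREAD_RWLOCK_INITIALIZER;

static int ea_obtained(void)
{
        if (pthread_rwlock_tryrdlock(&latch) == 0) {
                pthread_rwlock_unlock(&latch);
                return 0;       /* read lock acquired => no writer */
        }
        return 1;               /* trylock failed => a writer holds it */
}

int main(void)
{
        printf("%d\n", ea_obtained());  /* 0: nobody holds the latch */
        pthread_rwlock_wrlock(&latch);
        printf("%d\n", ea_obtained());  /* 1: held exclusively */
        pthread_rwlock_unlock(&latch);
        return 0;
}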
37002 +#define WRITE_GRANULARITY 32
37003 +
37004 +int tail2extent(struct unix_file_info *);
37005 +int extent2tail(struct file *, struct unix_file_info *);
37006 +
37007 +int goto_right_neighbor(coord_t *, lock_handle *);
37008 +int find_or_create_extent(struct page *);
37009 +int equal_to_ldk(znode *, const reiser4_key *);
37010 +
37011 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
37012 +
37013 +static inline int cbk_errored(int cbk_result)
37014 +{
37015 + return (cbk_result != CBK_COORD_NOTFOUND
37016 + && cbk_result != CBK_COORD_FOUND);
37017 +}
37018 +
37019 +/* __REISER4_FILE_H__ */
37020 +#endif
37021 +
37022 +/*
37023 + * Local variables:
37024 + * c-indentation-style: "K&R"
37025 + * mode-name: "LC"
37026 + * c-basic-offset: 8
37027 + * tab-width: 8
37028 + * fill-column: 79
37029 + * scroll-step: 1
37030 + * End:
37031 +*/
37032 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/Makefile linux-2.6.33/fs/reiser4/plugin/file/Makefile
37033 --- linux-2.6.33.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 01:00:00.000000000 +0100
37034 +++ linux-2.6.33/fs/reiser4/plugin/file/Makefile 2010-03-04 19:33:22.000000000 +0100
37035 @@ -0,0 +1,8 @@
37036 +obj-$(CONFIG_REISER4_FS) += file_plugins.o
37037 +
37038 +file_plugins-objs := \
37039 + file.o \
37040 + tail_conversion.o \
37041 + symlink.o \
37042 + cryptcompress.o \
37043 + file_conversion.o
37043 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.33/fs/reiser4/plugin/file/symfile.c
37044 --- linux-2.6.33.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 01:00:00.000000000 +0100
37045 +++ linux-2.6.33/fs/reiser4/plugin/file/symfile.c 2010-03-04 19:33:22.000000000 +0100
37046 @@ -0,0 +1,87 @@
37047 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
37048 +
37049 +/* Symfiles are a generalization of Unix symlinks.
37050 +
37051 + A symfile when read behaves as though you took its contents and
37052 + substituted them into the reiser4 naming system as the right hand side
37053 + of an assignment, and then read that which you had assigned to it.
37054 +
37055 + A key issue for symfiles is how to implement writes through to
37056 + subfiles. In general, one must have some method of determining what
37057 + of that which is written to the symfile is written to what subfile.
37058 + This can be done by use of custom plugin methods written by users, or
37059 + by using a few general methods we provide for those willing to endure
37060 + the insertion of delimiters into what is read.
37061 +
37062 + Writing to symfiles without delimiters to denote what is written to
37063 + what subfile is not supported by any plugins we provide in this
37064 + release. Our most sophisticated support for writes is that embodied
37065 + by the invert plugin (see invert.c).
37066 +
37067 + A read only version of the /etc/passwd file might be
37068 + constructed as a symfile whose contents are as follows:
37069 +
37070 + /etc/passwd/userlines/*
37071 +
37072 + or
37073 +
37074 + /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
37075 +
37076 + or
37077 +
37078 + /etc/passwd/userlines/(demidov+edward+reiser+root)
37079 +
37080 + A symfile with contents
37081 +
37082 + /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
37083 +
37084 + will return when read
37085 +
37086 + The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
37087 +
37088 + and write of what has been read will not be possible to implement as
37089 + an identity operation because there are no delimiters denoting the
37090 + boundaries of what is to be written to what subfile.
37091 +
37092 + Note that one could make this a read/write symfile if one specified
37093 + delimiters, and the write method understood those delimiters delimited
37094 + what was written to subfiles.
37095 +
37096 + So, specifying the symfile in a manner that allows writes:
37097 +
37098 + /etc/passwd/userlines/demidov+"(
37099 + )+/etc/passwd/userlines/edward+"(
37100 + )+/etc/passwd/userlines/reiser+"(
37101 + )+/etc/passwd/userlines/root+"(
37102 + )
37103 +
37104 + or
37105 +
37106 + /etc/passwd/userlines/(demidov+"(
37107 + )+edward+"(
37108 + )+reiser+"(
37109 + )+root+"(
37110 + ))
37111 +
37112 + and the file demidov might be specified as:
37113 +
37114 + /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
37115 +
37116 + or
37117 +
37118 + /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
37119 +
37120 + Notice that if the file demidov has a carriage return in it, the
37121 + parsing fails, but then if you put carriage returns in the wrong place
37122 + in a normal /etc/passwd file it breaks things also.
37123 +
37124 + Note that it is forbidden to have no text between two interpolations
37125 + if one wants to be able to define what parts of a write go to what
37126 + subfiles referenced in an interpolation.
37127 +
37128 + If one wants to be able to add new lines by writing to the file, one
37129 + must either write a custom plugin for /etc/passwd that knows how to
37130 + name an added line, or one must use an invert, or one must use a more
37131 + sophisticated symfile syntax that we are not planning to write for
37132 + version 4.0.
37133 +*/
37134 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.33/fs/reiser4/plugin/file/symlink.c
37135 --- linux-2.6.33.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 01:00:00.000000000 +0100
37136 +++ linux-2.6.33/fs/reiser4/plugin/file/symlink.c 2010-03-04 19:33:22.000000000 +0100
37137 @@ -0,0 +1,95 @@
37138 +/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
37139 +
37140 +#include "../../inode.h"
37141 +
37142 +#include <linux/types.h>
37143 +#include <linux/fs.h>
37144 +
37145 +/* file plugin methods specific for symlink files
37146 + (SYMLINK_FILE_PLUGIN_ID) */
37147 +
37148 +/* this is implementation of create_object method of file plugin for
37149 + SYMLINK_FILE_PLUGIN_ID
37150 + */
37151 +
37152 +/**
37153 + * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
37154 + * @symlink: inode of symlink object
37155 + * @dir: inode of parent directory
37156 + * @data: parameters of new object
37157 + *
37158 + * Inserts stat data with a symlink extension into the tree.
37159 + */
37160 +int reiser4_create_symlink(struct inode *symlink,
37161 + struct inode *dir UNUSED_ARG,
37162 + reiser4_object_create_data *data /* info passed to us
37163 + * this is filled by
37164 + * reiser4() syscall
37165 + * in particular */)
37166 +{
37167 + int result;
37168 +
37169 + assert("nikita-680", symlink != NULL);
37170 + assert("nikita-681", S_ISLNK(symlink->i_mode));
37171 + assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
37172 + assert("nikita-682", dir != NULL);
37173 + assert("nikita-684", data != NULL);
37174 + assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
37175 +
37176 + /*
37177 + * stat data of symlink has symlink extension in which we store
37178 + * symlink content, that is, path symlink is pointing to.
37179 + */
37180 + reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
37181 +
37182 + assert("vs-838", symlink->i_private == NULL);
37183 + symlink->i_private = (void *)data->name;
37184 +
37185 + assert("vs-843", symlink->i_size == 0);
37186 + INODE_SET_FIELD(symlink, i_size, strlen(data->name));
37187 +
37188 + /* insert stat data appended with data->name */
37189 + result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
37190 + if (result) {
37191 + /* FIXME-VS: Make sure that symlink->i_private is not attached
37192 + to kmalloced data */
37193 + INODE_SET_FIELD(symlink, i_size, 0);
37194 + } else {
37195 + assert("vs-849", symlink->i_private
37196 + && reiser4_inode_get_flag(symlink,
37197 + REISER4_GENERIC_PTR_USED));
37198 + assert("vs-850",
37199 + !memcmp((char *)symlink->i_private, data->name,
37200 + (size_t) symlink->i_size + 1));
37201 + }
37202 + return result;
37203 +}
37204 +
37205 +/* this is implementation of destroy_inode method of file plugin for
37206 + SYMLINK_FILE_PLUGIN_ID
37207 + */
37208 +void destroy_inode_symlink(struct inode *inode)
37209 +{
37210 + assert("edward-799",
37211 + inode_file_plugin(inode) ==
37212 + file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
37213 + assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
37214 + assert("edward-801", reiser4_inode_get_flag(inode,
37215 + REISER4_GENERIC_PTR_USED));
37216 + assert("vs-839", S_ISLNK(inode->i_mode));
37217 +
37218 + kfree(inode->i_private);
37219 + inode->i_private = NULL;
37220 + reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
37221 +}
37222 +
37223 +/*
37224 + Local variables:
37225 + c-indentation-style: "K&R"
37226 + mode-name: "LC"
37227 + c-basic-offset: 8
37228 + tab-width: 8
37229 + fill-column: 80
37230 + scroll-step: 1
37231 + End:
37232 +*/
37233 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.33/fs/reiser4/plugin/file/tail_conversion.c
37234 --- linux-2.6.33.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 01:00:00.000000000 +0100
37235 +++ linux-2.6.33/fs/reiser4/plugin/file/tail_conversion.c 2010-03-04 19:33:22.000000000 +0100
37236 @@ -0,0 +1,743 @@
37237 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
37238 +
37239 +#include "../../inode.h"
37240 +#include "../../super.h"
37241 +#include "../../page_cache.h"
37242 +#include "../../carry.h"
37243 +#include "../../safe_link.h"
37244 +#include "../../vfs_ops.h"
37245 +
37246 +#include <linux/writeback.h>
37247 +
37248 +/* this file contains:
37249 + tail2extent and extent2tail */
37250 +
37251 +/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
37252 +void get_exclusive_access(struct unix_file_info * uf_info)
37253 +{
37254 + assert("nikita-3028", reiser4_schedulable());
37255 + assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
37256 + assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
37257 + /*
37258 + * "deadlock avoidance": sometimes we commit a transaction under
37259 + * rw-semaphore on a file. Such commit can deadlock with another
37260 + * thread that captured some block (hence preventing atom from being
37261 + * committed) and waits on rw-semaphore.
37262 + */
37263 + reiser4_txn_restart_current();
37264 + LOCK_CNT_INC(inode_sem_w);
37265 + down_write(&uf_info->latch);
37266 + uf_info->exclusive_use = 1;
37267 + assert("vs-1713", uf_info->ea_owner == NULL);
37268 + assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
37269 + ON_DEBUG(uf_info->ea_owner = current);
37270 +}
37271 +
37272 +void drop_exclusive_access(struct unix_file_info * uf_info)
37273 +{
37274 + assert("vs-1714", uf_info->ea_owner == current);
37275 + assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
37276 + ON_DEBUG(uf_info->ea_owner = NULL);
37277 + uf_info->exclusive_use = 0;
37278 + up_write(&uf_info->latch);
37279 + assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
37280 + assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
37281 + LOCK_CNT_DEC(inode_sem_w);
37282 + reiser4_txn_restart_current();
37283 +}
37284 +
37285 +/**
37286 + * nea_grabbed - do something when file semaphore is down_read-ed
37287 + * @uf_info:
37288 + *
37289 + * This is called when nonexclusive access is obtained on a file. Everything
37290 + * it does is for debugging purposes.
37291 + */
37292 +static void nea_grabbed(struct unix_file_info *uf_info)
37293 +{
37294 +#if REISER4_DEBUG
37295 + LOCK_CNT_INC(inode_sem_r);
37296 + assert("vs-1716", uf_info->ea_owner == NULL);
37297 + atomic_inc(&uf_info->nr_neas);
37298 + uf_info->last_reader = current;
37299 +#endif
37300 +}
37301 +
37302 +/**
37303 + * get_nonexclusive_access - get nonexclusive access to a file
37304 + * @uf_info: unix file specific part of inode to obtain access to
37305 + *
37306 + * Nonexclusive access is obtained on a file before read, write, readpage.
37307 + */
37308 +void get_nonexclusive_access(struct unix_file_info *uf_info)
37309 +{
37310 + assert("nikita-3029", reiser4_schedulable());
37311 + assert("nikita-3361", get_current_context()->trans->atom == NULL);
37312 +
37313 + down_read(&uf_info->latch);
37314 + nea_grabbed(uf_info);
37315 +}
37316 +
37317 +/**
37318 + * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
37319 + * @uf_info: unix file specific part of inode to obtain access to
37320 + *
37321 + * Non-blocking version of nonexclusive access obtaining.
37322 + */
37323 +int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
37324 +{
37325 + int result;
37326 +
37327 + result = down_read_trylock(&uf_info->latch);
37328 + if (result)
37329 + nea_grabbed(uf_info);
37330 + return result;
37331 +}
37332 +
37333 +void drop_nonexclusive_access(struct unix_file_info * uf_info)
37334 +{
37335 + assert("vs-1718", uf_info->ea_owner == NULL);
37336 + assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
37337 + ON_DEBUG(atomic_dec(&uf_info->nr_neas));
37338 +
37339 + up_read(&uf_info->latch);
37340 +
37341 + LOCK_CNT_DEC(inode_sem_r);
37342 + reiser4_txn_restart_current();
37343 +}
37344 +
37345 +/* part of tail2extent. Cut all items covering @count bytes starting from
37346 + @offset */
37347 +/* Audited by: green(2002.06.15) */
37348 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
37349 +{
37350 + reiser4_key from, to;
37351 +
37352 + /* AUDIT: How about putting an assertion here, which would check
37353 + that the whole provided range is covered by tail items only? */
37354 + /* key of first byte in the range to be cut */
37355 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
37356 +
37357 + /* key of last byte in that range */
37358 + to = from;
37359 + set_key_offset(&to, (__u64) (offset + count - 1));
37360 +
37361 + /* cut everything between those keys */
37362 + return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
37363 + inode, 0);
37364 +}
37365 +
37366 +static void release_all_pages(struct page **pages, unsigned nr_pages)
37367 +{
37368 + unsigned i;
37369 +
37370 + for (i = 0; i < nr_pages; i++) {
37371 + if (pages[i] == NULL) {
37372 +#if REISER4_DEBUG
37373 + unsigned j;
37374 + for (j = i + 1; j < nr_pages; j++)
37375 + assert("vs-1620", pages[j] == NULL);
37376 +#endif
37377 + break;
37378 + }
37379 + page_cache_release(pages[i]);
37380 + pages[i] = NULL;
37381 + }
37382 +}
37383 +
37384 +/* part of tail2extent. Replace tail items with an extent item. The content
37385 + of the tail items (@count bytes) being cut has already been copied into
37386 + pages. find_or_create_extent() is called to create extents corresponding
37387 + to those pages */
37388 +static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
37389 +{
37390 + int result;
37391 + unsigned i;
37392 + STORE_COUNTERS;
37393 +
37394 + if (nr_pages == 0)
37395 + return 0;
37396 +
37397 + assert("vs-596", pages[0]);
37398 +
37399 + /* cut copied items */
37400 + result = cut_formatting_items(inode, page_offset(pages[0]), count);
37401 + if (result)
37402 + return result;
37403 +
37404 + CHECK_COUNTERS;
37405 +
37406 + /* put into tree replacement for just removed items: extent item, namely */
37407 + for (i = 0; i < nr_pages; i++) {
37408 + result = add_to_page_cache_lru(pages[i], inode->i_mapping,
37409 + pages[i]->index,
37410 + mapping_gfp_mask(inode->
37411 + i_mapping));
37412 + if (result)
37413 + break;
37414 + unlock_page(pages[i]);
37415 + result = find_or_create_extent(pages[i]);
37416 + if (result)
37417 + break;
37418 + SetPageUptodate(pages[i]);
37419 + }
37420 + return result;
37421 +}
37422 +
37423 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
37424 + * items */
37425 +
37426 +static int reserve_tail2extent_iteration(struct inode *inode)
37427 +{
37428 + reiser4_block_nr unformatted_nodes;
37429 + reiser4_tree *tree;
37430 +
37431 + tree = reiser4_tree_by_inode(inode);
37432 +
37433 + /* number of unformatted nodes which will be created */
37434 + unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
37435 +
37436 + /*
37437 + * space required for one iteration of tail->extent conversion:
37438 + *
37439 + * 1. kill N tail items
37440 + *
37441 + * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
37442 + *
37443 + * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
37444 + * extents) extent units.
37445 + *
37446 + * 4. drilling to the leaf level by coord_by_key()
37447 + *
37448 + * 5. possible update of stat-data
37449 + *
37450 + */
37451 + grab_space_enable();
37452 + return reiser4_grab_space
37453 + (2 * tree->height +
37454 + TAIL2EXTENT_PAGE_NUM +
37455 + TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
37456 + 1 + estimate_one_insert_item(tree) +
37457 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37458 +}
37459 +
37460 +/* clear stat data's flag indicating that the file is being converted */
37461 +static int complete_conversion(struct inode *inode)
37462 +{
37463 + int result;
37464 +
37465 + grab_space_enable();
37466 + result =
37467 + reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
37468 + BA_CAN_COMMIT);
37469 + if (result == 0) {
37470 + reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
37471 + result = reiser4_update_sd(inode);
37472 + }
37473 + if (result)
37474 + warning("vs-1696", "Failed to clear converting bit of %llu: %i",
37475 + (unsigned long long)get_inode_oid(inode), result);
37476 + return 0;
37477 +}
37478 +
37479 +/**
37480 + * find_start
37481 + * @inode:
37482 + * @id:
37483 + * @offset:
37484 + *
37485 + * this is used by tail2extent and extent2tail to detect where a previous
37486 + * incomplete conversion stopped
37487 + */
37488 +static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
37489 +{
37490 + int result;
37491 + lock_handle lh;
37492 + coord_t coord;
37493 + struct unix_file_info *ufo;
37494 + int found;
37495 + reiser4_key key;
37496 +
37497 + ufo = unix_file_inode_data(inode);
37498 + init_lh(&lh);
37499 + result = 0;
37500 + found = 0;
37501 + inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
37502 + do {
37503 + init_lh(&lh);
37504 + result = find_file_item_nohint(&coord, &lh, &key,
37505 + ZNODE_READ_LOCK, inode);
37506 +
37507 + if (result == CBK_COORD_FOUND) {
37508 + if (coord.between == AT_UNIT) {
37509 + /*coord_clear_iplug(&coord); */
37510 + result = zload(coord.node);
37511 + if (result == 0) {
37512 + if (item_id_by_coord(&coord) == id)
37513 + found = 1;
37514 + else
37515 + item_plugin_by_coord(&coord)->s.
37516 + file.append_key(&coord,
37517 + &key);
37518 + zrelse(coord.node);
37519 + }
37520 + } else
37521 + result = RETERR(-ENOENT);
37522 + }
37523 + done_lh(&lh);
37524 + } while (result == 0 && !found);
37525 + *offset = get_key_offset(&key);
37526 + return result;
37527 +}
37528 +
37529 +/**
37530 + * tail2extent
37531 + * @uf_info: unix-file specific part of the inode to convert
37532 + *
37533 + * Converts the file body from tail (formatting) items to extent items.
37534 + */
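+/*
+ * Editor's outline of the loop below, derived from the code itself: each
+ * iteration reserves space, copies up to TAIL2EXTENT_PAGE_NUM pages worth
+ * of tail items into freshly allocated pages, cuts those tail items and
+ * inserts extent units via replace(), then briefly drops exclusive access
+ * to throttle the conversion before the next iteration.
+ */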
37535 +int tail2extent(struct unix_file_info *uf_info)
37536 +{
37537 + int result;
37538 + reiser4_key key; /* key of next byte to be moved to page */
37539 + char *p_data; /* data of page */
37540 + unsigned page_off = 0, /* offset within the page where to copy data */
37541 + count; /* number of bytes of item which can be
37542 + * copied to page */
37543 + struct page *pages[TAIL2EXTENT_PAGE_NUM];
37544 + struct page *page;
37545 + int done; /* set to 1 when all file is read */
37546 + char *item;
37547 + int i;
37548 + struct inode *inode;
37549 + int first_iteration;
37550 + int bytes;
37551 + __u64 offset;
37552 +
37553 + assert("nikita-3362", ea_obtained(uf_info));
37554 + inode = unix_file_info_to_inode(uf_info);
37555 + assert("nikita-3412", !IS_RDONLY(inode));
37556 + assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
37557 + assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37558 +
37559 + offset = 0;
37560 + first_iteration = 1;
37561 + result = 0;
37562 + if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37563 +		/*
37564 +		 * the file is marked on disk as having a conversion which did
37565 +		 * not complete due to a crash or some error. Find the offset
37566 +		 * at which the tail conversion stopped
37567 +		 */
37568 + result = find_start(inode, FORMATTING_ID, &offset);
37569 + if (result == -ENOENT) {
37570 + /* no tail items found, everything is converted */
37571 + uf_info->container = UF_CONTAINER_EXTENTS;
37572 + complete_conversion(inode);
37573 + return 0;
37574 + } else if (result != 0)
37575 + /* some other error */
37576 + return result;
37577 + first_iteration = 0;
37578 + }
37579 +
37580 + reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37581 +
37582 + /* get key of first byte of a file */
37583 + inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
37584 +
37585 + done = 0;
37586 + while (done == 0) {
37587 + memset(pages, 0, sizeof(pages));
37588 + result = reserve_tail2extent_iteration(inode);
37589 + if (result != 0) {
37590 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37591 + goto out;
37592 + }
37593 + if (first_iteration) {
37594 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37595 + reiser4_update_sd(inode);
37596 + first_iteration = 0;
37597 + }
37598 + bytes = 0;
37599 + for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
37600 + assert("vs-598",
37601 + (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
37602 + page = alloc_page(reiser4_ctx_gfp_mask_get());
37603 + if (!page) {
37604 + result = RETERR(-ENOMEM);
37605 + goto error;
37606 + }
37607 +
37608 + page->index =
37609 + (unsigned long)(get_key_offset(&key) >>
37610 + PAGE_CACHE_SHIFT);
37611 +			/*
37612 +			 * usually one must not hold locked pages while going
37613 +			 * to longterm lock a znode (as find_file_item does,
37614 +			 * for instance). However, there is an exception for
37615 +			 * tail2extent: the pages appearing here are not
37616 +			 * reachable by anyone else, they are clean and have
37617 +			 * no jnodes attached, so keeping them locked does
37618 +			 * not risk a deadlock
37619 +			 */
37620 + assert("vs-983", !PagePrivate(page));
37621 + reiser4_invalidate_pages(inode->i_mapping, page->index,
37622 + 1, 0);
37623 +
37624 + for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
37625 + coord_t coord;
37626 + lock_handle lh;
37627 +
37628 + /* get next item */
37629 + /* FIXME: we might want to readahead here */
37630 + init_lh(&lh);
37631 + result =
37632 + find_file_item_nohint(&coord, &lh, &key,
37633 + ZNODE_READ_LOCK,
37634 + inode);
37635 + if (result != CBK_COORD_FOUND) {
37636 +					/*
37637 +					 * either an error happened or no
37638 +					 * items of the file were found
37639 +					 */
37640 + done_lh(&lh);
37641 + page_cache_release(page);
37642 + goto error;
37643 + }
37644 +
37645 + if (coord.between == AFTER_UNIT) {
37646 +					/*
37647 +					 * end of file is reached. Pad the
37648 +					 * page with zeros
37649 +					 */
37650 + done_lh(&lh);
37651 + done = 1;
37652 + p_data = kmap_atomic(page, KM_USER0);
37653 + memset(p_data + page_off, 0,
37654 + PAGE_CACHE_SIZE - page_off);
37655 + kunmap_atomic(p_data, KM_USER0);
37656 + break;
37657 + }
37658 +
37659 + result = zload(coord.node);
37660 + if (result) {
37661 + page_cache_release(page);
37662 + done_lh(&lh);
37663 + goto error;
37664 + }
37665 + assert("vs-856", coord.between == AT_UNIT);
37666 + item = ((char *)item_body_by_coord(&coord)) +
37667 + coord.unit_pos;
37668 +
37669 + /* how many bytes to copy */
37670 + count =
37671 + item_length_by_coord(&coord) -
37672 + coord.unit_pos;
37673 + /* limit length of copy to end of page */
37674 + if (count > PAGE_CACHE_SIZE - page_off)
37675 + count = PAGE_CACHE_SIZE - page_off;
37676 +
37677 + /*
37678 + * copy item (as much as will fit starting from
37679 + * the beginning of the item) into the page
37680 + */
37681 + p_data = kmap_atomic(page, KM_USER0);
37682 + memcpy(p_data + page_off, item, count);
37683 + kunmap_atomic(p_data, KM_USER0);
37684 +
37685 + page_off += count;
37686 + bytes += count;
37687 + set_key_offset(&key,
37688 + get_key_offset(&key) + count);
37689 +
37690 + zrelse(coord.node);
37691 + done_lh(&lh);
37692 +			} /* end of loop that fills one page with the content
37693 +			 * of formatting items */
37694 +
37695 + if (page_off) {
37696 + /* something was copied into page */
37697 + pages[i] = page;
37698 + } else {
37699 + page_cache_release(page);
37700 + assert("vs-1648", done == 1);
37701 + break;
37702 + }
37703 + } /* end of loop through pages of one conversion iteration */
37704 +
37705 + if (i > 0) {
37706 + result = replace(inode, pages, i, bytes);
37707 + release_all_pages(pages, sizeof_array(pages));
37708 + if (result)
37709 + goto error;
37710 +			/*
37711 +			 * We have to drop exclusive access to avoid a deadlock
37712 +			 * which may happen because capture_unix_file, called
37713 +			 * by reiser4_writepages, needs to get non-exclusive
37714 +			 * access to the file. It is safe to drop EA in the
37715 +			 * middle of tail2extent conversion because
37716 +			 * write_unix_file, setattr_unix_file (truncate),
37717 +			 * mmap_unix_file and release_unix_file (extent2tail)
37718 +			 * check whether a conversion is in progress (see
37719 +			 * comments before get_exclusive_access_careful()).
37720 +			 * Other processes that acquire non-exclusive access
37721 +			 * (read_unix_file, reiser4_writepages, etc) are able
37722 +			 * to work on partially converted files.
37723 +			 */
37724 + drop_exclusive_access(uf_info);
37725 + /* throttle the conversion
37726 + FIXME-EDWARD: Pass the precise number of pages
37727 + that was dirtied */
37728 + reiser4_throttle_write(inode, 1);
37729 + get_exclusive_access(uf_info);
37730 +
37731 + /*
37732 + * nobody is allowed to complete conversion but a
37733 + * process which started it
37734 + */
37735 + assert("", reiser4_inode_get_flag(inode,
37736 + REISER4_PART_MIXED));
37737 + }
37738 + }
37739 + if (result == 0) {
37740 + /* file is converted to extent items */
37741 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37742 + assert("vs-1697", reiser4_inode_get_flag(inode,
37743 + REISER4_PART_MIXED));
37744 +
37745 + uf_info->container = UF_CONTAINER_EXTENTS;
37746 + complete_conversion(inode);
37747 + } else {
37748 + /*
37749 + * conversion is not complete. Inode was already marked as
37750 + * REISER4_PART_MIXED and stat-data were updated at the first
37751 + * iteration of the loop above.
37752 + */
37753 + error:
37754 + release_all_pages(pages, sizeof_array(pages));
37755 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37756 + warning("edward-1548", "Partial conversion of %llu: %i",
37757 + (unsigned long long)get_inode_oid(inode), result);
37758 + }
37759 +
37760 + out:
37761 + /* this flag should be cleared, otherwise get_exclusive_access_careful()
37762 + will fall into infinite loop */
37763 + assert("edward-1549", !reiser4_inode_get_flag(inode,
37764 + REISER4_PART_IN_CONV));
37765 + return result;
37766 +}
37767 +
37768 +static int reserve_extent2tail_iteration(struct inode *inode)
37769 +{
37770 + reiser4_tree *tree;
37771 +
37772 + tree = reiser4_tree_by_inode(inode);
37773 + /*
37774 + * reserve blocks for (in this order):
37775 + *
37776 + * 1. removal of extent item
37777 + *
37778 + * 2. insertion of tail by insert_flow()
37779 + *
37780 + * 3. drilling to the leaf level by coord_by_key()
37781 + *
37782 + * 4. possible update of stat-data
37783 + */
37784 + grab_space_enable();
37785 + return reiser4_grab_space
37786 + (estimate_one_item_removal(tree) +
37787 + estimate_insert_flow(tree->height) +
37788 + 1 + estimate_one_insert_item(tree) +
37789 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37790 +}
37791 +
37792 +/* for every page of the file: read the page, cut the part of the extent
37793 + pointing to this page, and put the page data into the tree as a tail item */
37794 +int extent2tail(struct file * file, struct unix_file_info *uf_info)
37795 +{
37796 + int result;
37797 + struct inode *inode;
37798 + struct page *page;
37799 + unsigned long num_pages, i;
37800 + unsigned long start_page;
37801 + reiser4_key from;
37802 + reiser4_key to;
37803 + unsigned count;
37804 + __u64 offset;
37805 +
37806 + assert("nikita-3362", ea_obtained(uf_info));
37807 + inode = unix_file_info_to_inode(uf_info);
37808 + assert("nikita-3412", !IS_RDONLY(inode));
37809 + assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
37810 + assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37811 +
37812 + offset = 0;
37813 + if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37814 +		/*
37815 +		 * the file is marked on disk as having a conversion which did
37816 +		 * not complete due to a crash or some error. Find the offset
37817 +		 * at which the conversion stopped
37818 +		 */
37819 + result = find_start(inode, EXTENT_POINTER_ID, &offset);
37820 + if (result == -ENOENT) {
37821 + /* no extent found, everything is converted */
37822 + uf_info->container = UF_CONTAINER_TAILS;
37823 + complete_conversion(inode);
37824 + return 0;
37825 + } else if (result != 0)
37826 + /* some other error */
37827 + return result;
37828 + }
37829 +
37830 + reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37831 +
37832 + /* number of pages in the file */
37833 +	num_pages =
37834 +	    (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
37835 + start_page = offset >> PAGE_CACHE_SHIFT;
37836 +
37837 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
37838 + to = from;
37839 +
37840 + result = 0;
37841 + for (i = 0; i < num_pages; i++) {
37842 + __u64 start_byte;
37843 +
37844 + result = reserve_extent2tail_iteration(inode);
37845 + if (result != 0)
37846 + break;
37847 + if (i == 0 && offset == 0) {
37848 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37849 + reiser4_update_sd(inode);
37850 + }
37851 +
37852 + page = read_mapping_page(inode->i_mapping,
37853 + (unsigned)(i + start_page), NULL);
37854 + if (IS_ERR(page)) {
37855 + result = PTR_ERR(page);
37856 + break;
37857 + }
37858 +
37859 + wait_on_page_locked(page);
37860 +
37861 + if (!PageUptodate(page)) {
37862 + page_cache_release(page);
37863 + result = RETERR(-EIO);
37864 + break;
37865 + }
37866 +
37867 + /* cut part of file we have read */
37868 + start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT);
37869 + set_key_offset(&from, start_byte);
37870 + set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
37871 + /*
37872 + * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
37873 + * commits during over-long truncates. But
37874 + * extent->tail conversion should be performed in one
37875 + * transaction.
37876 + */
37877 + result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
37878 + &to, inode, 0);
37879 +
37880 + if (result) {
37881 + page_cache_release(page);
37882 + break;
37883 + }
37884 +
37885 + /* put page data into tree via tail_write */
37886 + count = PAGE_CACHE_SIZE;
37887 + if ((i == (num_pages - 1)) &&
37888 + (inode->i_size & ~PAGE_CACHE_MASK))
37889 +			/* the last page may be incomplete */
37890 + count = (inode->i_size & ~PAGE_CACHE_MASK);
37891 + while (count) {
37892 + loff_t pos = start_byte;
37893 +
37894 + assert("edward-1537",
37895 + file != NULL && file->f_dentry != NULL);
37896 + assert("edward-1538",
37897 + file->f_dentry->d_inode == inode);
37898 +
37899 + result = reiser4_write_tail(file, inode,
37900 + (char __user *)kmap(page),
37901 + count, &pos);
37902 + reiser4_free_file_fsdata(file);
37903 + if (result <= 0) {
37904 + warning("", "reiser4_write_tail failed");
37905 + page_cache_release(page);
37906 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37907 + return result;
37908 + }
37909 + count -= result;
37910 + }
37911 +
37912 + /* release page */
37913 + lock_page(page);
37914 + /* page is already detached from jnode and mapping. */
37915 + assert("vs-1086", page->mapping == NULL);
37916 + assert("nikita-2690",
37917 + (!PagePrivate(page) && jprivate(page) == 0));
37918 + /* waiting for writeback completion with page lock held is
37919 + * perfectly valid. */
37920 + wait_on_page_writeback(page);
37921 + reiser4_drop_page(page);
37922 + /* release reference taken by read_cache_page() above */
37923 + page_cache_release(page);
37924 +
37925 + drop_exclusive_access(uf_info);
37926 + /*
37927 + * throttle the conversion.
37928 + * FIXME-EDWARD: Calculate and pass the precise number
37929 + * of pages that was dirtied
37930 + */
37931 + reiser4_throttle_write(inode, 1);
37932 + get_exclusive_access(uf_info);
37933 + /*
37934 + * nobody is allowed to complete conversion but a process which
37935 + * started it
37936 + */
37937 + assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
37938 + }
37939 +
37940 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37941 +
37942 + if (i == num_pages) {
37943 + /* file is converted to formatted items */
37944 + assert("vs-1698", reiser4_inode_get_flag(inode,
37945 + REISER4_PART_MIXED));
37946 + assert("vs-1260",
37947 + inode_has_no_jnodes(reiser4_inode_data(inode)));
37948 +
37949 + uf_info->container = UF_CONTAINER_TAILS;
37950 + complete_conversion(inode);
37951 + return 0;
37952 + }
37953 + /*
37954 + * conversion is not complete. Inode was already marked as
37955 + * REISER4_PART_MIXED and stat-data were updated at the first
37956 + * iteration of the loop above.
37957 + */
37958 + warning("nikita-2282",
37959 + "Partial conversion of %llu: %lu of %lu: %i",
37960 + (unsigned long long)get_inode_oid(inode), i,
37961 + num_pages, result);
37962 +
37963 + /* this flag should be cleared, otherwise get_exclusive_access_careful()
37964 + will fall into infinite loop */
37965 + assert("edward-1550", !reiser4_inode_get_flag(inode,
37966 + REISER4_PART_IN_CONV));
37967 + return result;
37968 +}
37969 +
37970 +/*
37971 + * Local variables:
37972 + * c-indentation-style: "K&R"
37973 + * mode-name: "LC"
37974 + * c-basic-offset: 8
37975 + * tab-width: 8
37976 + * fill-column: 79
37977 + * scroll-step: 1
37978 + * End:
37979 + */
37980 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file_ops.c linux-2.6.33/fs/reiser4/plugin/file_ops.c
37981 --- linux-2.6.33.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 01:00:00.000000000 +0100
37982 +++ linux-2.6.33/fs/reiser4/plugin/file_ops.c 2010-03-04 19:33:22.000000000 +0100
37983 @@ -0,0 +1,162 @@
37984 +/* Copyright 2005 by Hans Reiser, licensing governed by
37985 + reiser4/README */
37986 +
37987 +/* this file contains typical implementations for some of methods of
37988 + struct file_operations and of struct address_space_operations
37989 +*/
37990 +
37991 +#include "../inode.h"
37992 +#include "object.h"
37993 +
37994 +/* file operations */
37995 +
37996 +/* implementation of vfs's llseek method of struct file_operations for
37997 + typical directory can be found in readdir_common.c
37998 +*/
37999 +loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
38000 +
38001 +/* implementation of vfs's readdir method of struct file_operations for
38002 + typical directory can be found in readdir_common.c
38003 +*/
38004 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
38005 +
38006 +/**
38007 + * reiser4_release_dir_common - release of struct file_operations
38008 + * @inode: inode of released file
38009 + * @file: file to release
38010 + *
38011 + * Implementation of release method of struct file_operations for typical
38012 + * directory. All it does is freeing of reiser4 specific file data.
38013 +*/
38014 +int reiser4_release_dir_common(struct inode *inode, struct file *file)
38015 +{
38016 + reiser4_context *ctx;
38017 +
38018 + ctx = reiser4_init_context(inode->i_sb);
38019 + if (IS_ERR(ctx))
38020 + return PTR_ERR(ctx);
38021 + reiser4_free_file_fsdata(file);
38022 + reiser4_exit_context(ctx);
38023 + return 0;
38024 +}
38025 +
38026 +/* this is common implementation of vfs's fsync method of struct
38027 + file_operations
38028 +*/
38029 +int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
38030 +{
38031 + reiser4_context *ctx;
38032 + int result;
38033 +
38034 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
38035 + if (IS_ERR(ctx))
38036 + return PTR_ERR(ctx);
38037 + result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
38038 +
38039 + context_set_commit_async(ctx);
38040 + reiser4_exit_context(ctx);
38041 + return result;
38042 +}
38043 +
38044 +/*
38045 + * common sync method for regular files.
38046 + *
38047 + * We are trying to be smart here. Instead of committing all atoms (original
38048 + * solution), we scan dirty pages of this file and commit all atoms they are
38049 + * part of.
38050 + *
38051 + * Situation is complicated by anonymous pages: i.e., extent-less pages
38052 + * dirtied through mmap. Fortunately sys_fsync() first calls
38053 + * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
38054 + * all missing extents and capture anonymous pages.
38055 + */
38056 +int reiser4_sync_file_common(struct file *file,
38057 + struct dentry *dentry, int datasync)
38058 +{
38059 + reiser4_context *ctx;
38060 + txn_atom *atom;
38061 + reiser4_block_nr reserve;
38062 +
38063 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
38064 + if (IS_ERR(ctx))
38065 + return PTR_ERR(ctx);
38066 +
38067 + reserve = estimate_update_common(dentry->d_inode);
38068 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
38069 + reiser4_exit_context(ctx);
38070 + return RETERR(-ENOSPC);
38071 + }
38072 + write_sd_by_inode_common(dentry->d_inode);
38073 +
38074 + atom = get_current_atom_locked();
38075 + spin_lock_txnh(ctx->trans);
38076 + force_commit_atom(ctx->trans);
38077 + reiser4_exit_context(ctx);
38078 + return 0;
38079 +}
38080 +
38081 +
38082 +/* address space operations */
38083 +
38084 +
38085 +/* this is helper for plugin->write_begin() */
38086 +int do_prepare_write(struct file *file, struct page *page, unsigned from,
38087 + unsigned to)
38088 +{
38089 + int result;
38090 + file_plugin *fplug;
38091 + struct inode *inode;
38092 +
38093 + assert("umka-3099", file != NULL);
38094 + assert("umka-3100", page != NULL);
38095 + assert("umka-3095", PageLocked(page));
38096 +
38097 + if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
38098 + return 0;
38099 +
38100 + inode = page->mapping->host;
38101 + fplug = inode_file_plugin(inode);
38102 +
38103 + if (page->mapping->a_ops->readpage == NULL)
38104 + return RETERR(-EINVAL);
38105 +
38106 + result = page->mapping->a_ops->readpage(file, page);
38107 + if (result != 0) {
38108 + SetPageError(page);
38109 + ClearPageUptodate(page);
38110 + /* All reiser4 readpage() implementations should return the
38111 + * page locked in case of error. */
38112 + assert("nikita-3472", PageLocked(page));
38113 + } else {
38114 +		/*
38115 +		 * ->readpage() either:
38116 +		 *
38117 +		 * 1. starts IO against @page. @page is locked for IO in
38118 +		 * this case.
38119 +		 *
38120 +		 * 2. doesn't start IO. @page is unlocked.
38121 +		 *
38122 +		 * In either case we need @page locked again, so lock it here.
38123 +		 */
38124 + lock_page(page);
38125 + /*
38126 + * IO (if any) is completed at this point. Check for IO
38127 + * errors.
38128 + */
38129 + if (!PageUptodate(page))
38130 + result = RETERR(-EIO);
38131 + }
38132 + assert("umka-3098", PageLocked(page));
38133 + return result;
38134 +}
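+/*
+ * Editor's sketch of a typical caller (hypothetical, for illustration
+ * only): a write_begin-style hook needs the read-modify-write step of
+ * do_prepare_write() only when the write does not cover the whole page.
+ */
+#if 0
+static int example_write_begin(struct file *file, struct page *page,
+			       unsigned from, unsigned to)
+{
+	/* bring @page uptodate unless [from, to) spans the whole page */
+	return do_prepare_write(file, page, from, to);
+}
+#endif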
38135 +
38136 +/*
38137 + * Local variables:
38138 + * c-indentation-style: "K&R"
38139 + * mode-name: "LC"
38140 + * c-basic-offset: 8
38141 + * tab-width: 8
38142 + * fill-column: 79
38143 + * scroll-step: 1
38144 + * End:
38145 + */
38146 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.33/fs/reiser4/plugin/file_ops_readdir.c
38147 --- linux-2.6.33.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 01:00:00.000000000 +0100
38148 +++ linux-2.6.33/fs/reiser4/plugin/file_ops_readdir.c 2010-03-04 19:33:22.000000000 +0100
38149 @@ -0,0 +1,658 @@
38150 +/* Copyright 2005 by Hans Reiser, licensing governed by
38151 + * reiser4/README */
38152 +
38153 +#include "../inode.h"
38154 +
38155 +/* return true iff @coord points to a valid directory item that is part of
38156 + * the @inode directory. */
38157 +static int is_valid_dir_coord(struct inode *inode, coord_t *coord)
38158 +{
38159 + return plugin_of_group(item_plugin_by_coord(coord),
38160 + DIR_ENTRY_ITEM_TYPE) &&
38161 + inode_file_plugin(inode)->owns_item(inode, coord);
38162 +}
38163 +
38164 +/* compare two logical positions within the same directory */
38165 +static cmp_t dir_pos_cmp(const struct dir_pos *p1, const struct dir_pos *p2)
38166 +{
38167 + cmp_t result;
38168 +
38169 + assert("nikita-2534", p1 != NULL);
38170 + assert("nikita-2535", p2 != NULL);
38171 +
38172 + result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
38173 + if (result == EQUAL_TO) {
38174 + int diff;
38175 +
38176 + diff = p1->pos - p2->pos;
38177 + result =
38178 + (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
38179 + }
38180 + return result;
38181 +}
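+/*
+ * Editor's example: positions with equal dir_entry_key compare by the
+ * duplicate-key ordinal, so (key K, pos 2) < (key K, pos 5); positions
+ * with different keys compare by de_id_cmp() alone.
+ */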
38182 +
38183 +/* see comment before reiser4_readdir_common() for overview of why "adjustment"
38184 + * is necessary. */
38185 +static void
38186 +adjust_dir_pos(struct file *dir, struct readdir_pos *readdir_spot,
38187 + const struct dir_pos *mod_point, int adj)
38188 +{
38189 + struct dir_pos *pos;
38190 +
38191 + /*
38192 + * new directory entry was added (adj == +1) or removed (adj == -1) at
38193 + * the @mod_point. Directory file descriptor @dir is doing readdir and
38194 + * is currently positioned at @readdir_spot. Latter has to be updated
38195 + * to maintain stable readdir.
38196 + */
38197 + /* directory is positioned to the beginning. */
38198 + if (readdir_spot->entry_no == 0)
38199 + return;
38200 +
38201 + pos = &readdir_spot->position;
38202 + switch (dir_pos_cmp(mod_point, pos)) {
38203 + case LESS_THAN:
38204 + /* @mod_pos is _before_ @readdir_spot, that is, entry was
38205 + * added/removed on the left (in key order) of current
38206 + * position. */
38207 +		/* the logical number of the directory entry readdir is
38208 +		 * "looking" at changes */
38209 + readdir_spot->entry_no += adj;
38210 + assert("nikita-2577",
38211 + ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
38212 + if (de_id_cmp(&pos->dir_entry_key,
38213 + &mod_point->dir_entry_key) == EQUAL_TO) {
38214 + assert("nikita-2575", mod_point->pos < pos->pos);
38215 + /*
38216 + * if entry added/removed has the same key as current
38217 + * for readdir, update counter of duplicate keys in
38218 + * @readdir_spot.
38219 + */
38220 + pos->pos += adj;
38221 + }
38222 + break;
38223 + case GREATER_THAN:
38224 + /* directory is modified after @pos: nothing to do. */
38225 + break;
38226 + case EQUAL_TO:
38227 + /* cannot insert an entry readdir is looking at, because it
38228 + already exists. */
38229 + assert("nikita-2576", adj < 0);
38230 + /* directory entry to which @pos points to is being
38231 + removed.
38232 +
38233 + NOTE-NIKITA: Right thing to do is to update @pos to point
38234 + to the next entry. This is complex (we are under spin-lock
38235 + for one thing). Just rewind it to the beginning. Next
38236 + readdir will have to scan the beginning of
38237 + directory. Proper solution is to use semaphore in
38238 + spin lock's stead and use rewind_right() here.
38239 +
38240 + NOTE-NIKITA: now, semaphore is used, so...
38241 + */
38242 + memset(readdir_spot, 0, sizeof *readdir_spot);
38243 + }
38244 +}
38245 +
38246 +/* scan all file-descriptors for this directory and adjust their
38247 + positions respectively. Should be used by implementations of
38248 + add_entry and rem_entry of dir plugin */
38249 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
38250 + int offset, int adj)
38251 +{
38252 + reiser4_file_fsdata *scan;
38253 + struct dir_pos mod_point;
38254 +
38255 + assert("nikita-2536", dir != NULL);
38256 + assert("nikita-2538", de != NULL);
38257 + assert("nikita-2539", adj != 0);
38258 +
38259 + build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
38260 + mod_point.pos = offset;
38261 +
38262 + spin_lock_inode(dir);
38263 +
38264 + /*
38265 + * new entry was added/removed in directory @dir. Scan all file
38266 + * descriptors for @dir that are currently involved into @readdir and
38267 + * update them.
38268 + */
38269 +
38270 + list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
38271 + adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
38272 +
38273 + spin_unlock_inode(dir);
38274 +}
38275 +
38276 +/*
38277 + * traverse tree to start/continue readdir from the readdir position @pos.
38278 + */
38279 +static int dir_go_to(struct file *dir, struct readdir_pos *pos, tap_t *tap)
38280 +{
38281 + reiser4_key key;
38282 + int result;
38283 + struct inode *inode;
38284 +
38285 + assert("nikita-2554", pos != NULL);
38286 +
38287 + inode = dir->f_dentry->d_inode;
38288 + result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
38289 + if (result != 0)
38290 + return result;
38291 + result = reiser4_object_lookup(inode,
38292 + &key,
38293 + tap->coord,
38294 + tap->lh,
38295 + tap->mode,
38296 + FIND_EXACT,
38297 + LEAF_LEVEL, LEAF_LEVEL,
38298 + 0, &tap->ra_info);
38299 + if (result == CBK_COORD_FOUND)
38300 + result = rewind_right(tap, (int)pos->position.pos);
38301 + else {
38302 + tap->coord->node = NULL;
38303 + done_lh(tap->lh);
38304 + result = RETERR(-EIO);
38305 + }
38306 + return result;
38307 +}
38308 +
38309 +/*
38310 + * handling of non-unique keys: calculate at what ordinal position within
38311 + * sequence of directory items with identical keys @pos is.
38312 + */
38313 +static int set_pos(struct inode *inode, struct readdir_pos *pos, tap_t *tap)
38314 +{
38315 + int result;
38316 + coord_t coord;
38317 + lock_handle lh;
38318 + tap_t scan;
38319 + de_id *did;
38320 + reiser4_key de_key;
38321 +
38322 + coord_init_zero(&coord);
38323 + init_lh(&lh);
38324 + reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
38325 + reiser4_tap_copy(&scan, tap);
38326 + reiser4_tap_load(&scan);
38327 + pos->position.pos = 0;
38328 +
38329 + did = &pos->position.dir_entry_key;
38330 +
38331 + if (is_valid_dir_coord(inode, scan.coord)) {
38332 +
38333 + build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
38334 +
38335 + while (1) {
38336 +
38337 + result = go_prev_unit(&scan);
38338 + if (result != 0)
38339 + break;
38340 +
38341 + if (!is_valid_dir_coord(inode, scan.coord)) {
38342 + result = -EINVAL;
38343 + break;
38344 + }
38345 +
38346 + /* get key of directory entry */
38347 + unit_key_by_coord(scan.coord, &de_key);
38348 + if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
38349 + /* duplicate-sequence is over */
38350 + break;
38351 + }
38352 + pos->position.pos++;
38353 + }
38354 + } else
38355 + result = RETERR(-ENOENT);
38356 + reiser4_tap_relse(&scan);
38357 + reiser4_tap_done(&scan);
38358 + return result;
38359 +}
38360 +
38361 +/*
38362 + * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
38363 + */
38364 +static int dir_rewind(struct file *dir, struct readdir_pos *pos, tap_t *tap)
38365 +{
38366 + __u64 destination;
38367 + __s64 shift;
38368 + int result;
38369 + struct inode *inode;
38370 + loff_t dirpos;
38371 +
38372 + assert("nikita-2553", dir != NULL);
38373 + assert("nikita-2548", pos != NULL);
38374 + assert("nikita-2551", tap->coord != NULL);
38375 + assert("nikita-2552", tap->lh != NULL);
38376 +
38377 + dirpos = reiser4_get_dir_fpos(dir);
38378 + shift = dirpos - pos->fpos;
38379 + /* this is logical directory entry within @dir which we are rewinding
38380 + * to */
38381 + destination = pos->entry_no + shift;
38382 +
38383 + inode = dir->f_dentry->d_inode;
38384 + if (dirpos < 0)
38385 + return RETERR(-EINVAL);
38386 + else if (destination == 0ll || dirpos == 0) {
38387 + /* rewind to the beginning of directory */
38388 + memset(pos, 0, sizeof *pos);
38389 + return dir_go_to(dir, pos, tap);
38390 + } else if (destination >= inode->i_size)
38391 + return RETERR(-ENOENT);
38392 +
38393 + if (shift < 0) {
38394 + /* I am afraid of negative numbers */
38395 + shift = -shift;
38396 + /* rewinding to the left */
38397 + if (shift <= (int)pos->position.pos) {
38398 + /* destination is within sequence of entries with
38399 + duplicate keys. */
38400 + result = dir_go_to(dir, pos, tap);
38401 + } else {
38402 + shift -= pos->position.pos;
38403 + while (1) {
38404 + /* repetitions: deadlock is possible when
38405 + going to the left. */
38406 + result = dir_go_to(dir, pos, tap);
38407 + if (result == 0) {
38408 + result = rewind_left(tap, shift);
38409 + if (result == -E_DEADLOCK) {
38410 + reiser4_tap_done(tap);
38411 + continue;
38412 + }
38413 + }
38414 + break;
38415 + }
38416 + }
38417 + } else {
38418 + /* rewinding to the right */
38419 + result = dir_go_to(dir, pos, tap);
38420 + if (result == 0)
38421 + result = rewind_right(tap, shift);
38422 + }
38423 + if (result == 0) {
38424 + result = set_pos(inode, pos, tap);
38425 + if (result == 0) {
38426 + /* update pos->position.pos */
38427 + pos->entry_no = destination;
38428 + pos->fpos = dirpos;
38429 + }
38430 + }
38431 + return result;
38432 +}
38433 +
38434 +/*
38435 + * Function that is called by reiser4_readdir_common() on each directory entry
38436 + * while doing readdir. The ->filldir callback may block, so we have to release
38437 + * the long-term lock while calling it. To avoid repeating the tree traversal,
38438 + * a seal is used. If the seal is broken, we return -E_REPEAT; node is unlocked.
38439 + *
38440 + * Whether node is unlocked in case of any other error is undefined. It is
38441 + * guaranteed to be still locked if success (0) is returned.
38442 + *
38443 + * When ->filldir() wants no more, feed_entry() returns 1, and node is
38444 + * unlocked.
38445 + */
38446 +static int
38447 +feed_entry(struct file *f, struct readdir_pos *pos, tap_t *tap,
38448 + filldir_t filldir, void *dirent)
38449 +{
38450 + item_plugin *iplug;
38451 + char *name;
38452 + reiser4_key sd_key;
38453 + int result;
38454 + char buf[DE_NAME_BUF_LEN];
38455 + char name_buf[32];
38456 + char *local_name;
38457 + unsigned file_type;
38458 + seal_t seal;
38459 + coord_t *coord;
38460 + reiser4_key entry_key;
38461 +
38462 + coord = tap->coord;
38463 + iplug = item_plugin_by_coord(coord);
38464 +
38465 + /* pointer to name within the node */
38466 + name = iplug->s.dir.extract_name(coord, buf);
38467 + assert("nikita-1371", name != NULL);
38468 +
38469 + /* key of object the entry points to */
38470 + if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
38471 + return RETERR(-EIO);
38472 +
38473 + /* we must release longterm znode lock before calling filldir to avoid
38474 + deadlock which may happen if filldir causes page fault. So, copy
38475 + name to intermediate buffer */
38476 + if (strlen(name) + 1 > sizeof(name_buf)) {
38477 + local_name = kmalloc(strlen(name) + 1,
38478 + reiser4_ctx_gfp_mask_get());
38479 + if (local_name == NULL)
38480 + return RETERR(-ENOMEM);
38481 + } else
38482 + local_name = name_buf;
38483 +
38484 + strcpy(local_name, name);
38485 + file_type = iplug->s.dir.extract_file_type(coord);
38486 +
38487 + unit_key_by_coord(coord, &entry_key);
38488 + reiser4_seal_init(&seal, coord, &entry_key);
38489 +
38490 + longterm_unlock_znode(tap->lh);
38491 +
38492 + /*
38493 + * send information about directory entry to the ->filldir() filler
38494 + * supplied to us by caller (VFS).
38495 + *
38496 + * ->filldir is entitled to do weird things. For example, ->filldir
38497 + * supplied by knfsd re-enters file system. Make sure no locks are
38498 + * held.
38499 + */
38500 + assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
38501 +
38502 + reiser4_txn_restart_current();
38503 + result = filldir(dirent, name, (int)strlen(name),
38504 + /* offset of this entry */
38505 + f->f_pos,
38506 + /* inode number of object bounden by this entry */
38507 + oid_to_uino(get_key_objectid(&sd_key)), file_type);
38508 + if (local_name != name_buf)
38509 + kfree(local_name);
38510 + if (result < 0)
38511 + /* ->filldir() is satisfied. (no space in buffer, IOW) */
38512 + result = 1;
38513 + else
38514 + result = reiser4_seal_validate(&seal, coord, &entry_key,
38515 + tap->lh, tap->mode,
38516 + ZNODE_LOCK_HIPRI);
38517 + return result;
38518 +}
38519 +
38520 +static void move_entry(struct readdir_pos *pos, coord_t *coord)
38521 +{
38522 + reiser4_key de_key;
38523 + de_id *did;
38524 +
38525 + /* update @pos */
38526 + ++pos->entry_no;
38527 + did = &pos->position.dir_entry_key;
38528 +
38529 + /* get key of directory entry */
38530 + unit_key_by_coord(coord, &de_key);
38531 +
38532 + if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
38533 + /* we are within sequence of directory entries
38534 + with duplicate keys. */
38535 + ++pos->position.pos;
38536 + else {
38537 + pos->position.pos = 0;
38538 + build_de_id_by_key(&de_key, did);
38539 + }
38540 + ++pos->fpos;
38541 +}
38542 +
38543 +/*
38544 + * STATELESS READDIR
38545 + *
38546 + * readdir support in reiser4 relies on ability to update readdir_pos embedded
38547 + * into reiser4_file_fsdata on each directory modification (name insertion and
38548 + * removal), see reiser4_readdir_common() function below. This obviously doesn't
38549 + * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
38550 + * across client READDIR requests for the same directory.
38551 + *
38552 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
38553 + * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
38554 + * find detached reiser4_file_fsdata corresponding to previous readdir
38555 + * request. In other words, additional state is maintained on the
38556 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
38557 + *
38558 + * To efficiently detect when our ->readdir() method is called by NFS server,
38559 + * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
38560 + * file_is_stateless() function).
38561 + *
38562 + * To find out d_cursor in the pool, we encode client id (cid) in the highest
38563 + * bits of NFS readdir cookie: when first readdir request comes to the given
38564 + * directory from the given client, cookie is set to 0. This situation is
38565 + * detected, global cid_counter is incremented, and stored in highest bits of
38566 + * all direntry offsets returned to the client, including last one. As the
38567 + * only valid readdir cookie is one obtained as direntry->offset, we are
38568 + * guaranteed that next readdir request (continuing current one) will have
38569 + * current cid in the highest bits of starting readdir cookie. All d_cursors
38570 + * are hashed into per-super-block hash table by (oid, cid) key.
38571 + *
38572 + * In addition d_cursors are placed into per-super-block radix tree where they
38573 + * are keyed by oid alone. This is necessary to efficiently remove them during
38574 + * rmdir.
38575 + *
38576 + * At last, currently unused d_cursors are linked into a special list. This
38577 + * list is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
38578 + *
38579 + */
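+/*
+ * A minimal sketch of the cookie packing described above. The helper names
+ * and the split point (CID_SHIFT) are the editor's assumptions for
+ * illustration, not identifiers from the reiser4 source.
+ */
+#if 0
+#define CID_SHIFT 48	/* assumed: high 16 bits of the cookie hold the cid */
+
+static inline __u64 pack_readdir_cookie(__u16 cid, __u64 pos)
+{
+	/* client id in the highest bits, readdir position below it */
+	return ((__u64)cid << CID_SHIFT) | (pos & ((1ull << CID_SHIFT) - 1));
+}
+
+static inline __u16 cookie_cid(__u64 cookie)
+{
+	return (__u16)(cookie >> CID_SHIFT);
+}
+#endif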
38580 +
38581 +/*
38582 + * prepare for readdir.
38583 + */
38584 +static int dir_readdir_init(struct file *f, tap_t *tap,
38585 + struct readdir_pos **pos)
38586 +{
38587 + struct inode *inode;
38588 + reiser4_file_fsdata *fsdata;
38589 + int result;
38590 +
38591 + assert("nikita-1359", f != NULL);
38592 + inode = f->f_dentry->d_inode;
38593 + assert("nikita-1360", inode != NULL);
38594 +
38595 + if (!S_ISDIR(inode->i_mode))
38596 + return RETERR(-ENOTDIR);
38597 +
38598 + /* try to find detached readdir state */
38599 + result = reiser4_attach_fsdata(f, inode);
38600 + if (result != 0)
38601 + return result;
38602 +
38603 + fsdata = reiser4_get_file_fsdata(f);
38604 + assert("nikita-2571", fsdata != NULL);
38605 + if (IS_ERR(fsdata))
38606 + return PTR_ERR(fsdata);
38607 +
38608 +	/* add file descriptor to the readdir list hanging off the directory
38609 +	 * inode. This list is used to scan "readdirs-in-progress" while
38610 +	 * inserting or removing names in the directory. */
38611 + spin_lock_inode(inode);
38612 + if (list_empty_careful(&fsdata->dir.linkage))
38613 + list_add(&fsdata->dir.linkage, get_readdir_list(inode));
38614 + *pos = &fsdata->dir.readdir;
38615 + spin_unlock_inode(inode);
38616 +
38617 + /* move @tap to the current position */
38618 + return dir_rewind(f, *pos, tap);
38619 +}
38620 +
38621 +/* this is implementation of vfs's llseek method of struct file_operations for
38622 + typical directory
38623 + See comment before reiser4_readdir_common() for explanation.
38624 +*/
38625 +loff_t reiser4_llseek_dir_common(struct file *file, loff_t off, int origin)
38626 +{
38627 + reiser4_context *ctx;
38628 + loff_t result;
38629 + struct inode *inode;
38630 +
38631 + inode = file->f_dentry->d_inode;
38632 +
38633 + ctx = reiser4_init_context(inode->i_sb);
38634 + if (IS_ERR(ctx))
38635 + return PTR_ERR(ctx);
38636 +
38637 + mutex_lock(&inode->i_mutex);
38638 +
38639 + /* update ->f_pos */
38640 + result = default_llseek(file, off, origin);
38641 + if (result >= 0) {
38642 + int ff;
38643 + coord_t coord;
38644 + lock_handle lh;
38645 + tap_t tap;
38646 + struct readdir_pos *pos;
38647 +
38648 + coord_init_zero(&coord);
38649 + init_lh(&lh);
38650 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38651 +
38652 + ff = dir_readdir_init(file, &tap, &pos);
38653 + reiser4_detach_fsdata(file);
38654 + if (ff != 0)
38655 + result = (loff_t) ff;
38656 + reiser4_tap_done(&tap);
38657 + }
38658 + reiser4_detach_fsdata(file);
38659 + mutex_unlock(&inode->i_mutex);
38660 +
38661 + reiser4_exit_context(ctx);
38662 + return result;
38663 +}
38664 +
38665 +/* this is common implementation of vfs's readdir method of struct
38666 + file_operations
38667 +
38668 + readdir problems:
38669 +
38670 + readdir(2)/getdents(2) interface is based on implicit assumption that
38671 + readdir can be restarted from any particular point by supplying file system
38672 + with off_t-full of data. That is, file system fills ->d_off field in struct
38673 + dirent and later user passes ->d_off to the seekdir(3), which is, actually,
38674 + implemented by glibc as lseek(2) on directory.
38675 +
38676 +   Reiser4 cannot restart readdir from 64 bits of data, because the last two
38677 +   components of the directory entry key are then unknown: the key is 128
38678 +   bits, and while its locality and type fields are always known, to start
38679 +   readdir() from a given point the objectid and offset fields have to be
38680 +   filled in as well.
38681 +
38682 +   The traditional UNIX API for scanning through a directory
38683 +   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on
38684 +   the assumption that a directory is structured very much like a regular
38685 +   file; in particular, it is implied that each name within a given directory
38686 +   (directory entry) can be uniquely identified by a scalar offset and that
38687 +   such offset is stable across the life-time of the name it identifies.
38688 +
38689 +   This is manifestly not so for reiser4. In reiser4 the only stable unique
38690 +   identifier for a directory entry is its key, which doesn't fit into the
38691 +   seekdir/telldir API.
38692 +
38693 + solution:
38694 +
38695 +   Within each file descriptor participating in readdir-ing of a directory,
38696 +   a plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track
38697 +   of the "current" directory entry that the file descriptor looks at. It
38698 +   contains the key of the directory entry (plus some additional info to deal
38699 +   with non-unique keys that we won't dwell on here) and the logical position
38700 +   of this directory entry from the beginning of the directory, that is, the
38701 +   ordinal number of this entry in the readdir order.
38702 +
38703 + Obviously this logical position is not stable in the face of directory
38704 + modifications. To work around this, on each addition or removal of directory
38705 + entry all file descriptors for directory inode are scanned and their
38706 + readdir_pos are updated accordingly (adjust_dir_pos()).
38707 +*/
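+/*
+ * Editor's illustration of the adjustment above: suppose a descriptor is at
+ * entry_no == 5 and a name that sorts before the current one is removed
+ * (adj == -1); adjust_dir_pos() decrements entry_no to 4, so the next
+ * readdir continues from the same name. An insertion to the left
+ * (adj == +1) bumps it to 6 for the same reason.
+ */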
38708 +int reiser4_readdir_common(struct file *f /* directory file being read */,
38709 + void *dirent /* opaque data passed to us by VFS */,
38710 + filldir_t filld /* filler function passed to us
38711 + * by VFS */)
38712 +{
38713 + reiser4_context *ctx;
38714 + int result;
38715 + struct inode *inode;
38716 + coord_t coord;
38717 + lock_handle lh;
38718 + tap_t tap;
38719 + struct readdir_pos *pos;
38720 +
38721 + assert("nikita-1359", f != NULL);
38722 + inode = f->f_dentry->d_inode;
38723 + assert("nikita-1360", inode != NULL);
38724 +
38725 + if (!S_ISDIR(inode->i_mode))
38726 + return RETERR(-ENOTDIR);
38727 +
38728 + ctx = reiser4_init_context(inode->i_sb);
38729 + if (IS_ERR(ctx))
38730 + return PTR_ERR(ctx);
38731 +
38732 + coord_init_zero(&coord);
38733 + init_lh(&lh);
38734 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38735 +
38736 + reiser4_readdir_readahead_init(inode, &tap);
38737 +
38738 +repeat:
38739 + result = dir_readdir_init(f, &tap, &pos);
38740 + if (result == 0) {
38741 + result = reiser4_tap_load(&tap);
38742 + /* scan entries one by one feeding them to @filld */
38743 + while (result == 0) {
38744 + coord_t *coord;
38745 +
38746 + coord = tap.coord;
38747 + assert("nikita-2572", coord_is_existing_unit(coord));
38748 + assert("nikita-3227", is_valid_dir_coord(inode, coord));
38749 +
38750 + result = feed_entry(f, pos, &tap, filld, dirent);
38751 + if (result > 0) {
38752 + break;
38753 + } else if (result == 0) {
38754 + ++f->f_pos;
38755 + result = go_next_unit(&tap);
38756 + if (result == -E_NO_NEIGHBOR ||
38757 + result == -ENOENT) {
38758 + result = 0;
38759 + break;
38760 + } else if (result == 0) {
38761 + if (is_valid_dir_coord(inode, coord))
38762 + move_entry(pos, coord);
38763 + else
38764 + break;
38765 + }
38766 + } else if (result == -E_REPEAT) {
38767 + /* feed_entry() had to restart. */
38768 + ++f->f_pos;
38769 + reiser4_tap_relse(&tap);
38770 + goto repeat;
38771 + } else
38772 + warning("vs-1617",
38773 + "reiser4_readdir_common: unexpected error %d",
38774 + result);
38775 + }
38776 + reiser4_tap_relse(&tap);
38777 +
38778 + if (result >= 0)
38779 + f->f_version = inode->i_version;
38780 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
38781 + result = 0;
38782 + reiser4_tap_done(&tap);
38783 + reiser4_detach_fsdata(f);
38784 +
38785 + /* try to update directory's atime */
38786 + if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
38787 + BA_CAN_COMMIT) != 0)
38788 + warning("", "failed to update atime on readdir: %llu",
38789 + get_inode_oid(inode));
38790 + else
38791 + file_accessed(f);
38792 +
38793 + context_set_commit_async(ctx);
38794 + reiser4_exit_context(ctx);
38795 +
38796 + return (result <= 0) ? result : 0;
38797 +}
38798 +
38799 +/*
38800 + * Local variables:
38801 + * c-indentation-style: "K&R"
38802 + * mode-name: "LC"
38803 + * c-basic-offset: 8
38804 + * tab-width: 8
38805 + * fill-column: 79
38806 + * End:
38807 + */
38808 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.33/fs/reiser4/plugin/file_plugin_common.c
38809 --- linux-2.6.33.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 01:00:00.000000000 +0100
38810 +++ linux-2.6.33/fs/reiser4/plugin/file_plugin_common.c 2010-03-04 19:33:22.000000000 +0100
38811 @@ -0,0 +1,1008 @@
38812 +/* Copyright 2005 by Hans Reiser, licensing governed by
38813 + reiser4/README */
38814 +
38815 +/* this file contains typical implementations for most of methods of
38816 + file plugin
38817 +*/
38818 +
38819 +#include "../inode.h"
38820 +#include "object.h"
38821 +#include "../safe_link.h"
38822 +
38823 +#include <linux/quotaops.h>
38824 +
38825 +static int insert_new_sd(struct inode *inode);
38826 +static int update_sd(struct inode *inode);
38827 +
38828 +/* this is common implementation of write_sd_by_inode method of file plugin
38829 + either insert stat data or update it
38830 + */
38831 +int write_sd_by_inode_common(struct inode *inode/* object to save */)
38832 +{
38833 + int result;
38834 +
38835 + assert("nikita-730", inode != NULL);
38836 +
38837 + if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
38838 + /* object doesn't have stat-data yet */
38839 + result = insert_new_sd(inode);
38840 + else
38841 + result = update_sd(inode);
38842 + if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
38843 +		/* don't warn about "name is too long" or out-of-memory */
38844 + warning("nikita-2221", "Failed to save sd for %llu: %i",
38845 + (unsigned long long)get_inode_oid(inode), result);
38846 + return result;
38847 +}
38848 +
38849 +/* this is common implementation of key_by_inode method of file plugin
38850 + */
38851 +int
38852 +key_by_inode_and_offset_common(struct inode *inode, loff_t off,
38853 + reiser4_key * key)
38854 +{
38855 + reiser4_key_init(key);
38856 + set_key_locality(key, reiser4_inode_data(inode)->locality_id);
38857 + set_key_ordering(key, get_inode_ordering(inode));
38858 + set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
38859 + set_key_type(key, KEY_BODY_MINOR);
38860 + set_key_offset(key, (__u64) off);
38861 + return 0;
38862 +}
38863 +
38864 +/* this is common implementation of set_plug_in_inode method of file plugin
38865 + */
38866 +int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
38867 + struct inode *parent /* parent object */ ,
38868 + reiser4_object_create_data * data /* creational
38869 + * data */ )
38870 +{
38871 + __u64 mask;
38872 +
38873 + object->i_mode = data->mode;
38874 + /* this should be plugin decision */
38875 + object->i_uid = current->cred->fsuid;
38876 + object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
38877 +
38878 + /* support for BSD style group-id assignment. See mount's manual page
38879 + description of bsdgroups ext2 mount options for more details */
38880 + if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
38881 + object->i_gid = parent->i_gid;
38882 + else if (parent->i_mode & S_ISGID) {
38883 +		/* parent directory has the sgid bit set */
38884 + object->i_gid = parent->i_gid;
38885 + if (S_ISDIR(object->i_mode))
38886 +			/* sgid is inherited by sub-directories */
38887 + object->i_mode |= S_ISGID;
38888 + } else
38889 + object->i_gid = current->cred->fsgid;
38890 +
38891 + /* this object doesn't have stat-data yet */
38892 + reiser4_inode_set_flag(object, REISER4_NO_SD);
38893 +#if 0
38894 + /* this is now called after all inode plugins are initialized:
38895 + do_create_vfs_child after adjust_to_parent */
38896 + /* setup inode and file-operations for this inode */
38897 + setup_inode_ops(object, data);
38898 +#endif
38899 + object->i_nlink = 0;
38900 + reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
38901 + mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
38902 + if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
38903 + mask |= (1 << LARGE_TIMES_STAT);
38904 +
38905 + reiser4_inode_data(object)->extmask = mask;
38906 + return 0;
38907 +}
38908 +
38909 +/* this is common implementation of adjust_to_parent method of file plugin for
38910 + regular files
38911 + */
38912 +int adjust_to_parent_common(struct inode *object /* new object */ ,
38913 + struct inode *parent /* parent directory */ ,
38914 + struct inode *root/* root directory */)
38915 +{
38916 + assert("nikita-2165", object != NULL);
38917 + if (parent == NULL)
38918 + parent = root;
38919 + assert("nikita-2069", parent != NULL);
38920 +
38921 + /*
38922 + * inherit missing plugins from parent
38923 + */
38924 +
38925 + grab_plugin_pset(object, parent, PSET_FILE);
38926 + grab_plugin_pset(object, parent, PSET_SD);
38927 + grab_plugin_pset(object, parent, PSET_FORMATTING);
38928 + grab_plugin_pset(object, parent, PSET_PERM);
38929 + return 0;
38930 +}
38931 +
38932 +/* this is common implementation of adjust_to_parent method of file plugin for
38933 + typical directories
38934 + */
38935 +int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
38936 + struct inode *parent /* parent directory */ ,
38937 + struct inode *root/* root directory */)
38938 +{
38939 + int result = 0;
38940 + pset_member memb;
38941 +
38942 + assert("nikita-2166", object != NULL);
38943 + if (parent == NULL)
38944 + parent = root;
38945 + assert("nikita-2167", parent != NULL);
38946 +
38947 + /*
38948 + * inherit missing plugins from parent
38949 + */
38950 + for (memb = 0; memb < PSET_LAST; ++memb) {
38951 + result = grab_plugin_pset(object, parent, memb);
38952 + if (result != 0)
38953 + break;
38954 + }
38955 + return result;
38956 +}
38957 +
38958 +int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
38959 + struct inode *parent /* parent directory */,
38960 + struct inode *root/* root directory */)
38961 +{
38962 + int result;
38963 + result = adjust_to_parent_common(object, parent, root);
38964 + if (result)
38965 + return result;
38966 + assert("edward-1416", parent != NULL);
38967 +
38968 + grab_plugin_pset(object, parent, PSET_CLUSTER);
38969 + grab_plugin_pset(object, parent, PSET_CIPHER);
38970 + grab_plugin_pset(object, parent, PSET_DIGEST);
38971 + grab_plugin_pset(object, parent, PSET_COMPRESSION);
38972 + grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
38973 +
38974 + return 0;
38975 +}
38976 +
38977 +/* this is common implementation of create_object method of file plugin
38978 + */
38979 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
38980 + reiser4_object_create_data * data)
38981 +{
38982 + reiser4_block_nr reserve;
38983 + assert("nikita-744", object != NULL);
38984 + assert("nikita-745", parent != NULL);
38985 + assert("nikita-747", data != NULL);
38986 + assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
38987 +
38988 + reserve = estimate_create_common(object);
38989 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
38990 + return RETERR(-ENOSPC);
38991 + return write_sd_by_inode_common(object);
38992 +}
38993 +
38994 +static int common_object_delete_no_reserve(struct inode *inode);
38995 +
38996 +/**
38997 + * reiser4_delete_object_common - delete_object of file_plugin
38998 + * @inode: inode to be deleted
38999 + *
39000 + * This is common implementation of delete_object method of file_plugin. It
39001 + * applies to objects whose deletion consists of removing two items: stat
39002 + * data and safe-link.
39003 + */
39004 +int reiser4_delete_object_common(struct inode *inode)
39005 +{
39006 + int result;
39007 +
39008 + assert("nikita-1477", inode != NULL);
39009 + /* FIXME: if file body deletion failed (i/o error, for instance),
39010 + inode->i_size can be != 0 here */
39011 + assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
39012 + assert("nikita-3421", inode->i_nlink == 0);
39013 +
39014 + if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
39015 + reiser4_block_nr reserve;
39016 +
39017 + /* grab space which is needed to remove 2 items from the tree:
39018 + stat data and safe-link */
39019 + reserve = 2 *
39020 + estimate_one_item_removal(reiser4_tree_by_inode(inode));
39021 + if (reiser4_grab_space_force(reserve,
39022 + BA_RESERVED | BA_CAN_COMMIT))
39023 + return RETERR(-ENOSPC);
39024 + result = common_object_delete_no_reserve(inode);
39025 + } else
39026 + result = 0;
39027 + return result;
39028 +}
39029 +
39030 +/**
39031 + * reiser4_delete_dir_common - delete_object of file_plugin
39032 + * @inode: inode to be deleted
39033 + *
39034 + * This is common implementation of delete_object method of file_plugin for
39035 + * typical directory. It calls done method of dir_plugin to remove "." and
39036 + * removes stat data and safe-link.
39037 + */
39038 +int reiser4_delete_dir_common(struct inode *inode)
39039 +{
39040 + int result;
39041 + dir_plugin *dplug;
39042 +
39043 + assert("", (get_current_context() &&
39044 + get_current_context()->trans->atom == NULL));
39045 +
39046 + dplug = inode_dir_plugin(inode);
39047 + assert("vs-1101", dplug && dplug->done);
39048 +
39049 + /* kill cursors which might be attached to inode */
39050 + reiser4_kill_cursors(inode);
39051 +
39052 + /* grab space enough for removing two items */
39053 + if (reiser4_grab_space
39054 + (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
39055 + BA_RESERVED | BA_CAN_COMMIT))
39056 + return RETERR(-ENOSPC);
39057 +
39058 + result = dplug->done(inode);
39059 + if (!result)
39060 + result = common_object_delete_no_reserve(inode);
39061 + return result;
39062 +}
39063 +
39064 +/* this is common implementation of add_link method of file plugin
39065 + */
39066 +int reiser4_add_link_common(struct inode *object, struct inode *parent)
39067 +{
39068 + /*
39069 + * increment ->i_nlink and update ->i_ctime
39070 + */
39071 +
39072 + INODE_INC_FIELD(object, i_nlink);
39073 + object->i_ctime = CURRENT_TIME;
39074 + return 0;
39075 +}
39076 +
39077 +/* this is common implementation of rem_link method of file plugin
39078 + */
39079 +int reiser4_rem_link_common(struct inode *object, struct inode *parent)
39080 +{
39081 + assert("nikita-2021", object != NULL);
39082 + assert("nikita-2163", object->i_nlink > 0);
39083 +
39084 + /*
39085 + * decrement ->i_nlink and update ->i_ctime
39086 + */
39087 +
39088 + INODE_DEC_FIELD(object, i_nlink);
39089 + object->i_ctime = CURRENT_TIME;
39090 + return 0;
39091 +}
39092 +
39093 +/* this is common implementation of rem_link method of file plugin for typical
39094 + directory
39095 +*/
39096 +int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
39097 +{
39098 + assert("nikita-20211", object != NULL);
39099 + assert("nikita-21631", object->i_nlink > 0);
39100 +
39101 + /*
39102 + * decrement ->i_nlink and update ->i_ctime
39103 + */
39104 + INODE_DEC_FIELD(object, i_nlink);
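+	/* editor's note: when only one link remains it is presumably the
+	 * directory's "." self-reference, so drop it as well */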
39105 + if (object->i_nlink == 1)
39106 + INODE_DEC_FIELD(object, i_nlink);
39107 + object->i_ctime = CURRENT_TIME;
39108 + return 0;
39109 +}
39110 +
39111 +/* this is common implementation of owns_item method of file plugin
39112 + compare objectids of keys in inode and coord */
39113 +int owns_item_common(const struct inode *inode, /* object to check
39114 + * against */
39115 + const coord_t *coord/* coord to check */)
39116 +{
39117 + reiser4_key item_key;
39118 + reiser4_key file_key;
39119 +
39120 + assert("nikita-760", inode != NULL);
39121 + assert("nikita-761", coord != NULL);
39122 +
39123 + return coord_is_existing_item(coord) &&
39124 + (get_key_objectid(build_sd_key(inode, &file_key)) ==
39125 + get_key_objectid(item_key_by_coord(coord, &item_key)));
39126 +}
39127 +
39128 +/* this is common implementation of owns_item method of file plugin
39129 + for typical directory
39130 +*/
39131 +int owns_item_common_dir(const struct inode *inode,/* object to check against */
39132 + const coord_t *coord/* coord of item to check */)
39133 +{
39134 + reiser4_key item_key;
39135 +
39136 + assert("nikita-1335", inode != NULL);
39137 + assert("nikita-1334", coord != NULL);
39138 +
39139 + if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
39140 + return get_key_locality(item_key_by_coord(coord, &item_key)) ==
39141 + get_inode_oid(inode);
39142 + else
39143 + return owns_item_common(inode, coord);
39144 +}
39145 +
39146 +/* this is common implementation of can_add_link method of file plugin
39147 + checks whether yet another hard links to this object can be added
39148 +*/
39149 +int can_add_link_common(const struct inode *object/* object to check */)
39150 +{
39151 + assert("nikita-732", object != NULL);
39152 +
39153 + /* inode->i_nlink is unsigned int, so just check for integer
39154 + overflow */
39155 + return object->i_nlink + 1 != 0;
39156 +}
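+/*
+ * Illustrative sketch (hypothetical stand-alone check, not from the
+ * original patch): because i_nlink is unsigned, "nlink + 1 != 0" above is
+ * exactly the test that one more link would not wrap around to zero.
+ */
+#if 0	/* illustrative only */
+#include <assert.h>
+#include <limits.h>
+int main(void)
+{
+	unsigned int nlink = UINT_MAX;
+	assert(nlink + 1 == 0);		/* saturated: no more links */
+	assert((nlink - 1) + 1 != 0);	/* one below the limit is fine */
+	return 0;
+}
+#endif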
39157 +
39158 +/* this is common implementation of can_rem_link method of file plugin for
39159 + typical directory
39160 +*/
39161 +int can_rem_link_common_dir(const struct inode *inode)
39162 +{
39163 +	/* is_dir_empty() returns 0 if dir is empty */
39164 + return !is_dir_empty(inode);
39165 +}
39166 +
39167 +/* this is common implementation of detach method of file plugin for typical
39168 + directory
39169 +*/
39170 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
39171 +{
39172 + dir_plugin *dplug;
39173 +
39174 + dplug = inode_dir_plugin(child);
39175 + assert("nikita-2883", dplug != NULL);
39176 + assert("nikita-2884", dplug->detach != NULL);
39177 + return dplug->detach(child, parent);
39178 +}
39179 +
39180 +/* this is common implementation of bind method of file plugin for typical
39181 + directory
39182 +*/
39183 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
39184 +{
39185 + dir_plugin *dplug;
39186 +
39187 + dplug = inode_dir_plugin(child);
39188 + assert("nikita-2646", dplug != NULL);
39189 + return dplug->attach(child, parent);
39190 +}
39191 +
39192 +static int process_truncate(struct inode *, __u64 size);
39193 +
39194 +/* this is common implementation of safelink method of file plugin
39195 + */
39196 +int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
39197 +{
39198 + int result;
39199 +
39200 + assert("vs-1705", get_current_context()->trans->atom == NULL);
39201 + if (link == SAFE_UNLINK)
39202 +		/* nothing to do. iput() in the caller (process_safelink) will
39203 +		 * finish with the file */
39204 + result = 0;
39205 + else if (link == SAFE_TRUNCATE)
39206 + result = process_truncate(object, value);
39207 + else {
39208 + warning("nikita-3438", "Unrecognized safe-link type: %i", link);
39209 + result = RETERR(-EIO);
39210 + }
39211 + return result;
39212 +}
39213 +
39214 +/* this is common implementation of estimate.create method of file plugin
39215 + can be used when object creation involves insertion of one item (usually stat
39216 + data) into tree
39217 +*/
39218 +reiser4_block_nr estimate_create_common(const struct inode *object)
39219 +{
39220 + return estimate_one_insert_item(reiser4_tree_by_inode(object));
39221 +}
39222 +
39223 +/* this is common implementation of estimate.create method of file plugin for
39224 + typical directory
39225 + can be used when directory creation involves insertion of two items (usually
39226 + stat data and item containing "." and "..") into tree
39227 +*/
39228 +reiser4_block_nr estimate_create_common_dir(const struct inode *object)
39229 +{
39230 + return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
39231 +}
39232 +
39233 +/* this is common implementation of estimate.update method of file plugin
39234 + can be used when stat data update does not do more than inserting a unit
39235 + into a stat data item which is probably true for most cases
39236 +*/
39237 +reiser4_block_nr estimate_update_common(const struct inode *inode)
39238 +{
39239 + return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
39240 +}
39241 +
39242 +/* this is common implementation of estimate.unlink method of file plugin
39243 + */
39244 +reiser4_block_nr
39245 +estimate_unlink_common(const struct inode *object UNUSED_ARG,
39246 + const struct inode *parent UNUSED_ARG)
39247 +{
39248 + return 0;
39249 +}
39250 +
39251 +/* this is common implementation of estimate.unlink method of file plugin for
39252 + typical directory
39253 +*/
39254 +reiser4_block_nr
39255 +estimate_unlink_common_dir(const struct inode *object,
39256 + const struct inode *parent)
39257 +{
39258 + dir_plugin *dplug;
39259 +
39260 + dplug = inode_dir_plugin(object);
39261 + assert("nikita-2888", dplug != NULL);
39262 + assert("nikita-2887", dplug->estimate.unlink != NULL);
39263 + return dplug->estimate.unlink(object, parent);
39264 +}
39265 +
39266 +char *wire_write_common(struct inode *inode, char *start)
39267 +{
39268 + return build_inode_onwire(inode, start);
39269 +}
39270 +
39271 +char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
39272 +{
39273 + if (!obj)
39274 + return locate_obj_key_id_onwire(addr);
39275 + return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
39276 +}
39277 +
39278 +struct dentry *wire_get_common(struct super_block *sb,
39279 + reiser4_object_on_wire * obj)
39280 +{
39281 + struct inode *inode;
39282 + struct dentry *dentry;
39283 + reiser4_key key;
39284 +
39285 + extract_key_from_id(&obj->u.std.key_id, &key);
39286 + inode = reiser4_iget(sb, &key, 1);
39287 + if (!IS_ERR(inode)) {
39288 + reiser4_iget_complete(inode);
39289 + dentry = d_obtain_alias(inode);
39290 + if (!IS_ERR(dentry))
39291 + dentry->d_op = &get_super_private(sb)->ops.dentry;
39292 + } else if (PTR_ERR(inode) == -ENOENT)
39293 + /*
39294 + * inode wasn't found at the key encoded in the file
39295 + * handle. Hence, file handle is stale.
39296 + */
39297 + dentry = ERR_PTR(RETERR(-ESTALE));
39298 + else
39299 + dentry = (void *)inode;
39300 + return dentry;
39301 +}
39302 +
39303 +int wire_size_common(struct inode *inode)
39304 +{
39305 + return inode_onwire_size(inode);
39306 +}
39307 +
39308 +void wire_done_common(reiser4_object_on_wire * obj)
39309 +{
39310 + /* nothing to do */
39311 +}
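+/*
+ * Illustrative note (assuming the usual export-style encode/decode
+ * pairing of these methods): the wire_* family above splits into an
+ * encode side and a decode side,
+ *
+ *	n    = wire_size_common(inode);		size the file handle
+ *	end  = wire_write_common(inode, buf);	key id -> buffer
+ *	...
+ *	addr = wire_read_common(buf, &obj);	buffer -> key id
+ *	dent = wire_get_common(sb, &obj);	key id -> dentry
+ *	wire_done_common(&obj);			release (a no-op here)
+ */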
39312 +
39313 +/* helper function to print errors */
39314 +static void key_warning(const reiser4_key * key /* key to print */ ,
39315 + const struct inode *inode,
39316 + int code/* error code to print */)
39317 +{
39318 + assert("nikita-716", key != NULL);
39319 +
39320 + if (code != -ENOMEM) {
39321 + warning("nikita-717", "Error for inode %llu (%i)",
39322 + (unsigned long long)get_key_objectid(key), code);
39323 + reiser4_print_key("for key", key);
39324 + }
39325 +}
39326 +
39327 +/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
39328 +#if REISER4_DEBUG
39329 +static void
39330 +check_inode_seal(const struct inode *inode,
39331 + const coord_t *coord, const reiser4_key * key)
39332 +{
39333 + reiser4_key unit_key;
39334 +
39335 + unit_key_by_coord(coord, &unit_key);
39336 + assert("nikita-2752",
39337 + WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
39338 + assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
39339 +}
39340 +
39341 +static void check_sd_coord(coord_t *coord, const reiser4_key * key)
39342 +{
39343 + reiser4_key ukey;
39344 +
39345 + coord_clear_iplug(coord);
39346 + if (zload(coord->node))
39347 + return;
39348 +
39349 + if (!coord_is_existing_unit(coord) ||
39350 + !item_plugin_by_coord(coord) ||
39351 + !keyeq(unit_key_by_coord(coord, &ukey), key) ||
39352 + (znode_get_level(coord->node) != LEAF_LEVEL) ||
39353 + !item_is_statdata(coord)) {
39354 + warning("nikita-1901", "Conspicuous seal");
39355 + reiser4_print_key("key", key);
39356 + print_coord("coord", coord, 1);
39357 + impossible("nikita-2877", "no way");
39358 + }
39359 + zrelse(coord->node);
39360 +}
39361 +
39362 +#else
39363 +#define check_inode_seal(inode, coord, key) noop
39364 +#define check_sd_coord(coord, key) noop
39365 +#endif
39366 +
39367 +/* insert new stat-data into tree. Called with inode state
39368 + locked. Return inode state locked. */
39369 +static int insert_new_sd(struct inode *inode/* inode to create sd for */)
39370 +{
39371 + int result;
39372 + reiser4_key key;
39373 + coord_t coord;
39374 + reiser4_item_data data;
39375 + char *area;
39376 + reiser4_inode *ref;
39377 + lock_handle lh;
39378 + oid_t oid;
39379 +
39380 + assert("nikita-723", inode != NULL);
39381 + assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
39382 +
39383 + ref = reiser4_inode_data(inode);
39384 + spin_lock_inode(inode);
39385 +
39386 + if (ref->plugin_mask != 0)
39387 + /* inode has non-standard plugins */
39388 + inode_set_extension(inode, PLUGIN_STAT);
39389 + /*
39390 + * prepare specification of new item to be inserted
39391 + */
39392 +
39393 + data.iplug = inode_sd_plugin(inode);
39394 + data.length = data.iplug->s.sd.save_len(inode);
39395 + spin_unlock_inode(inode);
39396 +
39397 + data.data = NULL;
39398 + data.user = 0;
39399 +/* could be optimized for the case where there is only one node format in
39400 + * use in the filesystem; probably there are lots of such
39401 + * places we could optimize for only one node layout.... -Hans */
39402 + if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()) {
39403 +		/* This is a silly check, but we don't know the actual node
39404 +		   that the insertion will go into. */
39405 + return RETERR(-ENAMETOOLONG);
39406 + }
39407 + oid = oid_allocate(inode->i_sb);
39408 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be
39409 + * encapsulated into oid_allocate? */
39410 + if (oid == ABSOLUTE_MAX_OID)
39411 + return RETERR(-EOVERFLOW);
39412 +
39413 + set_inode_oid(inode, oid);
39414 +
39415 + coord_init_zero(&coord);
39416 + init_lh(&lh);
39417 +
39418 + result = insert_by_key(reiser4_tree_by_inode(inode),
39419 + build_sd_key(inode, &key), &data, &coord, &lh,
39420 + /* stat data lives on a leaf level */
39421 + LEAF_LEVEL, CBK_UNIQUE);
39422 +
39423 +	/* we don't want to re-check that somebody didn't insert
39424 +	   stat-data while we were doing io, because if they did,
39425 +	   insert_by_key() would have returned an error. */
39426 + /* but what _is_ possible is that plugin for inode's stat-data,
39427 + list of non-standard plugins or their state would change
39428 + during io, so that stat-data wouldn't fit into sd. To avoid
39429 + this race we keep inode_state lock. This lock has to be
39430 + taken each time you access inode in a way that would cause
39431 + changes in sd size: changing plugins etc.
39432 + */
39433 +
39434 + if (result == IBK_INSERT_OK) {
39435 + coord_clear_iplug(&coord);
39436 + result = zload(coord.node);
39437 + if (result == 0) {
39438 + /* have we really inserted stat data? */
39439 + assert("nikita-725", item_is_statdata(&coord));
39440 +
39441 + /* inode was just created. It is inserted into hash
39442 + table, but no directory entry was yet inserted into
39443 + parent. So, inode is inaccessible through
39444 + ->lookup(). All places that directly grab inode
39445 + from hash-table (like old knfsd), should check
39446 + IMMUTABLE flag that is set by common_create_child.
39447 + */
39448 + assert("nikita-3240", data.iplug != NULL);
39449 + assert("nikita-3241", data.iplug->s.sd.save != NULL);
39450 + area = item_body_by_coord(&coord);
39451 + result = data.iplug->s.sd.save(inode, &area);
39452 + znode_make_dirty(coord.node);
39453 + if (result == 0) {
39454 + /* object has stat-data now */
39455 + reiser4_inode_clr_flag(inode, REISER4_NO_SD);
39456 + reiser4_inode_set_flag(inode,
39457 + REISER4_SDLEN_KNOWN);
39458 + /* initialise stat-data seal */
39459 + reiser4_seal_init(&ref->sd_seal, &coord, &key);
39460 + ref->sd_coord = coord;
39461 + check_inode_seal(inode, &coord, &key);
39462 + } else if (result != -ENOMEM)
39463 + /*
39464 + * convert any other error code to -EIO to
39465 + * avoid confusing user level with unexpected
39466 + * errors.
39467 + */
39468 + result = RETERR(-EIO);
39469 + zrelse(coord.node);
39470 + }
39471 + }
39472 + done_lh(&lh);
39473 +
39474 + if (result != 0)
39475 + key_warning(&key, inode, result);
39476 + else
39477 + oid_count_allocated();
39478 +
39479 + return result;
39480 +}
39481 +
39482 +/* find sd of inode in a tree, deal with errors */
39483 +int lookup_sd(struct inode *inode /* inode to look sd for */ ,
39484 + znode_lock_mode lock_mode /* lock mode */ ,
39485 + coord_t *coord /* resulting coord */ ,
39486 + lock_handle * lh /* resulting lock handle */ ,
39487 + const reiser4_key * key /* resulting key */ ,
39488 + int silent)
39489 +{
39490 + int result;
39491 + __u32 flags;
39492 +
39493 + assert("nikita-1692", inode != NULL);
39494 + assert("nikita-1693", coord != NULL);
39495 + assert("nikita-1694", key != NULL);
39496 +
39497 +	/* look for the object's stat data in a tree.
39498 +	   This returns a pointer to a locked znode in "node" and the
39499 +	   position of the item found in it in "pos". Both are only valid
39500 +	   if coord_found is returned. */
39501 + flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
39502 + flags |= CBK_UNIQUE;
39503 + /*
39504 + * traverse tree to find stat data. We cannot use vroot here, because
39505 + * it only covers _body_ of the file, and stat data don't belong
39506 + * there.
39507 + */
39508 + result = coord_by_key(reiser4_tree_by_inode(inode),
39509 + key,
39510 + coord,
39511 + lh,
39512 + lock_mode,
39513 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
39514 + if (REISER4_DEBUG && result == 0)
39515 + check_sd_coord(coord, key);
39516 +
39517 + if (result != 0 && !silent)
39518 + key_warning(key, inode, result);
39519 + return result;
39520 +}
39521 +
39522 +static int
39523 +locate_inode_sd(struct inode *inode,
39524 + reiser4_key * key, coord_t *coord, lock_handle * lh)
39525 +{
39526 + reiser4_inode *state;
39527 + seal_t seal;
39528 + int result;
39529 +
39530 + assert("nikita-3483", inode != NULL);
39531 +
39532 + state = reiser4_inode_data(inode);
39533 + spin_lock_inode(inode);
39534 + *coord = state->sd_coord;
39535 + coord_clear_iplug(coord);
39536 + seal = state->sd_seal;
39537 + spin_unlock_inode(inode);
39538 +
39539 + build_sd_key(inode, key);
39540 + if (reiser4_seal_is_set(&seal)) {
39541 + /* first, try to use seal */
39542 + result = reiser4_seal_validate(&seal,
39543 + coord,
39544 + key,
39545 + lh, ZNODE_WRITE_LOCK,
39546 + ZNODE_LOCK_LOPRI);
39547 + if (result == 0)
39548 + check_sd_coord(coord, key);
39549 + } else
39550 + result = -E_REPEAT;
39551 +
39552 + if (result != 0) {
39553 + coord_init_zero(coord);
39554 + result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
39555 + }
39556 + return result;
39557 +}
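+/*
+ * Illustrative note: locate_inode_sd() above is a seal fast path. The
+ * cached (coord, seal) pair from the inode is validated first; only when
+ * the seal is unset or fails to validate (-E_REPEAT and friends) does it
+ * fall back to the full tree traversal in lookup_sd().
+ */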
39558 +
39559 +#if REISER4_DEBUG
39560 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
39561 +{
39562 + return (get_key_locality(k1) == get_key_locality(k2) &&
39563 + get_key_type(k1) == get_key_type(k2) &&
39564 + get_key_band(k1) == get_key_band(k2) &&
39565 + get_key_ordering(k1) == get_key_ordering(k2) &&
39566 + get_key_objectid(k1) == get_key_objectid(k2));
39567 +}
39568 +
39569 +#include "../tree_walk.h"
39570 +
39571 +/* make some checks before and after a stat-data resize operation */
39572 +static int check_sd_resize(struct inode *inode, coord_t *coord,
39573 + int length, int progress/* 1 means after resize */)
39574 +{
39575 + int ret = 0;
39576 + lock_handle left_lock;
39577 + coord_t left_coord;
39578 + reiser4_key left_key;
39579 + reiser4_key key;
39580 +
39581 + if (inode_file_plugin(inode) !=
39582 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
39583 + return 0;
39584 + if (!length)
39585 + return 0;
39586 + if (coord->item_pos != 0)
39587 + return 0;
39588 +
39589 + init_lh(&left_lock);
39590 + ret = reiser4_get_left_neighbor(&left_lock,
39591 + coord->node,
39592 + ZNODE_WRITE_LOCK,
39593 + GN_CAN_USE_UPPER_LEVELS);
39594 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
39595 + ret == -ENOENT || ret == -EINVAL
39596 + || ret == -E_DEADLOCK) {
39597 + ret = 0;
39598 + goto exit;
39599 + }
39600 + ret = zload(left_lock.node);
39601 + if (ret)
39602 + goto exit;
39603 + coord_init_last_unit(&left_coord, left_lock.node);
39604 + item_key_by_coord(&left_coord, &left_key);
39605 + item_key_by_coord(coord, &key);
39606 +
39607 + if (all_but_offset_key_eq(&key, &left_key))
39608 +		/* corruption occurred */
39609 + ret = 1;
39610 + zrelse(left_lock.node);
39611 + exit:
39612 + done_lh(&left_lock);
39613 + return ret;
39614 +}
39615 +#endif
39616 +
39617 +/* update stat-data at @coord */
39618 +static int
39619 +update_sd_at(struct inode *inode, coord_t *coord, reiser4_key * key,
39620 + lock_handle * lh)
39621 +{
39622 + int result;
39623 + reiser4_item_data data;
39624 + char *area;
39625 + reiser4_inode *state;
39626 + znode *loaded;
39627 +
39628 + state = reiser4_inode_data(inode);
39629 +
39630 + coord_clear_iplug(coord);
39631 + result = zload(coord->node);
39632 + if (result != 0)
39633 + return result;
39634 + loaded = coord->node;
39635 +
39636 + spin_lock_inode(inode);
39637 + assert("nikita-728", inode_sd_plugin(inode) != NULL);
39638 + data.iplug = inode_sd_plugin(inode);
39639 +
39640 + /* if inode has non-standard plugins, add appropriate stat data
39641 + * extension */
39642 + if (state->extmask & (1 << PLUGIN_STAT)) {
39643 + if (state->plugin_mask == 0)
39644 + inode_clr_extension(inode, PLUGIN_STAT);
39645 + } else if (state->plugin_mask != 0)
39646 + inode_set_extension(inode, PLUGIN_STAT);
39647 +
39648 + if (state->extmask & (1 << HEIR_STAT)) {
39649 + if (state->heir_mask == 0)
39650 + inode_clr_extension(inode, HEIR_STAT);
39651 + } else if (state->heir_mask != 0)
39652 + inode_set_extension(inode, HEIR_STAT);
39653 +
39654 + /* data.length is how much space to add to (or remove
39655 + from if negative) sd */
39656 + if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
39657 + /* recalculate stat-data length */
39658 + data.length =
39659 + data.iplug->s.sd.save_len(inode) -
39660 + item_length_by_coord(coord);
39661 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
39662 + } else
39663 + data.length = 0;
39664 + spin_unlock_inode(inode);
39665 +
39666 + /* if on-disk stat data is of different length than required
39667 + for this inode, resize it */
39668 +
39669 + if (data.length != 0) {
39670 + data.data = NULL;
39671 + data.user = 0;
39672 +
39673 + assert("edward-1441",
39674 + !check_sd_resize(inode, coord,
39675 + data.length, 0/* before resize */));
39676 +
39677 +		/* insertion code requires that the insertion point (coord)
39678 +		 * be between units. */
39679 + coord->between = AFTER_UNIT;
39680 + result = reiser4_resize_item(coord, &data, key, lh,
39681 + COPI_DONT_SHIFT_LEFT);
39682 + if (result != 0) {
39683 + key_warning(key, inode, result);
39684 + zrelse(loaded);
39685 + return result;
39686 + }
39687 + if (loaded != coord->node) {
39688 + /* reiser4_resize_item moved coord to another node.
39689 + Zload it */
39690 + zrelse(loaded);
39691 + coord_clear_iplug(coord);
39692 + result = zload(coord->node);
39693 + if (result != 0)
39694 + return result;
39695 + loaded = coord->node;
39696 + }
39697 + assert("edward-1442",
39698 + !check_sd_resize(inode, coord,
39699 + data.length, 1/* after resize */));
39700 + }
39701 + area = item_body_by_coord(coord);
39702 + spin_lock_inode(inode);
39703 + result = data.iplug->s.sd.save(inode, &area);
39704 + znode_make_dirty(coord->node);
39705 +
39706 + /* re-initialise stat-data seal */
39707 +
39708 + /*
39709 + * coord.between was possibly skewed from AT_UNIT when stat-data size
39710 + * was changed and new extensions were pasted into item.
39711 + */
39712 + coord->between = AT_UNIT;
39713 + reiser4_seal_init(&state->sd_seal, coord, key);
39714 + state->sd_coord = *coord;
39715 + spin_unlock_inode(inode);
39716 + check_inode_seal(inode, coord, key);
39717 + zrelse(loaded);
39718 + return result;
39719 +}
39720 +
39721 +/* Update existing stat-data in a tree. Called with inode state locked. Return
39722 + inode state locked. */
39723 +static int update_sd(struct inode *inode/* inode to update sd for */)
39724 +{
39725 + int result;
39726 + reiser4_key key;
39727 + coord_t coord;
39728 + lock_handle lh;
39729 +
39730 + assert("nikita-726", inode != NULL);
39731 +
39732 + /* no stat-data, nothing to update?! */
39733 + assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
39734 +
39735 + init_lh(&lh);
39736 +
39737 + result = locate_inode_sd(inode, &key, &coord, &lh);
39738 + if (result == 0)
39739 + result = update_sd_at(inode, &coord, &key, &lh);
39740 + done_lh(&lh);
39741 +
39742 + return result;
39743 +}
39744 +
39745 +/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
39746 +   Removes object stat data. Space for that must be reserved by the caller
39746 +   beforehand.
39747 +*/
39748 +static int
39749 +common_object_delete_no_reserve(struct inode *inode/* object to remove */)
39750 +{
39751 + int result;
39752 +
39753 + assert("nikita-1477", inode != NULL);
39754 +
39755 + if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
39756 + reiser4_key sd_key;
39757 +
39758 + vfs_dq_free_inode(inode);
39759 + vfs_dq_drop(inode);
39760 +
39761 + build_sd_key(inode, &sd_key);
39762 + result =
39763 + reiser4_cut_tree(reiser4_tree_by_inode(inode),
39764 + &sd_key, &sd_key, NULL, 0);
39765 + if (result == 0) {
39766 + reiser4_inode_set_flag(inode, REISER4_NO_SD);
39767 + result = oid_release(inode->i_sb, get_inode_oid(inode));
39768 + if (result == 0) {
39769 + oid_count_released();
39770 +
39771 + result = safe_link_del(reiser4_tree_by_inode(inode),
39772 + get_inode_oid(inode),
39773 + SAFE_UNLINK);
39774 + }
39775 + }
39776 + } else
39777 + result = 0;
39778 + return result;
39779 +}
39780 +
39781 +/* helper for safelink_common */
39782 +static int process_truncate(struct inode *inode, __u64 size)
39783 +{
39784 + int result;
39785 + struct iattr attr;
39786 + file_plugin *fplug;
39787 + reiser4_context *ctx;
39788 + struct dentry dentry;
39789 +
39790 + assert("vs-21", is_in_reiser4_context());
39791 + ctx = reiser4_init_context(inode->i_sb);
39792 + assert("vs-22", !IS_ERR(ctx));
39793 +
39794 + attr.ia_size = size;
39795 + attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
39796 + fplug = inode_file_plugin(inode);
39797 +
39798 + mutex_lock(&inode->i_mutex);
39799 + assert("vs-1704", get_current_context()->trans->atom == NULL);
39800 + dentry.d_inode = inode;
39801 + result = inode->i_op->setattr(&dentry, &attr);
39802 + mutex_unlock(&inode->i_mutex);
39803 +
39804 + context_set_commit_async(ctx);
39805 + reiser4_exit_context(ctx);
39806 +
39807 + return result;
39808 +}
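+/*
+ * Illustrative note: process_truncate() is the replay half of a
+ * SAFE_TRUNCATE safe-link (see safelink_common() above): it reruns
+ * ->setattr() with ATTR_SIZE set to the recorded size inside a fresh
+ * reiser4 context, so an interrupted truncate completes when the
+ * safe-link is processed.
+ */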
39809 +
39810 +/*
39811 + Local variables:
39812 + c-indentation-style: "K&R"
39813 + mode-name: "LC"
39814 + c-basic-offset: 8
39815 + tab-width: 8
39816 + fill-column: 80
39817 + scroll-step: 1
39818 + End:
39819 +*/
39820 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/hash.c linux-2.6.33/fs/reiser4/plugin/hash.c
39821 --- linux-2.6.33.orig/fs/reiser4/plugin/hash.c 1970-01-01 01:00:00.000000000 +0100
39822 +++ linux-2.6.33/fs/reiser4/plugin/hash.c 2010-03-04 19:33:22.000000000 +0100
39823 @@ -0,0 +1,352 @@
39824 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
39825 + * reiser4/README */
39826 +
39827 +/* Hash functions */
39828 +
39829 +#include "../debug.h"
39830 +#include "plugin_header.h"
39831 +#include "plugin.h"
39832 +#include "../super.h"
39833 +#include "../inode.h"
39834 +
39835 +#include <linux/types.h>
39836 +
39837 +/* old rupasov (yura) hash */
39838 +static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
39839 + int len/* @name's length */)
39840 +{
39841 + int i;
39842 + int j;
39843 + int pow;
39844 + __u64 a;
39845 + __u64 c;
39846 +
39847 + assert("nikita-672", name != NULL);
39848 + assert("nikita-673", len >= 0);
39849 +
39850 + for (pow = 1, i = 1; i < len; ++i)
39851 + pow = pow * 10;
39852 +
39853 + if (len == 1)
39854 + a = name[0] - 48;
39855 + else
39856 + a = (name[0] - 48) * pow;
39857 +
39858 + for (i = 1; i < len; ++i) {
39859 + c = name[i] - 48;
39860 + for (pow = 1, j = i; j < len - 1; ++j)
39861 + pow = pow * 10;
39862 + a = a + c * pow;
39863 + }
39864 + for (; i < 40; ++i) {
39865 + c = '0' - 48;
39866 + for (pow = 1, j = i; j < len - 1; ++j)
39867 + pow = pow * 10;
39868 + a = a + c * pow;
39869 + }
39870 +
39871 + for (; i < 256; ++i) {
39872 + c = i;
39873 + for (pow = 1, j = i; j < len - 1; ++j)
39874 + pow = pow * 10;
39875 + a = a + c * pow;
39876 + }
39877 +
39878 + a = a << 7;
39879 + return a;
39880 +}
39881 +
39882 +/* r5 hash */
39883 +static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
39884 + int len UNUSED_ARG/* @name's length */)
39885 +{
39886 + __u64 a = 0;
39887 +
39888 + assert("nikita-674", name != NULL);
39889 + assert("nikita-675", len >= 0);
39890 +
39891 + while (*name) {
39892 + a += *name << 4;
39893 + a += *name >> 4;
39894 + a *= 11;
39895 + name++;
39896 + }
39897 + return a;
39898 +}
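+/*
+ * Illustrative sketch (hypothetical stand-alone version, not from the
+ * original patch): the same r5 mixing, runnable in user space. Like
+ * hash_r5() above, it keys off the NUL terminator and ignores the
+ * length argument.
+ */
+#if 0	/* illustrative only */
+#include <stdio.h>
+static unsigned long long r5(const unsigned char *name)
+{
+	unsigned long long a = 0;
+
+	while (*name) {
+		a += *name << 4;	/* high nibble up */
+		a += *name >> 4;	/* low nibble down */
+		a *= 11;
+		name++;
+	}
+	return a;
+}
+int main(void)
+{
+	printf("%llx\n", r5((const unsigned char *)"foo"));
+	return 0;
+}
+#endif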
39899 +
39900 +/* Keyed 32-bit hash function using TEA in a Davis-Meyer function
39901 + H0 = Key
39902 + Hi = E Mi(Hi-1) + Hi-1
39903 +
39904 + (see Applied Cryptography, 2nd edition, p448).
39905 +
39906 + Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
39907 +
39908 + Jeremy has agreed to the contents of reiserfs/README. -Hans
39909 +
39910 + This code was blindly upgraded to __u64 by s/__u32/__u64/g.
39911 +*/
39912 +static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
39913 + int len/* @name's length */)
39914 +{
39915 + __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
39916 +
39917 + __u64 h0 = k[0], h1 = k[1];
39918 + __u64 a, b, c, d;
39919 + __u64 pad;
39920 + int i;
39921 +
39922 + assert("nikita-676", name != NULL);
39923 + assert("nikita-677", len >= 0);
39924 +
39925 +#define DELTA 0x9E3779B9u
39926 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
39927 +#define PARTROUNDS 6 /* 6 gets complete mixing */
39928 +
39929 +/* a, b, c, d - data; h0, h1 - accumulated hash */
39930 +#define TEACORE(rounds) \
39931 + do { \
39932 + __u64 sum = 0; \
39933 + int n = rounds; \
39934 + __u64 b0, b1; \
39935 + \
39936 + b0 = h0; \
39937 + b1 = h1; \
39938 + \
39939 + do { \
39940 + sum += DELTA; \
39941 + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
39942 + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
39943 + } while (--n); \
39944 + \
39945 + h0 += b0; \
39946 + h1 += b1; \
39947 + } while (0)
39948 +
39949 + pad = (__u64) len | ((__u64) len << 8);
39950 + pad |= pad << 16;
39951 +
39952 + while (len >= 16) {
39953 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39954 + 16 | (__u64) name[3] << 24;
39955 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39956 + 16 | (__u64) name[7] << 24;
39957 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39958 + 16 | (__u64) name[11] << 24;
39959 + d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
39960 + << 16 | (__u64) name[15] << 24;
39961 +
39962 + TEACORE(PARTROUNDS);
39963 +
39964 + len -= 16;
39965 + name += 16;
39966 + }
39967 +
39968 + if (len >= 12) {
39969 + /* assert(len < 16); */
39970 + if (len >= 16)
39971 + *(int *)0 = 0;
39972 +
39973 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39974 + 16 | (__u64) name[3] << 24;
39975 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39976 + 16 | (__u64) name[7] << 24;
39977 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39978 + 16 | (__u64) name[11] << 24;
39979 +
39980 + d = pad;
39981 + for (i = 12; i < len; i++) {
39982 + d <<= 8;
39983 + d |= name[i];
39984 + }
39985 + } else if (len >= 8) {
39986 + /* assert(len < 12); */
39987 + if (len >= 12)
39988 + *(int *)0 = 0;
39989 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39990 + 16 | (__u64) name[3] << 24;
39991 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39992 + 16 | (__u64) name[7] << 24;
39993 +
39994 + c = d = pad;
39995 + for (i = 8; i < len; i++) {
39996 + c <<= 8;
39997 + c |= name[i];
39998 + }
39999 + } else if (len >= 4) {
40000 + /* assert(len < 8); */
40001 + if (len >= 8)
40002 + *(int *)0 = 0;
40003 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
40004 + 16 | (__u64) name[3] << 24;
40005 +
40006 + b = c = d = pad;
40007 + for (i = 4; i < len; i++) {
40008 + b <<= 8;
40009 + b |= name[i];
40010 + }
40011 + } else {
40012 + /* assert(len < 4); */
40013 + if (len >= 4)
40014 + *(int *)0 = 0;
40015 + a = b = c = d = pad;
40016 + for (i = 0; i < len; i++) {
40017 + a <<= 8;
40018 + a |= name[i];
40019 + }
40020 + }
40021 +
40022 + TEACORE(FULLROUNDS);
40023 +
40024 +/* return 0;*/
40025 + return h0 ^ h1;
40026 +
40027 +}
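+/*
+ * Illustrative note: the Davis-Meyer feed-forward from the comment
+ * above, Hi = E Mi(Hi-1) + Hi-1, is the "h0 += b0; h1 += b1" tail of
+ * TEACORE: b0/b1 start as the running hash (Hi-1), the TEA rounds mix
+ * the current 16-byte block (a, b, c, d) into them, and the result is
+ * added back onto h0/h1.
+ */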
40028 +
40029 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
40030 +
40031 + See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
40032 +
40033 + Excerpts:
40034 +
40035 + FNV hashes are designed to be fast while maintaining a low collision
40036 + rate.
40037 +
40038 + [This version also seems to preserve lexicographical order locally.]
40039 +
40040 + FNV hash algorithms and source code have been released into the public
40041 + domain.
40042 +
40043 +*/
40044 +static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
40045 + int len UNUSED_ARG/* @name's length */)
40046 +{
40047 + unsigned long long a = 0xcbf29ce484222325ull;
40048 + const unsigned long long fnv_64_prime = 0x100000001b3ull;
40049 +
40050 + assert("nikita-678", name != NULL);
40051 + assert("nikita-679", len >= 0);
40052 +
40053 + /* FNV-1 hash each octet in the buffer */
40054 + for (; *name; ++name) {
40055 + /* multiply by the 32 bit FNV magic prime mod 2^64 */
40056 + a *= fnv_64_prime;
40057 + /* xor the bottom with the current octet */
40058 + a ^= (unsigned long long)(*name);
40059 + }
40060 + /* return our new hash value */
40061 + return a;
40062 +}
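+/*
+ * Illustrative sketch (hypothetical stand-alone version, not from the
+ * original patch): the identical 64-bit FNV-1 loop in user space;
+ * multiply by the prime first, then xor in the octet (FNV-1a would
+ * reverse those two steps).
+ */
+#if 0	/* illustrative only */
+#include <stdio.h>
+static unsigned long long fnv1(const unsigned char *name)
+{
+	unsigned long long a = 0xcbf29ce484222325ull;	/* offset basis */
+
+	for (; *name; ++name) {
+		a *= 0x100000001b3ull;	/* 64-bit FNV prime */
+		a ^= *name;
+	}
+	return a;
+}
+int main(void)
+{
+	printf("%016llx\n", fnv1((const unsigned char *)"reiser4"));
+	return 0;
+}
+#endif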
40063 +
40064 +/* degenerate hash function used to simplify testing of non-unique key
40065 + handling */
40066 +static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
40067 + int len UNUSED_ARG/* @name's length */)
40068 +{
40069 + return 0xc0c0c0c010101010ull;
40070 +}
40071 +
40072 +static int change_hash(struct inode *inode,
40073 + reiser4_plugin * plugin,
40074 + pset_member memb)
40075 +{
40076 + int result;
40077 +
40078 + assert("nikita-3503", inode != NULL);
40079 + assert("nikita-3504", plugin != NULL);
40080 +
40081 + assert("nikita-3505", is_reiser4_inode(inode));
40082 + assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
40083 +
40084 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
40085 + return RETERR(-EINVAL);
40086 +
40087 + result = 0;
40088 + if (inode_hash_plugin(inode) == NULL ||
40089 + inode_hash_plugin(inode)->h.id != plugin->h.id) {
40090 + if (is_dir_empty(inode) == 0)
40091 + result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
40092 + PSET_HASH, plugin);
40093 + else
40094 + result = RETERR(-ENOTEMPTY);
40095 +
40096 + }
40097 + return result;
40098 +}
40099 +
40100 +static reiser4_plugin_ops hash_plugin_ops = {
40101 + .init = NULL,
40102 + .load = NULL,
40103 + .save_len = NULL,
40104 + .save = NULL,
40105 + .change = change_hash
40106 +};
40107 +
40108 +/* hash plugins */
40109 +hash_plugin hash_plugins[LAST_HASH_ID] = {
40110 + [RUPASOV_HASH_ID] = {
40111 + .h = {
40112 + .type_id = REISER4_HASH_PLUGIN_TYPE,
40113 + .id = RUPASOV_HASH_ID,
40114 + .pops = &hash_plugin_ops,
40115 + .label = "rupasov",
40116 + .desc = "Original Yura's hash",
40117 + .linkage = {NULL, NULL}
40118 + },
40119 + .hash = hash_rupasov
40120 + },
40121 + [R5_HASH_ID] = {
40122 + .h = {
40123 + .type_id = REISER4_HASH_PLUGIN_TYPE,
40124 + .id = R5_HASH_ID,
40125 + .pops = &hash_plugin_ops,
40126 + .label = "r5",
40127 + .desc = "r5 hash",
40128 + .linkage = {NULL, NULL}
40129 + },
40130 + .hash = hash_r5
40131 + },
40132 + [TEA_HASH_ID] = {
40133 + .h = {
40134 + .type_id = REISER4_HASH_PLUGIN_TYPE,
40135 + .id = TEA_HASH_ID,
40136 + .pops = &hash_plugin_ops,
40137 + .label = "tea",
40138 + .desc = "tea hash",
40139 + .linkage = {NULL, NULL}
40140 + },
40141 + .hash = hash_tea
40142 + },
40143 + [FNV1_HASH_ID] = {
40144 + .h = {
40145 + .type_id = REISER4_HASH_PLUGIN_TYPE,
40146 + .id = FNV1_HASH_ID,
40147 + .pops = &hash_plugin_ops,
40148 + .label = "fnv1",
40149 + .desc = "fnv1 hash",
40150 + .linkage = {NULL, NULL}
40151 + },
40152 + .hash = hash_fnv1
40153 + },
40154 + [DEGENERATE_HASH_ID] = {
40155 + .h = {
40156 + .type_id = REISER4_HASH_PLUGIN_TYPE,
40157 + .id = DEGENERATE_HASH_ID,
40158 + .pops = &hash_plugin_ops,
40159 + .label = "degenerate hash",
40160 + .desc = "Degenerate hash: only for testing",
40161 + .linkage = {NULL, NULL}
40162 + },
40163 + .hash = hash_deg
40164 + }
40165 +};
40166 +
40167 +/* Make Linus happy.
40168 + Local variables:
40169 + c-indentation-style: "K&R"
40170 + mode-name: "LC"
40171 + c-basic-offset: 8
40172 + tab-width: 8
40173 + fill-column: 120
40174 + End:
40175 +*/
40176 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.33/fs/reiser4/plugin/inode_ops.c
40177 --- linux-2.6.33.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 01:00:00.000000000 +0100
40178 +++ linux-2.6.33/fs/reiser4/plugin/inode_ops.c 2010-03-04 19:33:22.000000000 +0100
40179 @@ -0,0 +1,906 @@
40180 +/*
40181 + * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
40182 + */
40183 +
40184 +/*
40185 + * this file contains typical implementations for most of the methods of struct
40186 + * inode_operations
40187 + */
40188 +
40189 +#include "../inode.h"
40190 +#include "../safe_link.h"
40191 +
40192 +#include <linux/quotaops.h>
40193 +#include <linux/namei.h>
40194 +
40195 +static int create_vfs_object(struct inode *parent, struct dentry *dentry,
40196 + reiser4_object_create_data *data);
40197 +
40198 +/**
40199 + * reiser4_create_common - create of inode operations
40200 + * @parent: inode of parent directory
40201 + * @dentry: dentry of new object to create
40202 + * @mode: the permissions to use
40203 + * @nameidata:
40204 + *
40205 + * This is common implementation of vfs's create method of struct
40206 + * inode_operations.
40207 + * Creates regular file using file plugin from parent directory plugin set.
40208 + */
40209 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
40210 + int mode, struct nameidata *nameidata)
40211 +{
40212 + reiser4_object_create_data data;
40213 + file_plugin *fplug;
40214 +
40215 + memset(&data, 0, sizeof data);
40216 + data.mode = S_IFREG | mode;
40217 + fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
40218 + if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
40219 + warning("vpf-1900", "'%s' is not a regular file plugin.",
40220 + fplug->h.label);
40221 + return RETERR(-EIO);
40222 + }
40223 + data.id = fplug->h.id;
40224 + return create_vfs_object(parent, dentry, &data);
40225 +}
40226 +
40227 +int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
40228 +void check_light_weight(struct inode *inode, struct inode *parent);
40229 +
40230 +/**
40231 + * reiser4_lookup_common - lookup of inode operations
40232 + * @parent: inode of directory to lookup into
40233 + * @dentry: name to look for
40234 + * @nameidata:
40235 + *
40236 + * This is common implementation of vfs's lookup method of struct
40237 + * inode_operations.
40238 + */
40239 +struct dentry *reiser4_lookup_common(struct inode *parent,
40240 + struct dentry *dentry,
40241 + struct nameidata *nameidata)
40242 +{
40243 + reiser4_context *ctx;
40244 + int result;
40245 + struct dentry *new;
40246 + struct inode *inode;
40247 + reiser4_dir_entry_desc entry;
40248 +
40249 + ctx = reiser4_init_context(parent->i_sb);
40250 + if (IS_ERR(ctx))
40251 + return (struct dentry *)ctx;
40252 +
40253 + /* set up operations on dentry. */
40254 + dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
40255 +
40256 + result = reiser4_lookup_name(parent, dentry, &entry.key);
40257 + if (result) {
40258 + context_set_commit_async(ctx);
40259 + reiser4_exit_context(ctx);
40260 + if (result == -ENOENT) {
40261 + /* object not found */
40262 + if (!IS_DEADDIR(parent))
40263 + d_add(dentry, NULL);
40264 + return NULL;
40265 + }
40266 + return ERR_PTR(result);
40267 + }
40268 +
40269 + inode = reiser4_iget(parent->i_sb, &entry.key, 0);
40270 + if (IS_ERR(inode)) {
40271 + context_set_commit_async(ctx);
40272 + reiser4_exit_context(ctx);
40273 + return ERR_PTR(PTR_ERR(inode));
40274 + }
40275 +
40276 + /* success */
40277 + check_light_weight(inode, parent);
40278 + new = d_splice_alias(inode, dentry);
40279 + reiser4_iget_complete(inode);
40280 +
40281 + /* prevent balance_dirty_pages() from being called: we don't want to
40282 + * do this under directory i_mutex. */
40283 + context_set_commit_async(ctx);
40284 + reiser4_exit_context(ctx);
40285 + return new;
40286 +}
40287 +
40288 +static reiser4_block_nr common_estimate_link(struct inode *parent,
40289 + struct inode *object);
40290 +int reiser4_update_dir(struct inode *);
40291 +
40292 +/**
40293 + * reiser4_link_common - link of inode operations
40294 + * @existing: dentry of object which is to get new name
40295 + * @parent: directory where new name is to be created
40296 + * @newname: new name
40297 + *
40298 + * This is common implementation of vfs's link method of struct
40299 + * inode_operations.
40300 + */
40301 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
40302 + struct dentry *newname)
40303 +{
40304 + reiser4_context *ctx;
40305 + int result;
40306 + struct inode *object;
40307 + dir_plugin *parent_dplug;
40308 + reiser4_dir_entry_desc entry;
40309 + reiser4_object_create_data data;
40310 + reiser4_block_nr reserve;
40311 +
40312 + ctx = reiser4_init_context(parent->i_sb);
40313 + if (IS_ERR(ctx))
40314 + return PTR_ERR(ctx);
40315 +
40316 + assert("nikita-1431", existing != NULL);
40317 + assert("nikita-1432", parent != NULL);
40318 + assert("nikita-1433", newname != NULL);
40319 +
40320 + object = existing->d_inode;
40321 + assert("nikita-1434", object != NULL);
40322 +
40323 + /* check for race with create_object() */
40324 + if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
40325 + context_set_commit_async(ctx);
40326 + reiser4_exit_context(ctx);
40327 + return RETERR(-E_REPEAT);
40328 + }
40329 +
40330 + parent_dplug = inode_dir_plugin(parent);
40331 +
40332 + memset(&entry, 0, sizeof entry);
40333 + entry.obj = object;
40334 +
40335 + data.mode = object->i_mode;
40336 + data.id = inode_file_plugin(object)->h.id;
40337 +
40338 + reserve = common_estimate_link(parent, existing->d_inode);
40339 + if ((__s64) reserve < 0) {
40340 + context_set_commit_async(ctx);
40341 + reiser4_exit_context(ctx);
40342 + return reserve;
40343 + }
40344 +
40345 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
40346 + context_set_commit_async(ctx);
40347 + reiser4_exit_context(ctx);
40348 + return RETERR(-ENOSPC);
40349 + }
40350 +
40351 + /*
40352 + * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
40353 + * means that link(2) can race against unlink(2) or rename(2), and
40354 + * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
40355 + *
40356 + * For such inode we have to undo special processing done in
40357 + * reiser4_unlink() viz. creation of safe-link.
40358 + */
40359 + if (unlikely(object->i_nlink == 0)) {
40360 + result = safe_link_del(reiser4_tree_by_inode(object),
40361 + get_inode_oid(object), SAFE_UNLINK);
40362 + if (result != 0) {
40363 + context_set_commit_async(ctx);
40364 + reiser4_exit_context(ctx);
40365 + return result;
40366 + }
40367 + }
40368 +
40369 + /* increment nlink of @existing and update its stat data */
40370 + result = reiser4_add_nlink(object, parent, 1);
40371 + if (result == 0) {
40372 + /* add entry to the parent */
40373 + result =
40374 + parent_dplug->add_entry(parent, newname, &data, &entry);
40375 + if (result != 0) {
40376 + /* failed to add entry to the parent, decrement nlink
40377 + of @existing */
40378 + reiser4_del_nlink(object, parent, 1);
40379 + /*
40380 +			 * now, if that failed, we have a file with too big
40381 +			 * an nlink: a space leak, but much better than a
40382 +			 * directory entry pointing to nowhere
40383 + */
40384 + }
40385 + }
40386 + if (result == 0) {
40387 + atomic_inc(&object->i_count);
40388 + /*
40389 + * Upon successful completion, link() shall mark for update
40390 + * the st_ctime field of the file. Also, the st_ctime and
40391 + * st_mtime fields of the directory that contains the new
40392 + * entry shall be marked for update. --SUS
40393 + */
40394 + result = reiser4_update_dir(parent);
40395 + }
40396 + if (result == 0)
40397 + d_instantiate(newname, existing->d_inode);
40398 +
40399 + context_set_commit_async(ctx);
40400 + reiser4_exit_context(ctx);
40401 + return result;
40402 +}
40403 +
40404 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
40405 +
40406 +/**
40407 + * reiser4_unlink_common - unlink of inode operations
40408 + * @parent: inode of directory to remove name from
40409 + * @victim: name to be removed
40410 + *
40411 + * This is common implementation of vfs's unlink method of struct
40412 + * inode_operations.
40413 + */
40414 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
40415 +{
40416 + reiser4_context *ctx;
40417 + int result;
40418 + struct inode *object;
40419 + file_plugin *fplug;
40420 +
40421 + ctx = reiser4_init_context(parent->i_sb);
40422 + if (IS_ERR(ctx))
40423 + return PTR_ERR(ctx);
40424 +
40425 + object = victim->d_inode;
40426 + fplug = inode_file_plugin(object);
40427 + assert("nikita-2882", fplug->detach != NULL);
40428 +
40429 + result = unlink_check_and_grab(parent, victim);
40430 + if (result != 0) {
40431 + context_set_commit_async(ctx);
40432 + reiser4_exit_context(ctx);
40433 + return result;
40434 + }
40435 +
40436 + result = fplug->detach(object, parent);
40437 + if (result == 0) {
40438 + dir_plugin *parent_dplug;
40439 + reiser4_dir_entry_desc entry;
40440 +
40441 + parent_dplug = inode_dir_plugin(parent);
40442 + memset(&entry, 0, sizeof entry);
40443 +
40444 + /* first, delete directory entry */
40445 + result = parent_dplug->rem_entry(parent, victim, &entry);
40446 + if (result == 0) {
40447 + /*
40448 + * if name was removed successfully, we _have_ to
40449 + * return 0 from this function, because upper level
40450 + * caller (vfs_{rmdir,unlink}) expect this.
40451 + *
40452 + * now that directory entry is removed, update
40453 + * stat-data
40454 + */
40455 + reiser4_del_nlink(object, parent, 1);
40456 + /*
40457 + * Upon successful completion, unlink() shall mark for
40458 + * update the st_ctime and st_mtime fields of the
40459 + * parent directory. Also, if the file's link count is
40460 + * not 0, the st_ctime field of the file shall be
40461 + * marked for update. --SUS
40462 + */
40463 + reiser4_update_dir(parent);
40464 + /* add safe-link for this file */
40465 + if (object->i_nlink == 0)
40466 + safe_link_add(object, SAFE_UNLINK);
40467 + }
40468 + }
40469 +
40470 + if (unlikely(result != 0)) {
40471 + if (result != -ENOMEM)
40472 + warning("nikita-3398", "Cannot unlink %llu (%i)",
40473 + (unsigned long long)get_inode_oid(object),
40474 + result);
40475 + /* if operation failed commit pending inode modifications to
40476 + * the stat-data */
40477 + reiser4_update_sd(object);
40478 + reiser4_update_sd(parent);
40479 + }
40480 +
40481 + reiser4_release_reserved(object->i_sb);
40482 +
40483 +	/* @object's i_ctime was updated by the ->rem_link() method. */
40484 +
40485 +	/* @victim may already have been removed from the disk by this time.
40486 +	   The inode is then marked so that iput() won't try to remove its
40487 +	   stat data, but the inode itself is still there.
40488 + */
40489 +
40490 + /*
40491 + * we cannot release directory semaphore here, because name has
40492 + * already been deleted, but dentry (@victim) still exists. Prevent
40493 + * balance_dirty_pages() from being called on exiting this context: we
40494 + * don't want to do this under directory i_mutex.
40495 + */
40496 + context_set_commit_async(ctx);
40497 + reiser4_exit_context(ctx);
40498 + return result;
40499 +}
40500 +
40501 +/**
40502 + * reiser4_symlink_common - symlink of inode operations
40503 + * @parent: inode of parent directory
40504 + * @dentry: dentry of object to be created
40505 + * @linkname: string symlink is to contain
40506 + *
40507 + * This is common implementation of vfs's symlink method of struct
40508 + * inode_operations.
40509 + * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
40510 + */
40511 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
40512 + const char *linkname)
40513 +{
40514 + reiser4_object_create_data data;
40515 +
40516 + memset(&data, 0, sizeof data);
40517 + data.name = linkname;
40518 + data.id = SYMLINK_FILE_PLUGIN_ID;
40519 + data.mode = S_IFLNK | S_IRWXUGO;
40520 + return create_vfs_object(parent, dentry, &data);
40521 +}
40522 +
40523 +/**
40524 + * reiser4_mkdir_common - mkdir of inode operations
40525 + * @parent: inode of parent directory
40526 + * @dentry: dentry of object to be created
40527 + * @mode: the permissions to use
40528 + *
40529 + * This is common implementation of vfs's mkdir method of struct
40530 + * inode_operations.
40531 + * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
40532 + */
40533 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
40534 +{
40535 + reiser4_object_create_data data;
40536 +
40537 + memset(&data, 0, sizeof data);
40538 + data.mode = S_IFDIR | mode;
40539 + data.id = DIRECTORY_FILE_PLUGIN_ID;
40540 + return create_vfs_object(parent, dentry, &data);
40541 +}
40542 +
40543 +/**
40544 + * reiser4_mknod_common - mknod of inode operations
40545 + * @parent: inode of parent directory
40546 + * @dentry: dentry of object to be created
40547 + * @mode: the permissions to use and file type
40548 + * @rdev: minor and major of new device file
40549 + *
40550 + * This is common implementation of vfs's mknod method of struct
40551 + * inode_operations.
40552 + * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
40553 + */
40554 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
40555 + int mode, dev_t rdev)
40556 +{
40557 + reiser4_object_create_data data;
40558 +
40559 + memset(&data, 0, sizeof data);
40560 + data.mode = mode;
40561 + data.rdev = rdev;
40562 + data.id = SPECIAL_FILE_PLUGIN_ID;
40563 + return create_vfs_object(parent, dentry, &data);
40564 +}
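+/*
+ * Illustrative note: reiser4_create_common(), reiser4_symlink_common(),
+ * reiser4_mkdir_common() and reiser4_mknod_common() above all funnel
+ * into create_vfs_object() and differ only in how they fill the
+ * reiser4_object_create_data: the file plugin id, the mode bits, and
+ * rdev/name where applicable.
+ */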
40565 +
40566 +/*
40567 + * implementation of vfs's rename method of struct inode_operations for typical
40568 + * directory is in inode_ops_rename.c
40569 + */
40570 +
40571 +/**
40572 + * reiser4_follow_link_common - follow_link of inode operations
40573 + * @dentry: dentry of symlink
40574 + * @data:
40575 + *
40576 + * This is common implementation of vfs's follow_link method of struct
40577 + * inode_operations.
40578 + * Assumes that inode's i_private points to the content of symbolic link.
40579 + */
40580 +void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
40581 +{
40582 + assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
40583 +
40584 + if (!dentry->d_inode->i_private
40585 + || !reiser4_inode_get_flag(dentry->d_inode,
40586 + REISER4_GENERIC_PTR_USED))
40587 + return ERR_PTR(RETERR(-EINVAL));
40588 + nd_set_link(nd, dentry->d_inode->i_private);
40589 + return NULL;
40590 +}
40591 +
40592 +/**
40593 + * reiser4_permission_common - permission of inode operations
40594 + * @inode: inode to check permissions for
40595 + * @mask: mode bits to check permissions for
40596 + * @nameidata:
40597 + *
40598 + * Uses generic function to check for rwx permissions.
40599 + */
40600 +int reiser4_permission_common(struct inode *inode, int mask)
40601 +{
40602 + return generic_permission(inode, mask, NULL);
40603 +}
40604 +
40605 +static int setattr_reserve(reiser4_tree *);
40606 +
40607 +/* this is common implementation of vfs's setattr method of struct
40608 + inode_operations
40609 +*/
40610 +int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
40611 +{
40612 + reiser4_context *ctx;
40613 + struct inode *inode;
40614 + int result;
40615 +
40616 + inode = dentry->d_inode;
40617 + result = inode_change_ok(inode, attr);
40618 + if (result)
40619 + return result;
40620 +
40621 + ctx = reiser4_init_context(inode->i_sb);
40622 + if (IS_ERR(ctx))
40623 + return PTR_ERR(ctx);
40624 +
40625 + assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
40626 +
40627 + /*
40628 + * grab disk space and call standard inode_setattr().
40629 + */
40630 + result = setattr_reserve(reiser4_tree_by_inode(inode));
40631 + if (!result) {
40632 + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
40633 + || (attr->ia_valid & ATTR_GID
40634 + && attr->ia_gid != inode->i_gid)) {
40635 + result = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
40636 + if (result) {
40637 + context_set_commit_async(ctx);
40638 + reiser4_exit_context(ctx);
40639 + return result;
40640 + }
40641 + }
40642 + result = inode_setattr(inode, attr);
40643 + if (!result)
40644 + reiser4_update_sd(inode);
40645 + }
40646 +
40647 + context_set_commit_async(ctx);
40648 + reiser4_exit_context(ctx);
40649 + return result;
40650 +}
40651 +
40652 +/* this is common implementation of vfs's getattr method of struct
40653 + inode_operations
40654 +*/
40655 +int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
40656 + struct dentry *dentry, struct kstat *stat)
40657 +{
40658 + struct inode *obj;
40659 +
40660 + assert("nikita-2298", dentry != NULL);
40661 + assert("nikita-2299", stat != NULL);
40662 + assert("nikita-2300", dentry->d_inode != NULL);
40663 +
40664 + obj = dentry->d_inode;
40665 +
40666 + stat->dev = obj->i_sb->s_dev;
40667 + stat->ino = oid_to_uino(get_inode_oid(obj));
40668 + stat->mode = obj->i_mode;
40669 +	/* don't confuse userland with a huge nlink. This is not entirely
40670 +	 * correct, because nlink_t is not necessarily a signed 16 bit type. */
40671 + stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
40672 + stat->uid = obj->i_uid;
40673 + stat->gid = obj->i_gid;
40674 + stat->rdev = obj->i_rdev;
40675 + stat->atime = obj->i_atime;
40676 + stat->mtime = obj->i_mtime;
40677 + stat->ctime = obj->i_ctime;
40678 + stat->size = obj->i_size;
40679 + stat->blocks =
40680 + (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
40681 + /* "preferred" blocksize for efficient file system I/O */
40682 + stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
40683 +
40684 + return 0;
40685 +}
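+/*
+ * Illustration (hypothetical numbers): the st_blocks computation above
+ * is a plain round-up division. With a VFS_BLKSIZE of 512
+ * (VFS_BLKSIZE_BITS == 9), a 1000-byte file reports
+ * (1000 + 511) >> 9 == 2 blocks.
+ */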
40686 +
40687 +/* Estimate the maximum amount of nodes which might be allocated or changed on
40688 + typical new object creation. Typical creation consists of calling create
40689 + method of file plugin, adding directory entry to parent and update parent
40690 + directory's stat data.
40691 +*/
40692 +static reiser4_block_nr estimate_create_vfs_object(struct inode *parent,
40693 + /* parent object */
40694 + struct inode *object
40695 + /* object */)
40696 +{
40697 + assert("vpf-309", parent != NULL);
40698 + assert("vpf-307", object != NULL);
40699 +
40700 + return
40701 + /* object creation estimation */
40702 + inode_file_plugin(object)->estimate.create(object) +
40703 + /* stat data of parent directory estimation */
40704 + inode_file_plugin(parent)->estimate.update(parent) +
40705 + /* adding entry estimation */
40706 + inode_dir_plugin(parent)->estimate.add_entry(parent) +
40707 + /* to undo in the case of failure */
40708 + inode_dir_plugin(parent)->estimate.rem_entry(parent);
40709 +}
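+/*
+ * Illustration (hypothetical numbers): if create(object) estimates 1
+ * block, update(parent) 1, and add_entry()/rem_entry() 2 each, the
+ * caller grabs 1 + 1 + 2 + 2 = 6 blocks up front; the rem_entry() share
+ * is reserved only so that a failed creation can be undone.
+ */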
40710 +
40711 +/* Create child in directory.
40712 +
40713 + . get object's plugin
40714 + . get fresh inode
40715 + . initialize inode
40716 + . add object's stat-data
40717 + . initialize object's directory
40718 + . add entry to the parent
40719 + . instantiate dentry
40720 +
40721 +*/
40722 +static int do_create_vfs_child(reiser4_object_create_data * data,/* parameters
40723 + of new
40724 + object */
40725 + struct inode **retobj)
40726 +{
40727 + int result;
40728 +
40729 +	struct dentry *dentry;	/* new name */
40730 +	struct inode *parent;	/* parent object */
40731 +
40732 + dir_plugin *par_dir; /* directory plugin on the parent */
40733 + dir_plugin *obj_dir; /* directory plugin on the new object */
40734 + file_plugin *obj_plug; /* object plugin on the new object */
40735 + struct inode *object; /* new object */
40736 + reiser4_block_nr reserve;
40737 +
40738 + reiser4_dir_entry_desc entry; /* new directory entry */
40739 +
40740 + assert("nikita-1420", data != NULL);
40741 + parent = data->parent;
40742 + dentry = data->dentry;
40743 +
40744 + assert("nikita-1418", parent != NULL);
40745 + assert("nikita-1419", dentry != NULL);
40746 +
40747 + /* check, that name is acceptable for parent */
40748 + par_dir = inode_dir_plugin(parent);
40749 + if (par_dir->is_name_acceptable &&
40750 + !par_dir->is_name_acceptable(parent,
40751 + dentry->d_name.name,
40752 + (int)dentry->d_name.len))
40753 + return RETERR(-ENAMETOOLONG);
40754 +
40755 + result = 0;
40756 + obj_plug = file_plugin_by_id((int)data->id);
40757 + if (obj_plug == NULL) {
40758 + warning("nikita-430", "Cannot find plugin %i", data->id);
40759 + return RETERR(-ENOENT);
40760 + }
40761 + object = new_inode(parent->i_sb);
40762 + if (object == NULL)
40763 + return RETERR(-ENOMEM);
40764 + /* we'll update i_nlink below */
40765 + object->i_nlink = 0;
40766 + /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
40767 + * to simplify error handling: if some error occurs before i_ino is
40768 + * initialized with oid, i_ino should already be set to some
40769 + * distinguished value. */
40770 + object->i_ino = 0;
40771 +
40772 + /* So that on error iput will be called. */
40773 + *retobj = object;
40774 +
40775 + if (vfs_dq_alloc_inode(object)) {
40776 + vfs_dq_drop(object);
40777 + object->i_flags |= S_NOQUOTA;
40778 + return RETERR(-EDQUOT);
40779 + }
40780 +
40781 + memset(&entry, 0, sizeof entry);
40782 + entry.obj = object;
40783 +
40784 + set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
40785 + file_plugin_to_plugin(obj_plug));
40786 + result = obj_plug->set_plug_in_inode(object, parent, data);
40787 + if (result) {
40788 + warning("nikita-431", "Cannot install plugin %i on %llx",
40789 + data->id, (unsigned long long)get_inode_oid(object));
40790 + vfs_dq_free_inode(object);
40791 + object->i_flags |= S_NOQUOTA;
40792 + return result;
40793 + }
40794 +
40795 + /* reget plugin after installation */
40796 + obj_plug = inode_file_plugin(object);
40797 +
40798 + if (obj_plug->create_object == NULL) {
40799 + vfs_dq_free_inode(object);
40800 + object->i_flags |= S_NOQUOTA;
40801 + return RETERR(-EPERM);
40802 + }
40803 +
40804 + /* if any of hash, tail, sd or permission plugins for newly created
40805 + object are not set yet set them here inheriting them from parent
40806 + directory
40807 + */
40808 + assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
40809 + result = obj_plug->adjust_to_parent(object,
40810 + parent,
40811 + object->i_sb->s_root->d_inode);
40812 + if (result == 0)
40813 + result = finish_pset(object);
40814 + if (result != 0) {
40815 + warning("nikita-432", "Cannot inherit from %llx to %llx",
40816 + (unsigned long long)get_inode_oid(parent),
40817 + (unsigned long long)get_inode_oid(object));
40818 + vfs_dq_free_inode(object);
40819 + object->i_flags |= S_NOQUOTA;
40820 + return result;
40821 + }
40822 +
40823 + /* setup inode and file-operations for this inode */
40824 + setup_inode_ops(object, data);
40825 +
40826 + /* call file plugin's method to initialize plugin specific part of
40827 + * inode */
40828 + if (obj_plug->init_inode_data)
40829 + obj_plug->init_inode_data(object, data, 1/*create */);
40830 +
40831 + /* obtain directory plugin (if any) for new object. */
40832 + obj_dir = inode_dir_plugin(object);
40833 + if (obj_dir != NULL && obj_dir->init == NULL) {
40834 + vfs_dq_free_inode(object);
40835 + object->i_flags |= S_NOQUOTA;
40836 + return RETERR(-EPERM);
40837 + }
40838 +
40839 + reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
40840 +
40841 + reserve = estimate_create_vfs_object(parent, object);
40842 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
40843 + vfs_dq_free_inode(object);
40844 + object->i_flags |= S_NOQUOTA;
40845 + return RETERR(-ENOSPC);
40846 + }
40847 +
40848 +	/* mark inode `immutable'. We disable changes to the file being
40849 +	   created until a valid directory entry for it is inserted.
40850 +	   Otherwise, if the file were expanded and insertion of the directory
40851 +	   entry fails, we would have to remove the file, but we only allotted
40852 +	   enough space in the transaction to remove an _empty_ file. 3.x code
40853 +	   used to remove stat data in a different transaction, thus possibly
40854 +	   leaking disk space on a crash. This all only matters if it's
40855 +	   possible to access a file without a name, for example, by inode number
40856 +	 */
40857 + reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
40858 +
40859 + /* create empty object, this includes allocation of new objectid. For
40860 + directories this implies creation of dot and dotdot */
40861 + assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
40862 +
40863 + /* mark inode as `loaded'. From this point onward
40864 + reiser4_delete_inode() will try to remove its stat-data. */
40865 + reiser4_inode_set_flag(object, REISER4_LOADED);
40866 +
40867 + result = obj_plug->create_object(object, parent, data);
40868 + if (result != 0) {
40869 + reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40870 + if (result != -ENAMETOOLONG && result != -ENOMEM)
40871 + warning("nikita-2219",
40872 + "Failed to create sd for %llu",
40873 + (unsigned long long)get_inode_oid(object));
40874 + vfs_dq_free_inode(object);
40875 + object->i_flags |= S_NOQUOTA;
40876 + return result;
40877 + }
40878 +
40879 + if (obj_dir != NULL)
40880 + result = obj_dir->init(object, parent, data);
40881 + if (result == 0) {
40882 + assert("nikita-434", !reiser4_inode_get_flag(object,
40883 + REISER4_NO_SD));
40884 + /* insert inode into VFS hash table */
40885 + insert_inode_hash(object);
40886 + /* create entry */
40887 + result = par_dir->add_entry(parent, dentry, data, &entry);
40888 + if (result == 0) {
40889 + result = reiser4_add_nlink(object, parent, 0);
40890 + /* If O_CREAT is set and the file did not previously
40891 + exist, upon successful completion, open() shall
40892 + mark for update the st_atime, st_ctime, and
40893 + st_mtime fields of the file and the st_ctime and
40894 + st_mtime fields of the parent directory. --SUS
40895 + */
40896 + /* @object times are already updated by
40897 + reiser4_add_nlink() */
40898 + if (result == 0)
40899 + reiser4_update_dir(parent);
40900 + if (result != 0)
40901 + /* cleanup failure to add nlink */
40902 + par_dir->rem_entry(parent, dentry, &entry);
40903 + }
40904 + if (result != 0)
40905 + /* cleanup failure to add entry */
40906 + obj_plug->detach(object, parent);
40907 + } else if (result != -ENOMEM)
40908 + warning("nikita-2219", "Failed to initialize dir for %llu: %i",
40909 + (unsigned long long)get_inode_oid(object), result);
40910 +
40911 + /*
40912 + * update stat-data, committing all pending modifications to the inode
40913 + * fields.
40914 + */
40915 + reiser4_update_sd(object);
40916 + if (result != 0) {
40917 + vfs_dq_free_inode(object);
40918 + object->i_flags |= S_NOQUOTA;
40919 + /* if everything was ok (result == 0), parent stat-data is
40920 + * already updated above (update_parent_dir()) */
40921 + reiser4_update_sd(parent);
40922 + /* failure to create entry, remove object */
40923 + obj_plug->delete_object(object);
40924 + }
40925 +
40926 + /* file has name now, clear immutable flag */
40927 + reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40928 +
40929 + /* on error, iput() will call ->delete_inode(). We should keep track
40930 + of the existence of stat-data for this inode and avoid attempt to
40931 + remove it in reiser4_delete_inode(). This is accomplished through
40932 + REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
40933 + */
40934 + return result;
40935 +}
40936 +
40937 +/* this is helper for common implementations of reiser4_mkdir, reiser4_create,
40938 + reiser4_mknod and reiser4_symlink
40939 +*/
40940 +static int
40941 +create_vfs_object(struct inode *parent,
40942 + struct dentry *dentry, reiser4_object_create_data * data)
40943 +{
40944 + reiser4_context *ctx;
40945 + int result;
40946 + struct inode *child;
40947 +
40948 + ctx = reiser4_init_context(parent->i_sb);
40949 + if (IS_ERR(ctx))
40950 + return PTR_ERR(ctx);
40951 + context_set_commit_async(ctx);
40952 +
40953 + data->parent = parent;
40954 + data->dentry = dentry;
40955 + child = NULL;
40956 + result = do_create_vfs_child(data, &child);
40957 + if (unlikely(result != 0)) {
40958 + if (child != NULL) {
40959 + reiser4_make_bad_inode(child);
40960 + iput(child);
40961 + }
40962 + } else
40963 + d_instantiate(dentry, child);
40964 +
40965 + reiser4_exit_context(ctx);
40966 + return result;
40967 +}
40968 +
40969 +/**
40970 + * helper for link_common. Estimate disk space necessary to add a link
40971 + * from @parent to @object
40972 + */
40973 +static reiser4_block_nr common_estimate_link(struct inode *parent /* parent
40974 + * directory
40975 + */,
40976 + struct inode *object /* object to
40977 + * which new
40978 + * link is
40979 + * being
40980 + * created */)
40981 +{
40982 + reiser4_block_nr res = 0;
40983 + file_plugin *fplug;
40984 + dir_plugin *dplug;
40985 +
40986 + assert("vpf-317", object != NULL);
40987 + assert("vpf-318", parent != NULL);
40988 +
40989 + fplug = inode_file_plugin(object);
40990 + dplug = inode_dir_plugin(parent);
40991 + /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice
40992 + * instead of multiplying by 2? */
40993 + /* reiser4_add_nlink(object) */
40994 + res += fplug->estimate.update(object);
40995 + /* add_entry(parent) */
40996 + res += dplug->estimate.add_entry(parent);
40997 + /* reiser4_del_nlink(object) */
40998 + res += fplug->estimate.update(object);
40999 + /* update_dir(parent) */
41000 + res += inode_file_plugin(parent)->estimate.update(parent);
41001 + /* safe-link */
41002 + res += estimate_one_item_removal(reiser4_tree_by_inode(object));
41003 +
41004 + return res;
41005 +}
41006 +
41007 +/* Estimate disk space necessary to remove a link between @parent and
41008 + @object.
41009 +*/
41010 +static reiser4_block_nr estimate_unlink(struct inode *parent /* parent
41011 + * directory */,
41012 +					struct inode *object /* object whose
41013 +							      * link is being
41014 +							      * removed
41015 +							      */)
41016 +{
41017 + reiser4_block_nr res = 0;
41018 + file_plugin *fplug;
41019 + dir_plugin *dplug;
41020 +
41021 + assert("vpf-317", object != NULL);
41022 + assert("vpf-318", parent != NULL);
41023 +
41024 + fplug = inode_file_plugin(object);
41025 + dplug = inode_dir_plugin(parent);
41026 +
41027 + /* rem_entry(parent) */
41028 + res += dplug->estimate.rem_entry(parent);
41029 + /* reiser4_del_nlink(object) */
41030 + res += fplug->estimate.update(object);
41031 + /* update_dir(parent) */
41032 + res += inode_file_plugin(parent)->estimate.update(parent);
41033 + /* fplug->unlink */
41034 + res += fplug->estimate.unlink(object, parent);
41035 + /* safe-link */
41036 + res += estimate_one_insert_item(reiser4_tree_by_inode(object));
41037 +
41038 + return res;
41039 +}
41040 +
41041 +/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
41042 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
41043 +{
41044 + file_plugin *fplug;
41045 + struct inode *child;
41046 + int result;
41047 +
41048 + result = 0;
41049 + child = victim->d_inode;
41050 + fplug = inode_file_plugin(child);
41051 +
41052 + /* check for race with create_object() */
41053 + if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
41054 + return RETERR(-E_REPEAT);
41055 + /* object being deleted should have stat data */
41056 + assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
41057 +
41058 + /* ask object plugin */
41059 + if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
41060 + return RETERR(-ENOTEMPTY);
41061 +
41062 + result = (int)estimate_unlink(parent, child);
41063 + if (result < 0)
41064 + return result;
41065 +
41066 + return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
41067 +}
41068 +
41069 +/* helper for reiser4_setattr_common */
41070 +static int setattr_reserve(reiser4_tree * tree)
41071 +{
41072 + assert("vs-1096", is_grab_enabled(get_current_context()));
41073 + return reiser4_grab_space(estimate_one_insert_into_item(tree),
41074 + BA_CAN_COMMIT);
41075 +}
41076 +
41077 +/* helper function. Standards require that for many file-system operations
41078 + on success ctime and mtime of parent directory is to be updated. */
41079 +int reiser4_update_dir(struct inode *dir)
41080 +{
41081 + assert("nikita-2525", dir != NULL);
41082 +
41083 + dir->i_ctime = dir->i_mtime = CURRENT_TIME;
41084 + return reiser4_update_sd(dir);
41085 +}
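
The create path above repeats the same two-line quota cleanup -- vfs_dq_free_inode() followed by setting S_NOQUOTA -- before every error return. A minimal sketch of how that pair could be factored into a single helper; the helper name is illustrative and not defined anywhere in this patch:

static inline int fail_create(struct inode *object, int error)
{
	/* return the quota charge taken by vfs_dq_alloc_inode() and keep
	 * later quota operations away from the half-constructed inode */
	vfs_dq_free_inode(object);
	object->i_flags |= S_NOQUOTA;
	return error;
}

With it, each bail-out in do_create_vfs_child() collapses to a one-liner such as "return fail_create(object, RETERR(-EPERM));".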
41086 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.33/fs/reiser4/plugin/inode_ops_rename.c
41087 --- linux-2.6.33.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 01:00:00.000000000 +0100
41088 +++ linux-2.6.33/fs/reiser4/plugin/inode_ops_rename.c 2010-03-04 19:33:22.000000000 +0100
41089 @@ -0,0 +1,925 @@
41090 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
41091 + * reiser4/README */
41092 +
41093 +#include "../inode.h"
41094 +#include "../safe_link.h"
41095 +
41096 +static const char *possible_leak = "Possible disk space leak.";
41097 +
41098 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
41099 +
41100 + Helper function called from hashed_rename() */
41101 +static int replace_name(struct inode *to_inode, /* inode where @from_coord is
41102 + * to be re-targeted at */
41103 + struct inode *from_dir, /* directory where @from_coord
41104 + * lives */
41105 + struct inode *from_inode, /* inode @from_coord
41106 +					  * originally points to */
41107 + coord_t *from_coord, /* where directory entry is in
41108 + * the tree */
41109 + lock_handle * from_lh/* lock handle on @from_coord */)
41110 +{
41111 + item_plugin *from_item;
41112 + int result;
41113 + znode *node;
41114 +
41115 + coord_clear_iplug(from_coord);
41116 + node = from_coord->node;
41117 + result = zload(node);
41118 + if (result != 0)
41119 + return result;
41120 + from_item = item_plugin_by_coord(from_coord);
41121 + if (plugin_of_group(item_plugin_by_coord(from_coord),
41122 + DIR_ENTRY_ITEM_TYPE)) {
41123 + reiser4_key to_key;
41124 +
41125 + build_sd_key(to_inode, &to_key);
41126 +
41127 + /* everything is found and prepared to change directory entry
41128 + at @from_coord to point to @to_inode.
41129 +
41130 + @to_inode is just about to get new name, so bump its link
41131 + counter.
41132 +
41133 + */
41134 + result = reiser4_add_nlink(to_inode, from_dir, 0);
41135 + if (result != 0) {
41136 + /* Don't issue warning: this may be plain -EMLINK */
41137 + zrelse(node);
41138 + return result;
41139 + }
41140 +
41141 + result =
41142 + from_item->s.dir.update_key(from_coord, &to_key, from_lh);
41143 + if (result != 0) {
41144 + reiser4_del_nlink(to_inode, from_dir, 0);
41145 + zrelse(node);
41146 + return result;
41147 + }
41148 +
41149 + /* @from_inode just lost its name, he-he.
41150 +
41151 + If @from_inode was directory, it contained dotdot pointing
41152 + to @from_dir. @from_dir i_nlink will be decreased when
41153 +		   iput() is called on @from_inode.
41154 +
41155 + If file-system is not ADG (hard-links are
41156 + supported on directories), iput(from_inode) will not remove
41157 + @from_inode, and thus above is incorrect, but hard-links on
41158 + directories are problematic in many other respects.
41159 + */
41160 + result = reiser4_del_nlink(from_inode, from_dir, 0);
41161 + if (result != 0) {
41162 + warning("nikita-2330",
41163 + "Cannot remove link from source: %i. %s",
41164 + result, possible_leak);
41165 + }
41166 + /* Has to return success, because entry is already
41167 + * modified. */
41168 + result = 0;
41169 +
41170 +		/* NOTE-NIKITA consider calling plugin method instead of
41171 + accessing inode fields directly. */
41172 + from_dir->i_mtime = CURRENT_TIME;
41173 + } else {
41174 + warning("nikita-2326", "Unexpected item type");
41175 + result = RETERR(-EIO);
41176 + }
41177 + zrelse(node);
41178 + return result;
41179 +}
41180 +
41181 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh
41182 +
41183 + Helper function used by hashed_rename(). */
41184 +static int add_name(struct inode *inode, /* inode where @coord is to be
41185 + * re-targeted at */
41186 + struct inode *dir, /* directory where @coord lives */
41187 + struct dentry *name, /* new name */
41188 + coord_t *coord, /* where directory entry is in the tree
41189 + */
41190 + lock_handle * lh, /* lock handle on @coord */
41191 + int is_dir/* true, if @inode is directory */)
41192 +{
41193 + int result;
41194 + reiser4_dir_entry_desc entry;
41195 +
41196 + assert("nikita-2333", lh->node == coord->node);
41197 + assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
41198 +
41199 + memset(&entry, 0, sizeof entry);
41200 + entry.obj = inode;
41201 + /* build key of directory entry description */
41202 + inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
41203 +
41204 +	/* ext2 does this in a different order: it first inserts the new
41205 +	   entry, then increases the directory nlink. We don't want to do
41206 +	   this, because reiser4_add_nlink() calls the ->add_link() plugin
41207 +	   method, which can fail for whatever reason, leaving us with
41208 +	   cleanup problems.
41209 + */
41210 + /* @inode is getting new name */
41211 + reiser4_add_nlink(inode, dir, 0);
41212 + /* create @new_name in @new_dir pointing to
41213 + @old_inode */
41214 + result = WITH_COORD(coord,
41215 + inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
41216 + coord,
41217 + lh,
41218 + name,
41219 + &entry));
41220 + if (result != 0) {
41221 + int result2;
41222 + result2 = reiser4_del_nlink(inode, dir, 0);
41223 + if (result2 != 0) {
41224 + warning("nikita-2327",
41225 + "Cannot drop link on %lli %i. %s",
41226 + (unsigned long long)get_inode_oid(inode),
41227 + result2, possible_leak);
41228 + }
41229 + } else
41230 + INODE_INC_FIELD(dir, i_size);
41231 + return result;
41232 +}
41233 +
41234 +static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory
41235 + * where @old is
41236 + * located */
41237 + struct dentry *old_name,/* old name */
41238 + struct inode *new_dir, /* directory
41239 + * where @new is
41240 + * located */
41241 + struct dentry *new_name /* new name */)
41242 +{
41243 + reiser4_block_nr res1, res2;
41244 + dir_plugin * p_parent_old, *p_parent_new;
41245 + file_plugin * p_child_old, *p_child_new;
41246 +
41247 + assert("vpf-311", old_dir != NULL);
41248 + assert("vpf-312", new_dir != NULL);
41249 + assert("vpf-313", old_name != NULL);
41250 + assert("vpf-314", new_name != NULL);
41251 +
41252 + p_parent_old = inode_dir_plugin(old_dir);
41253 + p_parent_new = inode_dir_plugin(new_dir);
41254 + p_child_old = inode_file_plugin(old_name->d_inode);
41255 + if (new_name->d_inode)
41256 + p_child_new = inode_file_plugin(new_name->d_inode);
41257 + else
41258 + p_child_new = NULL;
41259 +
41260 + /* find_entry - can insert one leaf. */
41261 + res1 = res2 = 1;
41262 +
41263 + /* replace_name */
41264 + {
41265 + /* reiser4_add_nlink(p_child_old) and
41266 + * reiser4_del_nlink(p_child_old) */
41267 + res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
41268 + /* update key */
41269 + res1 += 1;
41270 + /* reiser4_del_nlink(p_child_new) */
41271 + if (p_child_new)
41272 + res1 += p_child_new->estimate.update(new_name->d_inode);
41273 + }
41274 +
41275 + /* else add_name */
41276 + {
41277 + /* reiser4_add_nlink(p_parent_new) and
41278 + * reiser4_del_nlink(p_parent_new) */
41279 + res2 +=
41280 + 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
41281 + /* reiser4_add_nlink(p_parent_old) */
41282 + res2 += p_child_old->estimate.update(old_name->d_inode);
41283 + /* add_entry(p_parent_new) */
41284 + res2 += p_parent_new->estimate.add_entry(new_dir);
41285 + /* reiser4_del_nlink(p_parent_old) */
41286 + res2 += p_child_old->estimate.update(old_name->d_inode);
41287 + }
41288 +
41289 + res1 = res1 < res2 ? res2 : res1;
41290 +
41291 + /* reiser4_write_sd(p_parent_new) */
41292 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
41293 +
41294 + /* reiser4_write_sd(p_child_new) */
41295 + if (p_child_new)
41296 + res1 += p_child_new->estimate.update(new_name->d_inode);
41297 +
41298 + /* hashed_rem_entry(p_parent_old) */
41299 + res1 += p_parent_old->estimate.rem_entry(old_dir);
41300 +
41301 + /* reiser4_del_nlink(p_child_old) */
41302 + res1 += p_child_old->estimate.update(old_name->d_inode);
41303 +
41304 + /* replace_name */
41305 + {
41306 + /* reiser4_add_nlink(p_parent_dir_new) */
41307 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
41308 + /* update_key */
41309 + res1 += 1;
41310 + /* reiser4_del_nlink(p_parent_new) */
41311 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
41312 + /* reiser4_del_nlink(p_parent_old) */
41313 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
41314 + }
41315 +
41316 + /* reiser4_write_sd(p_parent_old) */
41317 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
41318 +
41319 + /* reiser4_write_sd(p_child_old) */
41320 + res1 += p_child_old->estimate.update(old_name->d_inode);
41321 +
41322 + return res1;
41323 +}
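/*
 * Aside (illustrative summary, not part of the original comments): the
 * estimate above amounts to
 *
 *   reserve = 1                                  (find_entry leaf insert)
 *           + max(cost(replace_name), cost(add_name))
 *           + costs incurred on either path      (sd updates, rem_entry,
 *                                                 nlink drops, dotdot fixup)
 *
 * i.e. it budgets for the worse of the two mutually exclusive paths and
 * then adds everything that happens regardless of which one is taken.
 */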
41324 +
41325 +static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory
41326 + * where @old
41327 + * is located
41328 + */
41329 + struct dentry *old_name,/* old name
41330 + */
41331 + struct inode *new_dir, /* directory
41332 + * where @new
41333 + * is located
41334 + */
41335 + struct dentry *new_name /* new name
41336 + */)
41337 +{
41338 + reiser4_block_nr reserve;
41339 +
41340 + reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
41341 +
41342 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
41343 + return RETERR(-ENOSPC);
41344 +
41345 + return 0;
41346 +}
41347 +
41348 +/* check whether @old_inode and @new_inode can be moved within file system
41349 + * tree. This singles out attempts to rename pseudo-files, for example. */
41350 +static int can_rename(struct inode *old_dir, struct inode *old_inode,
41351 + struct inode *new_dir, struct inode *new_inode)
41352 +{
41353 + file_plugin *fplug;
41354 + dir_plugin *dplug;
41355 +
41356 + assert("nikita-3370", old_inode != NULL);
41357 +
41358 + dplug = inode_dir_plugin(new_dir);
41359 + fplug = inode_file_plugin(old_inode);
41360 +
41361 + if (dplug == NULL)
41362 + return RETERR(-ENOTDIR);
41363 + else if (new_dir->i_op->create == NULL)
41364 + return RETERR(-EPERM);
41365 + else if (!fplug->can_add_link(old_inode))
41366 + return RETERR(-EMLINK);
41367 + else if (new_inode != NULL) {
41368 + fplug = inode_file_plugin(new_inode);
41369 + if (fplug->can_rem_link != NULL &&
41370 + !fplug->can_rem_link(new_inode))
41371 + return RETERR(-EBUSY);
41372 + }
41373 + return 0;
41374 +}
41375 +
41376 +int reiser4_find_entry(struct inode *, struct dentry *, lock_handle * ,
41377 + znode_lock_mode, reiser4_dir_entry_desc *);
41378 +int reiser4_update_dir(struct inode *);
41379 +
41380 +/* this is common implementation of vfs's rename method of struct
41381 + inode_operations
41382 + See comments in the body.
41383 +
41384 +   It is arguable that this function can be made generic so that it
41385 + will be applicable to any kind of directory plugin that deals with
41386 + directories composed out of directory entries. The only obstacle
41387 + here is that we don't have any data-type to represent directory
41388 +   entry. This should be reconsidered when more than one different
41389 +   directory plugin is implemented.
41390 +*/
41391 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
41392 + * is located */ ,
41393 + struct dentry *old_name /* old name */ ,
41394 + struct inode *new_dir /* directory where @new
41395 + * is located */ ,
41396 + struct dentry *new_name/* new name */)
41397 +{
41398 + /* From `The Open Group Base Specifications Issue 6'
41399 +
41400 + If either the old or new argument names a symbolic link, rename()
41401 + shall operate on the symbolic link itself, and shall not resolve
41402 + the last component of the argument. If the old argument and the new
41403 + argument resolve to the same existing file, rename() shall return
41404 + successfully and perform no other action.
41405 +
41406 + [this is done by VFS: vfs_rename()]
41407 +
41408 + If the old argument points to the pathname of a file that is not a
41409 + directory, the new argument shall not point to the pathname of a
41410 + directory.
41411 +
41412 + [checked by VFS: vfs_rename->may_delete()]
41413 +
41414 + If the link named by the new argument exists, it shall
41415 + be removed and old renamed to new. In this case, a link named new
41416 + shall remain visible to other processes throughout the renaming
41417 + operation and refer either to the file referred to by new or old
41418 + before the operation began.
41419 +
41420 + [we should assure this]
41421 +
41422 + Write access permission is required for
41423 + both the directory containing old and the directory containing new.
41424 +
41425 + [checked by VFS: vfs_rename->may_delete(), may_create()]
41426 +
41427 + If the old argument points to the pathname of a directory, the new
41428 + argument shall not point to the pathname of a file that is not a
41429 + directory.
41430 +
41431 + [checked by VFS: vfs_rename->may_delete()]
41432 +
41433 + If the directory named by the new argument exists, it
41434 + shall be removed and old renamed to new. In this case, a link named
41435 + new shall exist throughout the renaming operation and shall refer
41436 + either to the directory referred to by new or old before the
41437 + operation began.
41438 +
41439 + [we should assure this]
41440 +
41441 + If new names an existing directory, it shall be
41442 + required to be an empty directory.
41443 +
41444 + [we should check this]
41445 +
41446 + If the old argument points to a pathname of a symbolic link, the
41447 + symbolic link shall be renamed. If the new argument points to a
41448 + pathname of a symbolic link, the symbolic link shall be removed.
41449 +
41450 + The new pathname shall not contain a path prefix that names
41451 + old. Write access permission is required for the directory
41452 + containing old and the directory containing new. If the old
41453 + argument points to the pathname of a directory, write access
41454 + permission may be required for the directory named by old, and, if
41455 + it exists, the directory named by new.
41456 +
41457 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
41458 +
41459 + If the link named by the new argument exists and the file's link
41460 + count becomes 0 when it is removed and no process has the file
41461 + open, the space occupied by the file shall be freed and the file
41462 + shall no longer be accessible. If one or more processes have the
41463 + file open when the last link is removed, the link shall be removed
41464 + before rename() returns, but the removal of the file contents shall
41465 + be postponed until all references to the file are closed.
41466 +
41467 + [iput() handles this, but we can do this manually, a la
41468 + reiser4_unlink()]
41469 +
41470 + Upon successful completion, rename() shall mark for update the
41471 + st_ctime and st_mtime fields of the parent directory of each file.
41472 +
41473 + [N/A]
41474 +
41475 + */
41476 + reiser4_context *ctx;
41477 + int result;
41478 + int is_dir; /* is @old_name directory */
41479 +
41480 + struct inode *old_inode;
41481 + struct inode *new_inode;
41482 + coord_t *new_coord;
41483 +
41484 + struct reiser4_dentry_fsdata *new_fsdata;
41485 + dir_plugin *dplug;
41486 + file_plugin *fplug;
41487 +
41488 + reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
41489 + lock_handle * new_lh, *dotdot_lh;
41490 + struct dentry *dotdot_name;
41491 + struct reiser4_dentry_fsdata *dataonstack;
41492 +
41493 + ctx = reiser4_init_context(old_dir->i_sb);
41494 + if (IS_ERR(ctx))
41495 + return PTR_ERR(ctx);
41496 +
41497 + old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
41498 + sizeof(*dotdot_name) + sizeof(*dataonstack),
41499 + reiser4_ctx_gfp_mask_get());
41500 + if (!old_entry) {
41501 + context_set_commit_async(ctx);
41502 + reiser4_exit_context(ctx);
41503 + return RETERR(-ENOMEM);
41504 + }
41505 +
41506 + new_entry = old_entry + 1;
41507 + dotdot_entry = old_entry + 2;
41508 + new_lh = (lock_handle *)(old_entry + 3);
41509 + dotdot_lh = new_lh + 1;
41510 + dotdot_name = (struct dentry *)(new_lh + 2);
41511 + dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1);
41512 +
41513 + assert("nikita-2318", old_dir != NULL);
41514 + assert("nikita-2319", new_dir != NULL);
41515 + assert("nikita-2320", old_name != NULL);
41516 + assert("nikita-2321", new_name != NULL);
41517 +
41518 + old_inode = old_name->d_inode;
41519 + new_inode = new_name->d_inode;
41520 +
41521 + dplug = inode_dir_plugin(old_dir);
41522 + fplug = NULL;
41523 +
41524 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
41525 + if (IS_ERR(new_fsdata)) {
41526 + kfree(old_entry);
41527 + context_set_commit_async(ctx);
41528 + reiser4_exit_context(ctx);
41529 + return PTR_ERR(new_fsdata);
41530 + }
41531 +
41532 + new_coord = &new_fsdata->dec.entry_coord;
41533 + coord_clear_iplug(new_coord);
41534 +
41535 + is_dir = S_ISDIR(old_inode->i_mode);
41536 +
41537 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41538 +
41539 +	/* if the target is an existing directory and is not empty, return an error.
41540 +
41541 + This check is done specifically, because is_dir_empty() requires
41542 +	   tree traversal and has to be done before locks are taken.
41543 + */
41544 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
41545 + kfree(old_entry);
41546 + context_set_commit_async(ctx);
41547 + reiser4_exit_context(ctx);
41548 + return RETERR(-ENOTEMPTY);
41549 + }
41550 +
41551 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
41552 + if (result != 0) {
41553 + kfree(old_entry);
41554 + context_set_commit_async(ctx);
41555 + reiser4_exit_context(ctx);
41556 + return result;
41557 + }
41558 +
41559 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
41560 + new_dir, new_name);
41561 + if (result != 0) {
41562 + kfree(old_entry);
41563 + context_set_commit_async(ctx);
41564 + reiser4_exit_context(ctx);
41565 + return result;
41566 + }
41567 +
41568 + init_lh(new_lh);
41569 +
41570 + /* find entry for @new_name */
41571 + result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
41572 + new_entry);
41573 +
41574 + if (IS_CBKERR(result)) {
41575 + done_lh(new_lh);
41576 + kfree(old_entry);
41577 + context_set_commit_async(ctx);
41578 + reiser4_exit_context(ctx);
41579 + return result;
41580 + }
41581 +
41582 + reiser4_seal_done(&new_fsdata->dec.entry_seal);
41583 +
41584 + /* add or replace name for @old_inode as @new_name */
41585 + if (new_inode != NULL) {
41586 + /* target (@new_name) exists. */
41587 + /* Not clear what to do with objects that are
41588 + both directories and files at the same time. */
41589 + if (result == CBK_COORD_FOUND) {
41590 + result = replace_name(old_inode,
41591 + new_dir,
41592 + new_inode, new_coord, new_lh);
41593 + if (result == 0)
41594 + fplug = inode_file_plugin(new_inode);
41595 + } else if (result == CBK_COORD_NOTFOUND) {
41596 + /* VFS told us that @new_name is bound to existing
41597 + inode, but we failed to find directory entry. */
41598 + warning("nikita-2324", "Target not found");
41599 + result = RETERR(-ENOENT);
41600 + }
41601 + } else {
41602 +		/* target (@new_name) doesn't exist. */
41603 + if (result == CBK_COORD_NOTFOUND)
41604 + result = add_name(old_inode,
41605 + new_dir,
41606 + new_name, new_coord, new_lh, is_dir);
41607 + else if (result == CBK_COORD_FOUND) {
41608 +			/* VFS told us that @new_name is a "negative" dentry,
41609 + but we found directory entry. */
41610 + warning("nikita-2331", "Target found unexpectedly");
41611 + result = RETERR(-EIO);
41612 + }
41613 + }
41614 +
41615 + assert("nikita-3462", ergo(result == 0,
41616 + old_inode->i_nlink >= 2 + !!is_dir));
41617 +
41618 + /* We are done with all modifications to the @new_dir, release lock on
41619 + node. */
41620 + done_lh(new_lh);
41621 +
41622 + if (fplug != NULL) {
41623 + /* detach @new_inode from name-space */
41624 + result = fplug->detach(new_inode, new_dir);
41625 + if (result != 0)
41626 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
41627 + (unsigned long long)get_inode_oid(new_inode),
41628 + result, possible_leak);
41629 + }
41630 +
41631 + if (new_inode != NULL)
41632 + reiser4_update_sd(new_inode);
41633 +
41634 + if (result == 0) {
41635 + old_entry->obj = old_inode;
41636 +
41637 + dplug->build_entry_key(old_dir,
41638 + &old_name->d_name, &old_entry->key);
41639 +
41640 + /* At this stage new name was introduced for
41641 + @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
41642 + counters were updated.
41643 +
41644 + We want to remove @old_name now. If @old_inode wasn't
41645 + directory this is simple.
41646 + */
41647 + result = dplug->rem_entry(old_dir, old_name, old_entry);
41648 + if (result != 0 && result != -ENOMEM) {
41649 + warning("nikita-2335",
41650 + "Cannot remove old name: %i", result);
41651 + } else {
41652 + result = reiser4_del_nlink(old_inode, old_dir, 0);
41653 + if (result != 0 && result != -ENOMEM) {
41654 + warning("nikita-2337",
41655 + "Cannot drop link on old: %i", result);
41656 + }
41657 + }
41658 +
41659 + if (result == 0 && is_dir) {
41660 + /* @old_inode is directory. We also have to update
41661 + dotdot entry. */
41662 + coord_t *dotdot_coord;
41663 +
41664 +			memset(dataonstack, 0, sizeof *dataonstack);
41665 +			memset(dotdot_entry, 0, sizeof *dotdot_entry);
41666 +			dotdot_entry->obj = old_dir;
41667 +			memset(dotdot_name, 0, sizeof *dotdot_name);
41668 + dotdot_name->d_name.name = "..";
41669 + dotdot_name->d_name.len = 2;
41670 + /*
41671 + * allocate ->d_fsdata on the stack to avoid using
41672 + * reiser4_get_dentry_fsdata(). Locking is not needed,
41673 + * because dentry is private to the current thread.
41674 + */
41675 + dotdot_name->d_fsdata = dataonstack;
41676 + init_lh(dotdot_lh);
41677 +
41678 + dotdot_coord = &dataonstack->dec.entry_coord;
41679 + coord_clear_iplug(dotdot_coord);
41680 +
41681 + result = reiser4_find_entry(old_inode, dotdot_name,
41682 + dotdot_lh, ZNODE_WRITE_LOCK,
41683 + dotdot_entry);
41684 + if (result == 0) {
41685 + /* replace_name() decreases i_nlink on
41686 + * @old_dir */
41687 + result = replace_name(new_dir,
41688 + old_inode,
41689 + old_dir,
41690 + dotdot_coord, dotdot_lh);
41691 + } else
41692 + result = RETERR(-EIO);
41693 + done_lh(dotdot_lh);
41694 + }
41695 + }
41696 + reiser4_update_dir(new_dir);
41697 + reiser4_update_dir(old_dir);
41698 + reiser4_update_sd(old_inode);
41699 + if (result == 0) {
41700 + file_plugin *fplug;
41701 +
41702 + if (new_inode != NULL) {
41703 + /* add safe-link for target file (in case we removed
41704 +			 * last reference to the poor fellow) */
41705 + fplug = inode_file_plugin(new_inode);
41706 + if (new_inode->i_nlink == 0)
41707 + result = safe_link_add(new_inode, SAFE_UNLINK);
41708 + }
41709 + }
41710 + kfree(old_entry);
41711 + context_set_commit_async(ctx);
41712 + reiser4_exit_context(ctx);
41713 + return result;
41714 +}
41715 +
41716 +#if 0
41717 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
41718 + * is located */ ,
41719 + struct dentry *old_name /* old name */ ,
41720 + struct inode *new_dir /* directory where @new
41721 + * is located */ ,
41722 + struct dentry *new_name/* new name */)
41723 +{
41724 + /* From `The Open Group Base Specifications Issue 6'
41725 +
41726 + If either the old or new argument names a symbolic link, rename()
41727 + shall operate on the symbolic link itself, and shall not resolve
41728 + the last component of the argument. If the old argument and the new
41729 + argument resolve to the same existing file, rename() shall return
41730 + successfully and perform no other action.
41731 +
41732 + [this is done by VFS: vfs_rename()]
41733 +
41734 + If the old argument points to the pathname of a file that is not a
41735 + directory, the new argument shall not point to the pathname of a
41736 + directory.
41737 +
41738 + [checked by VFS: vfs_rename->may_delete()]
41739 +
41740 + If the link named by the new argument exists, it shall
41741 + be removed and old renamed to new. In this case, a link named new
41742 + shall remain visible to other processes throughout the renaming
41743 + operation and refer either to the file referred to by new or old
41744 + before the operation began.
41745 +
41746 + [we should assure this]
41747 +
41748 + Write access permission is required for
41749 + both the directory containing old and the directory containing new.
41750 +
41751 + [checked by VFS: vfs_rename->may_delete(), may_create()]
41752 +
41753 + If the old argument points to the pathname of a directory, the new
41754 + argument shall not point to the pathname of a file that is not a
41755 + directory.
41756 +
41757 + [checked by VFS: vfs_rename->may_delete()]
41758 +
41759 + If the directory named by the new argument exists, it
41760 + shall be removed and old renamed to new. In this case, a link named
41761 + new shall exist throughout the renaming operation and shall refer
41762 + either to the directory referred to by new or old before the
41763 + operation began.
41764 +
41765 + [we should assure this]
41766 +
41767 + If new names an existing directory, it shall be
41768 + required to be an empty directory.
41769 +
41770 + [we should check this]
41771 +
41772 + If the old argument points to a pathname of a symbolic link, the
41773 + symbolic link shall be renamed. If the new argument points to a
41774 + pathname of a symbolic link, the symbolic link shall be removed.
41775 +
41776 + The new pathname shall not contain a path prefix that names
41777 + old. Write access permission is required for the directory
41778 + containing old and the directory containing new. If the old
41779 + argument points to the pathname of a directory, write access
41780 + permission may be required for the directory named by old, and, if
41781 + it exists, the directory named by new.
41782 +
41783 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
41784 +
41785 + If the link named by the new argument exists and the file's link
41786 + count becomes 0 when it is removed and no process has the file
41787 + open, the space occupied by the file shall be freed and the file
41788 + shall no longer be accessible. If one or more processes have the
41789 + file open when the last link is removed, the link shall be removed
41790 + before rename() returns, but the removal of the file contents shall
41791 + be postponed until all references to the file are closed.
41792 +
41793 + [iput() handles this, but we can do this manually, a la
41794 + reiser4_unlink()]
41795 +
41796 + Upon successful completion, rename() shall mark for update the
41797 + st_ctime and st_mtime fields of the parent directory of each file.
41798 +
41799 + [N/A]
41800 +
41801 + */
41802 + reiser4_context *ctx;
41803 + int result;
41804 + int is_dir; /* is @old_name directory */
41805 + struct inode *old_inode;
41806 + struct inode *new_inode;
41807 + reiser4_dir_entry_desc old_entry;
41808 + reiser4_dir_entry_desc new_entry;
41809 + coord_t *new_coord;
41810 + struct reiser4_dentry_fsdata *new_fsdata;
41811 + lock_handle new_lh;
41812 + dir_plugin *dplug;
41813 + file_plugin *fplug;
41814 +
41815 + ctx = reiser4_init_context(old_dir->i_sb);
41816 + if (IS_ERR(ctx))
41817 + return PTR_ERR(ctx);
41818 +
41819 + assert("nikita-2318", old_dir != NULL);
41820 + assert("nikita-2319", new_dir != NULL);
41821 + assert("nikita-2320", old_name != NULL);
41822 + assert("nikita-2321", new_name != NULL);
41823 +
41824 + old_inode = old_name->d_inode;
41825 + new_inode = new_name->d_inode;
41826 +
41827 + dplug = inode_dir_plugin(old_dir);
41828 + fplug = NULL;
41829 +
41830 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
41831 + if (IS_ERR(new_fsdata)) {
41832 + result = PTR_ERR(new_fsdata);
41833 + goto exit;
41834 + }
41835 +
41836 + new_coord = &new_fsdata->dec.entry_coord;
41837 + coord_clear_iplug(new_coord);
41838 +
41839 + is_dir = S_ISDIR(old_inode->i_mode);
41840 +
41841 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41842 +
41843 +	/* if the target is an existing directory and is not empty, return an error.
41844 +
41845 + This check is done specifically, because is_dir_empty() requires
41846 +	   tree traversal and has to be done before locks are taken.
41847 + */
41848 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
41849 + return RETERR(-ENOTEMPTY);
41850 +
41851 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
41852 + if (result != 0)
41853 + goto exit;
41854 +
41855 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
41856 + new_dir, new_name);
41857 + if (result != 0)
41858 + goto exit;
41859 +
41860 + init_lh(&new_lh);
41861 +
41862 + /* find entry for @new_name */
41863 + result = reiser4_find_entry(new_dir, new_name, &new_lh,
41864 + ZNODE_WRITE_LOCK, &new_entry);
41865 +
41866 + if (IS_CBKERR(result)) {
41867 + done_lh(&new_lh);
41868 + goto exit;
41869 + }
41870 +
41871 + reiser4_seal_done(&new_fsdata->dec.entry_seal);
41872 +
41873 + /* add or replace name for @old_inode as @new_name */
41874 + if (new_inode != NULL) {
41875 + /* target (@new_name) exists. */
41876 + /* Not clear what to do with objects that are
41877 + both directories and files at the same time. */
41878 + if (result == CBK_COORD_FOUND) {
41879 + result = replace_name(old_inode,
41880 + new_dir,
41881 + new_inode, new_coord, &new_lh);
41882 + if (result == 0)
41883 + fplug = inode_file_plugin(new_inode);
41884 + } else if (result == CBK_COORD_NOTFOUND) {
41885 + /* VFS told us that @new_name is bound to existing
41886 + inode, but we failed to find directory entry. */
41887 + warning("nikita-2324", "Target not found");
41888 + result = RETERR(-ENOENT);
41889 + }
41890 + } else {
41891 +		/* target (@new_name) doesn't exist. */
41892 + if (result == CBK_COORD_NOTFOUND)
41893 + result = add_name(old_inode,
41894 + new_dir,
41895 + new_name, new_coord, &new_lh, is_dir);
41896 + else if (result == CBK_COORD_FOUND) {
41897 +			/* VFS told us that @new_name is a "negative" dentry,
41898 + but we found directory entry. */
41899 + warning("nikita-2331", "Target found unexpectedly");
41900 + result = RETERR(-EIO);
41901 + }
41902 + }
41903 +
41904 + assert("nikita-3462", ergo(result == 0,
41905 + old_inode->i_nlink >= 2 + !!is_dir));
41906 +
41907 + /* We are done with all modifications to the @new_dir, release lock on
41908 + node. */
41909 + done_lh(&new_lh);
41910 +
41911 + if (fplug != NULL) {
41912 + /* detach @new_inode from name-space */
41913 + result = fplug->detach(new_inode, new_dir);
41914 + if (result != 0)
41915 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
41916 + (unsigned long long)get_inode_oid(new_inode),
41917 + result, possible_leak);
41918 + }
41919 +
41920 + if (new_inode != NULL)
41921 + reiser4_update_sd(new_inode);
41922 +
41923 + if (result == 0) {
41924 + memset(&old_entry, 0, sizeof old_entry);
41925 + old_entry.obj = old_inode;
41926 +
41927 + dplug->build_entry_key(old_dir,
41928 + &old_name->d_name, &old_entry.key);
41929 +
41930 + /* At this stage new name was introduced for
41931 + @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
41932 + counters were updated.
41933 +
41934 + We want to remove @old_name now. If @old_inode wasn't
41935 + directory this is simple.
41936 + */
41937 + result = dplug->rem_entry(old_dir, old_name, &old_entry);
41938 + /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
41939 + if (result != 0 && result != -ENOMEM) {
41940 + warning("nikita-2335",
41941 + "Cannot remove old name: %i", result);
41942 + } else {
41943 + result = reiser4_del_nlink(old_inode, old_dir, 0);
41944 + if (result != 0 && result != -ENOMEM) {
41945 + warning("nikita-2337",
41946 + "Cannot drop link on old: %i", result);
41947 + }
41948 + }
41949 +
41950 + if (result == 0 && is_dir) {
41951 + /* @old_inode is directory. We also have to update
41952 + dotdot entry. */
41953 + coord_t *dotdot_coord;
41954 + lock_handle dotdot_lh;
41955 + struct dentry dotdot_name;
41956 + reiser4_dir_entry_desc dotdot_entry;
41957 + struct reiser4_dentry_fsdata dataonstack;
41958 + struct reiser4_dentry_fsdata *fsdata;
41959 +
41960 + memset(&dataonstack, 0, sizeof dataonstack);
41961 + memset(&dotdot_entry, 0, sizeof dotdot_entry);
41962 + dotdot_entry.obj = old_dir;
41963 + memset(&dotdot_name, 0, sizeof dotdot_name);
41964 + dotdot_name.d_name.name = "..";
41965 + dotdot_name.d_name.len = 2;
41966 + /*
41967 + * allocate ->d_fsdata on the stack to avoid using
41968 + * reiser4_get_dentry_fsdata(). Locking is not needed,
41969 + * because dentry is private to the current thread.
41970 + */
41971 + dotdot_name.d_fsdata = &dataonstack;
41972 + init_lh(&dotdot_lh);
41973 +
41974 + fsdata = &dataonstack;
41975 + dotdot_coord = &fsdata->dec.entry_coord;
41976 + coord_clear_iplug(dotdot_coord);
41977 +
41978 + result = reiser4_find_entry(old_inode,
41979 + &dotdot_name,
41980 + &dotdot_lh,
41981 + ZNODE_WRITE_LOCK,
41982 + &dotdot_entry);
41983 + if (result == 0) {
41984 + /* replace_name() decreases i_nlink on
41985 + * @old_dir */
41986 + result = replace_name(new_dir,
41987 + old_inode,
41988 + old_dir,
41989 + dotdot_coord, &dotdot_lh);
41990 + } else
41991 + result = RETERR(-EIO);
41992 + done_lh(&dotdot_lh);
41993 + }
41994 + }
41995 + reiser4_update_dir(new_dir);
41996 + reiser4_update_dir(old_dir);
41997 + reiser4_update_sd(old_inode);
41998 + if (result == 0) {
41999 + file_plugin *fplug;
42000 +
42001 + if (new_inode != NULL) {
42002 + /* add safe-link for target file (in case we removed
42003 +			 * last reference to the poor fellow) */
42004 + fplug = inode_file_plugin(new_inode);
42005 + if (new_inode->i_nlink == 0)
42006 + result = safe_link_add(new_inode, SAFE_UNLINK);
42007 + }
42008 + }
42009 +exit:
42010 + context_set_commit_async(ctx);
42011 + reiser4_exit_context(ctx);
42012 + return result;
42013 +}
42014 +#endif
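
reiser4_rename_common() above keeps its kernel-stack footprint small by taking a single kzalloc() and carving three reiser4_dir_entry_desc objects, two lock handles, a struct dentry and a struct reiser4_dentry_fsdata out of the one buffer. A self-contained sketch of the same carving pattern, with placeholder struct names standing in for the reiser4 types:

#include <linux/slab.h>

struct hdr { int n; };			/* placeholders, not reiser4 types */
struct tail { long v; };

static int carve_example(gfp_t gfp)
{
	struct hdr *h;
	struct tail *t;

	/* one allocation: three hdrs followed by two tails */
	h = kzalloc(3 * sizeof(*h) + 2 * sizeof(*t), gfp);
	if (h == NULL)
		return -ENOMEM;
	t = (struct tail *)(h + 3);	/* tails start right after the hdrs */
	t[0].v = 1;
	t[1].v = 2;
	kfree(h);			/* one kfree() releases everything */
	return 0;
}

The pattern is only safe when every carved object's alignment requirement is met at its carve point; kzalloc() returns maximally aligned memory, and the rename code above relies on the same property.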
42015 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/acl.h linux-2.6.33/fs/reiser4/plugin/item/acl.h
42016 --- linux-2.6.33.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 01:00:00.000000000 +0100
42017 +++ linux-2.6.33/fs/reiser4/plugin/item/acl.h 2010-03-04 19:33:22.000000000 +0100
42018 @@ -0,0 +1,66 @@
42019 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
42020 +
42021 +/* Directory entry. */
42022 +
42023 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
42024 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
42025 +
42026 +#include "../../forward.h"
42027 +#include "../../dformat.h"
42028 +#include "../../kassign.h"
42029 +#include "../../key.h"
42030 +
42031 +#include <linux/fs.h>
42032 +#include <linux/dcache.h> /* for struct dentry */
42033 +
42034 +typedef struct directory_entry_format {
42035 + /* key of object stat-data. It's not necessary to store whole
42036 + key here, because it's always key of stat-data, so minor
42037 + packing locality and offset can be omitted here. But this
42038 + relies on particular key allocation scheme for stat-data, so,
42039 + for extensibility sake, whole key can be stored here.
42040 +
42041 + We store key as array of bytes, because we don't want 8-byte
42042 + alignment of dir entries.
42043 + */
42044 + obj_key_id id;
42045 + /* file name. Null terminated string. */
42046 + d8 name[0];
42047 +} directory_entry_format;
42048 +
42049 +void print_de(const char *prefix, coord_t * coord);
42050 +int extract_key_de(const coord_t * coord, reiser4_key * key);
42051 +int update_key_de(const coord_t * coord, const reiser4_key * key,
42052 + lock_handle * lh);
42053 +char *extract_name_de(const coord_t * coord, char *buf);
42054 +unsigned extract_file_type_de(const coord_t * coord);
42055 +int add_entry_de(struct inode *dir, coord_t * coord,
42056 + lock_handle * lh, const struct dentry *name,
42057 + reiser4_dir_entry_desc * entry);
42058 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
42059 + lock_handle * lh, reiser4_dir_entry_desc * entry);
42060 +int max_name_len_de(const struct inode *dir);
42061 +
42062 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
42063 +
42064 +char *extract_dent_name(const coord_t * coord,
42065 + directory_entry_format * dent, char *buf);
42066 +
42067 +#if REISER4_LARGE_KEY
42068 +#define DE_NAME_BUF_LEN (24)
42069 +#else
42070 +#define DE_NAME_BUF_LEN (16)
42071 +#endif
42072 +
42073 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
42074 +#endif
42075 +
42076 +/* Make Linus happy.
42077 + Local variables:
42078 + c-indentation-style: "K&R"
42079 + mode-name: "LC"
42080 + c-basic-offset: 8
42081 + tab-width: 8
42082 + fill-column: 120
42083 + End:
42084 +*/
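
directory_entry_format above ends with "d8 name[0]", the zero-length-array idiom this kernel era used for a variable-length in-line trailer: sizeof covers only the fixed part, and the name bytes follow directly in the item body. A small illustrative sketch of sizing and filling such a structure (the type and helper are hypothetical, not reiser4 code):

#include <linux/slab.h>
#include <linux/string.h>

struct vle {
	unsigned char id[16];	/* fixed header, like obj_key_id above */
	char name[0];		/* NUL-terminated name stored in-line */
};

static struct vle *vle_make(const char *name, gfp_t gfp)
{
	/* sizeof(*e) counts only the 16-byte header; the trailer is
	 * sized explicitly from the name being stored */
	struct vle *e = kzalloc(sizeof(*e) + strlen(name) + 1, gfp);

	if (e != NULL)
		strcpy(e->name, name);
	return e;
}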
42085 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.33/fs/reiser4/plugin/item/blackbox.c
42086 --- linux-2.6.33.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 01:00:00.000000000 +0100
42087 +++ linux-2.6.33/fs/reiser4/plugin/item/blackbox.c 2010-03-04 19:33:22.000000000 +0100
42088 @@ -0,0 +1,142 @@
42089 +/* Copyright 2003 by Hans Reiser, licensing governed by
42090 + * reiser4/README */
42091 +
42092 +/* Black box item implementation */
42093 +
42094 +#include "../../forward.h"
42095 +#include "../../debug.h"
42096 +#include "../../dformat.h"
42097 +#include "../../kassign.h"
42098 +#include "../../coord.h"
42099 +#include "../../tree.h"
42100 +#include "../../lock.h"
42101 +
42102 +#include "blackbox.h"
42103 +#include "item.h"
42104 +#include "../plugin.h"
42105 +
42106 +int
42107 +store_black_box(reiser4_tree * tree,
42108 + const reiser4_key * key, void *data, int length)
42109 +{
42110 + int result;
42111 + reiser4_item_data idata;
42112 + coord_t coord;
42113 + lock_handle lh;
42114 +
42115 + memset(&idata, 0, sizeof idata);
42116 +
42117 + idata.data = data;
42118 + idata.user = 0;
42119 + idata.length = length;
42120 + idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
42121 +
42122 + init_lh(&lh);
42123 + result = insert_by_key(tree, key,
42124 + &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
42125 +
42126 + assert("nikita-3413",
42127 + ergo(result == 0,
42128 + WITH_COORD(&coord,
42129 + item_length_by_coord(&coord) == length)));
42130 +
42131 + done_lh(&lh);
42132 + return result;
42133 +}
42134 +
42135 +int
42136 +load_black_box(reiser4_tree * tree,
42137 + reiser4_key * key, void *data, int length, int exact)
42138 +{
42139 + int result;
42140 + coord_t coord;
42141 + lock_handle lh;
42142 +
42143 + init_lh(&lh);
42144 + result = coord_by_key(tree, key,
42145 + &coord, &lh, ZNODE_READ_LOCK,
42146 + exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
42147 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
42148 +
42149 + if (result == 0) {
42150 + int ilen;
42151 +
42152 + result = zload(coord.node);
42153 + if (result == 0) {
42154 + ilen = item_length_by_coord(&coord);
42155 + if (ilen <= length) {
42156 + memcpy(data, item_body_by_coord(&coord), ilen);
42157 + unit_key_by_coord(&coord, key);
42158 + } else if (exact) {
42159 + /*
42160 + * item is larger than buffer provided by the
42161 + * user. Only issue a warning if @exact is
42162 + * set. If @exact is false, we are iterating
42163 + * over all safe-links and here we are reaching
42164 + * the end of the iteration.
42165 + */
42166 + warning("nikita-3415",
42167 + "Wrong black box length: %i > %i",
42168 + ilen, length);
42169 + result = RETERR(-EIO);
42170 + }
42171 + zrelse(coord.node);
42172 + }
42173 + }
42174 +
42175 + done_lh(&lh);
42176 + return result;
42177 +
42178 +}
42179 +
42180 +int
42181 +update_black_box(reiser4_tree * tree,
42182 + const reiser4_key * key, void *data, int length)
42183 +{
42184 + int result;
42185 + coord_t coord;
42186 + lock_handle lh;
42187 +
42188 + init_lh(&lh);
42189 + result = coord_by_key(tree, key,
42190 + &coord, &lh, ZNODE_READ_LOCK,
42191 + FIND_EXACT,
42192 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
42193 + if (result == 0) {
42194 + int ilen;
42195 +
42196 + result = zload(coord.node);
42197 + if (result == 0) {
42198 + ilen = item_length_by_coord(&coord);
42199 + if (length <= ilen) {
42200 + memcpy(item_body_by_coord(&coord), data,
42201 + length);
42202 + } else {
42203 + warning("nikita-3437",
42204 + "Wrong black box length: %i < %i",
42205 + ilen, length);
42206 + result = RETERR(-EIO);
42207 + }
42208 + zrelse(coord.node);
42209 + }
42210 + }
42211 +
42212 + done_lh(&lh);
42213 + return result;
42214 +
42215 +}
42216 +
42217 +int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
42218 +{
42219 + return reiser4_cut_tree(tree, key, key, NULL, 1);
42220 +}
42221 +
42222 +/* Make Linus happy.
42223 + Local variables:
42224 + c-indentation-style: "K&R"
42225 + mode-name: "LC"
42226 + c-basic-offset: 8
42227 + tab-width: 8
42228 + fill-column: 120
42229 + End:
42230 +*/
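
Taken together, the four routines above form a small fixed-size record store keyed by reiser4_key. A hedged usage sketch -- error handling is abbreviated, and the tree and key are placeholders for values a real caller (such as the safe-link code) already holds:

static int blackbox_roundtrip(reiser4_tree * tree, reiser4_key * key)
{
	char payload[16] = "fixed blob";
	char readback[16];
	int result;

	result = store_black_box(tree, key, payload, sizeof payload);
	if (result != 0)
		return result;
	/* exact != 0: warn and fail unless the stored item fits */
	result = load_black_box(tree, key, readback, sizeof readback, 1);
	if (result != 0)
		return result;
	/* in-place rewrite is allowed for same-or-shorter lengths */
	result = update_black_box(tree, key, payload, sizeof payload);
	if (result != 0)
		return result;
	/* cut the item out of the tree again */
	return kill_black_box(tree, key);
}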
42231 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.33/fs/reiser4/plugin/item/blackbox.h
42232 --- linux-2.6.33.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 01:00:00.000000000 +0100
42233 +++ linux-2.6.33/fs/reiser4/plugin/item/blackbox.h 2010-03-04 19:33:22.000000000 +0100
42234 @@ -0,0 +1,33 @@
42235 +/* Copyright 2003 by Hans Reiser, licensing governed by
42236 + * reiser4/README */
42237 +
42238 +/* "Black box" entry to fixed-width contain user supplied data */
42239 +
42240 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
42241 +#define __FS_REISER4_BLACK_BOX_H__
42242 +
42243 +#include "../../forward.h"
42244 +#include "../../dformat.h"
42245 +#include "../../kassign.h"
42246 +#include "../../key.h"
42247 +
42248 +extern int store_black_box(reiser4_tree * tree,
42249 + const reiser4_key * key, void *data, int length);
42250 +extern int load_black_box(reiser4_tree * tree,
42251 + reiser4_key * key, void *data, int length, int exact);
42252 +extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
42253 +extern int update_black_box(reiser4_tree * tree,
42254 + const reiser4_key * key, void *data, int length);
42255 +
42256 +/* __FS_REISER4_BLACK_BOX_H__ */
42257 +#endif
42258 +
42259 +/* Make Linus happy.
42260 + Local variables:
42261 + c-indentation-style: "K&R"
42262 + mode-name: "LC"
42263 + c-basic-offset: 8
42264 + tab-width: 8
42265 + fill-column: 120
42266 + End:
42267 +*/
42268 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/cde.c linux-2.6.33/fs/reiser4/plugin/item/cde.c
42269 --- linux-2.6.33.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 01:00:00.000000000 +0100
42270 +++ linux-2.6.33/fs/reiser4/plugin/item/cde.c 2010-03-04 19:33:22.000000000 +0100
42271 @@ -0,0 +1,1008 @@
42272 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
42273 +
42274 +/* Directory entry implementation */
42275 +
42276 +/* DESCRIPTION:
42277 +
42278 + This is "compound" directory item plugin implementation. This directory
42279 + item type is compound (as opposed to the "simple directory item" in
42280 + fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
42281 + entries.
42282 +
42283 + The reason behind this decision is disk space efficiency: all directory
42284 + entries inside the same directory have identical fragment in their
42285 + keys. This, of course, depends on key assignment policy. In our default key
42286 + assignment policy, all directory entries have the same locality which is
42287 + equal to the object id of their directory.
42288 +
42289 + Composing directory item out of several directory entries for the same
42290 + directory allows us to store said key fragment only once. That is, this is
42291 + some ad hoc form of key compression (stem compression) that is implemented
42292 + here, because general key compression is not supposed to be implemented in
42293 + v4.0.
42294 +
42295 +   Another decision that was made regarding all directory item plugins is
42296 +   that they will store entry keys unaligned. This is for the sake of disk
42297 +   space efficiency again.
42298 +
42299 +   It should be noted that storing keys unaligned increases CPU consumption,
42300 + at least on some architectures.
42301 +
42302 + Internal on-disk structure of the compound directory item is the following:
42303 +
42304 + HEADER cde_item_format. Here number of entries is stored.
42305 + ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
42306 + ENTRY_HEADER_1 offset of entry body are stored.
42307 + ENTRY_HEADER_2 (basically two last parts of key)
42308 + ...
42309 + ENTRY_HEADER_N
42310 + ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
42311 + ENTRY_BODY_1 NUL-terminated name are stored.
42312 +   ENTRY_BODY_2			(part of the stat-data key in the
42313 +					sense that since all SDs have
42314 + zero offset, this offset is not
42315 + stored on disk).
42316 + ...
42317 + ENTRY_BODY_N
42318 +
42319 +   When it comes to balancing, each directory entry in a compound directory
42320 +   item is a unit, that is, something that can be cut from one item and pasted
42321 +   into another item of the same type. Handling of unit cut and paste is the
42322 +   major reason for the complexity of the code below.
42323 +
42324 +*/
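/*
 * Illustrative only -- the real declarations live in cde.h. This mirrors
 * the on-disk shape drawn above: a counted array of fixed-size entry
 * headers followed by variable-length bodies, each body found through
 * its header's offset (all fields little-endian and unaligned).
 */
#if 0	/* sketch, not compiled */
typedef struct cde_item_format_sketch {
	d16 num_of_entries;		/* HEADER */
	cde_unit_header entry[0];	/* ENTRY_HEADER_0..N: key part + body offset */
	/* ENTRY_BODY_0..N (directory_entry_format) follow at entry[i].offset */
} cde_item_format_sketch;
#endif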
42325 +
42326 +#include "../../forward.h"
42327 +#include "../../debug.h"
42328 +#include "../../dformat.h"
42329 +#include "../../kassign.h"
42330 +#include "../../key.h"
42331 +#include "../../coord.h"
42332 +#include "sde.h"
42333 +#include "cde.h"
42334 +#include "item.h"
42335 +#include "../node/node.h"
42336 +#include "../plugin.h"
42337 +#include "../../znode.h"
42338 +#include "../../carry.h"
42339 +#include "../../tree.h"
42340 +#include "../../inode.h"
42341 +
42342 +#include <linux/fs.h> /* for struct inode */
42343 +#include <linux/dcache.h> /* for struct dentry */
42344 +#include <linux/quotaops.h>
42345 +
42346 +#if 0
42347 +#define CHECKME(coord) \
42348 +({ \
42349 + const char *message; \
42350 + coord_t dup; \
42351 + \
42352 + coord_dup_nocheck(&dup, (coord)); \
42353 + dup.unit_pos = 0; \
42354 + assert("nikita-2871", cde_check(&dup, &message) == 0); \
42355 +})
42356 +#else
42357 +#define CHECKME(coord) noop
42358 +#endif
42359 +
42360 +/* return body of compound directory item at @coord */
42361 +static inline cde_item_format *formatted_at(const coord_t * coord)
42362 +{
42363 + assert("nikita-1282", coord != NULL);
42364 + return item_body_by_coord(coord);
42365 +}
42366 +
42367 +/* return entry header at @coord */
42368 +static inline cde_unit_header *header_at(const coord_t *
42369 + coord /* coord of item */ ,
42370 + int idx /* index of unit */ )
42371 +{
42372 + assert("nikita-1283", coord != NULL);
42373 + return &formatted_at(coord)->entry[idx];
42374 +}
42375 +
42376 +/* return number of units in compound directory item at @coord */
42377 +static int units(const coord_t * coord /* coord of item */ )
42378 +{
42379 + return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
42380 +}
42381 +
42382 +/* return offset of the body of @idx-th entry in @coord */
42383 +static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
42384 + int idx /* index of unit */ )
42385 +{
42386 + if (idx < units(coord))
42387 + return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
42388 + else if (idx == units(coord))
42389 + return item_length_by_coord(coord);
42390 + else
42391 + impossible("nikita-1308", "Wrong idx");
42392 + return 0;
42393 +}
42394 +
42395 +/* set offset of the body of @idx-th entry in @coord */
42396 +static void set_offset(const coord_t * coord /* coord of item */ ,
42397 + int idx /* index of unit */ ,
42398 + unsigned int offset /* new offset */ )
42399 +{
42400 + put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
42401 +}
42402 +
42403 +static void adj_offset(const coord_t * coord /* coord of item */ ,
42404 + int idx /* index of unit */ ,
42405 + int delta /* offset change */ )
42406 +{
42407 + d16 *doffset;
42408 + __u16 offset;
42409 +
42410 + doffset = &header_at(coord, idx)->offset;
42411 + offset = le16_to_cpu(get_unaligned(doffset));
42412 + offset += delta;
42413 + put_unaligned(cpu_to_le16((__u16) offset), doffset);
42414 +}
42415 +
42416 +/* return pointer to @offset-th byte from the beginning of @coord */
42417 +static char *address(const coord_t * coord /* coord of item */ ,
42418 + int offset)
42419 +{
42420 + return ((char *)item_body_by_coord(coord)) + offset;
42421 +}
42422 +
42423 +/* return pointer to the body of @idx-th entry in @coord */
42424 +static directory_entry_format *entry_at(const coord_t * coord /* coord of
42425 + * item */ ,
42426 + int idx /* index of unit */ )
42427 +{
42428 + return (directory_entry_format *) address(coord,
42429 + (int)offset_of(coord, idx));
42430 +}
42431 +
42432 +/* return the index of the unit referenced by @coord */
42433 +static int idx_of(const coord_t * coord /* coord of item */ )
42434 +{
42435 + assert("nikita-1285", coord != NULL);
42436 + return coord->unit_pos;
42437 +}
42438 +
42439 +/* find position where entry with @entry_key would be inserted into @coord */
42440 +static int find(const coord_t * coord /* coord of item */ ,
42441 + const reiser4_key * entry_key /* key to look for */ ,
42442 + cmp_t * last /* result of last comparison */ )
42443 +{
42444 + int entries;
42445 +
42446 + int left;
42447 + int right;
42448 +
42449 + cde_unit_header *header;
42450 +
42451 + assert("nikita-1295", coord != NULL);
42452 + assert("nikita-1296", entry_key != NULL);
42453 + assert("nikita-1297", last != NULL);
42454 +
42455 + entries = units(coord);
42456 + left = 0;
42457 + right = entries - 1;
42458 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
42459 + int median;
42460 +
42461 + median = (left + right) >> 1;
42462 +
42463 + header = header_at(coord, median);
42464 + *last = de_id_key_cmp(&header->hash, entry_key);
42465 + switch (*last) {
42466 + case LESS_THAN:
42467 + left = median;
42468 + break;
42469 + case GREATER_THAN:
42470 + right = median;
42471 + break;
42472 + case EQUAL_TO:{
42473 + do {
42474 + median--;
42475 + header--;
42476 + } while (median >= 0 &&
42477 + de_id_key_cmp(&header->hash,
42478 + entry_key) == EQUAL_TO);
42479 + return median + 1;
42480 + }
42481 + }
42482 + }
42483 + header = header_at(coord, left);
42484 + for (; left < entries; ++left, ++header) {
42485 + prefetch(header + 1);
42486 + *last = de_id_key_cmp(&header->hash, entry_key);
42487 + if (*last != LESS_THAN)
42488 + break;
42489 + }
42490 + if (left < entries)
42491 + return left;
42492 + else
42493 + return RETERR(-ENOENT);
42494 +
42495 +}
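+
+/* An illustrative, never-compiled sketch (hence #if 0, as for CHECKME
+ above) of how find() is meant to be consumed; find_usage_sketch is a
+ hypothetical name: */
+#if 0
+static int find_usage_sketch(const coord_t * coord,
+ const reiser4_key * entry_key)
+{
+ cmp_t last;
+ int pos;
+
+ pos = find(coord, entry_key, &last);
+ if (pos < 0)
+ return pos; /* -ENOENT: every unit is less than @entry_key */
+ /* otherwise @pos is the leftmost unit whose hash matches
+ (last == EQUAL_TO), or the slot where such a unit would be
+ inserted (last == GREATER_THAN) */
+ return pos;
+}
+#endif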
42496 +
42497 +/* expand @coord so as to accommodate the insertion of @no new entries
42498 + starting at @pos. @size is the size of the existing item contents. */
42499 +static int expand_item(const coord_t * coord /* coord of item */ ,
42500 + int pos /* unit position */ , int no /* number of new
42501 + * units */ ,
42502 + int size /* size of the existing item contents */ ,
42503 + unsigned int data_size /* space reserved in the item for
42504 + * the new units' bodies */ )
42505 +{
42506 + int entries;
42507 + cde_unit_header *header;
42508 + char *dent;
42509 + int i;
42510 +
42511 + assert("nikita-1310", coord != NULL);
42512 + assert("nikita-1311", pos >= 0);
42513 + assert("nikita-1312", no > 0);
42514 + assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
42515 + assert("nikita-1343",
42516 + item_length_by_coord(coord) >=
42517 + (int)(size + data_size + no * sizeof *header));
42518 +
42519 + entries = units(coord);
42520 +
42521 + if (pos == entries)
42522 + dent = address(coord, size);
42523 + else
42524 + dent = (char *)entry_at(coord, pos);
42525 + /* place where new header will be in */
42526 + header = header_at(coord, pos);
42527 + /* free space for new entry headers */
42528 + memmove(header + no, header,
42529 + (unsigned)(address(coord, size) - (char *)header));
42530 + /* if adding to the end, initialise the first new header */
42531 + if (pos == entries) {
42532 + set_offset(coord, pos, (unsigned)size);
42533 + }
42534 +
42535 + /* adjust entry pointer and size */
42536 + dent = dent + no * sizeof *header;
42537 + size += no * sizeof *header;
42538 + /* free space for new entries */
42539 + memmove(dent + data_size, dent,
42540 + (unsigned)(address(coord, size) - dent));
42541 +
42542 + /* increase counter */
42543 + entries += no;
42544 + put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
42545 +
42546 + /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
42547 + bytes. */
42548 + for (i = 0; i <= pos; ++i)
42549 + adj_offset(coord, i, no * sizeof *header);
42550 + /* [ pos + no ... +\infty ) entries were shifted by ( no *
42551 + sizeof *header + data_size ) bytes */
42552 + for (i = pos + no; i < entries; ++i)
42553 + adj_offset(coord, i, no * sizeof *header + data_size);
42554 + return 0;
42555 +}
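+
+/* Worked example with hypothetical numbers: expand_item(coord, pos = 1,
+ no = 1, size = 100, data_size = 16) on an item holding 2 entries in
+ 100 bytes. The first memmove shifts header 1 and all bodies right by
+ sizeof *header; the second shifts the bodies of old entries 1..
+ right by another 16 bytes, opening the gap for the new body. The
+ first adj_offset() loop adds sizeof *header to offsets 0..1 (slot 1
+ is the new unit, whose inherited offset then points exactly at the
+ gap); the second adds sizeof *header + 16 to the offset of the
+ shifted old entry 1, now living in slot 2. */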
42556 +
42557 +/* make room in the item for insertion of the new @entry */
42558 +static int expand(const coord_t * coord /* coord of item */ ,
42559 + struct cde_entry * entry /* entry to insert */ ,
42560 + int len /* length of @entry data */ ,
42561 + int *pos /* position to insert */ ,
42562 + reiser4_dir_entry_desc * dir_entry /* parameters for new
42563 + * entry */ )
42564 +{
42565 + cmp_t cmp_res;
42566 + int datasize;
42567 +
42568 + *pos = find(coord, &dir_entry->key, &cmp_res);
42569 + if (*pos < 0)
42570 + *pos = units(coord);
42571 +
42572 + datasize = sizeof(directory_entry_format);
42573 + if (is_longname(entry->name->name, entry->name->len))
42574 + datasize += entry->name->len + 1;
42575 +
42576 + expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
42577 + datasize);
42578 + return 0;
42579 +}
42580 +
42581 +/* paste body of @entry into item */
42582 +static int paste_entry(const coord_t * coord /* coord of item */ ,
42583 + struct cde_entry * entry /* new entry */ ,
42584 + int pos /* position to insert */ ,
42585 + reiser4_dir_entry_desc * dir_entry /* parameters for
42586 + * new entry */ )
42587 +{
42588 + cde_unit_header *header;
42589 + directory_entry_format *dent;
42590 + const char *name;
42591 + int len;
42592 +
42593 + header = header_at(coord, pos);
42594 + dent = entry_at(coord, pos);
42595 +
42596 + build_de_id_by_key(&dir_entry->key, &header->hash);
42597 + build_inode_key_id(entry->obj, &dent->id);
42598 + /* AUDIT: unsafe strcpy() operation! It should be replaced with
42599 + the much less CPU-hungry
42600 + memcpy((char *)dent->name, entry->name->name, entry->name->len);
42601 +
42602 + More importantly, there should be a way to figure out the
42603 + amount of space in dent->name and to check that we are not
42604 + going to overwrite more than we are supposed to. */
42605 + name = entry->name->name;
42606 + len = entry->name->len;
42607 + if (is_longname(name, len)) {
42608 + strcpy((unsigned char *)dent->name, name);
42609 + put_unaligned(0, &dent->name[len]);
42610 + }
42611 + return 0;
42612 +}
42613 +
42614 +/* estimate how much space is necessary in the item to insert/paste the
42615 + set of entries described in @data. */
42616 +int estimate_cde(const coord_t * coord /* coord of item */ ,
42617 + const reiser4_item_data * data /* parameters for new item */ )
42618 +{
42619 + struct cde_entry_data *e;
42620 + int result;
42621 + int i;
42622 +
42623 + e = (struct cde_entry_data *) data->data;
42624 +
42625 + assert("nikita-1288", e != NULL);
42626 + assert("nikita-1289", e->num_of_entries >= 0);
42627 +
42628 + if (coord == NULL)
42629 + /* insert */
42630 + result = sizeof(cde_item_format);
42631 + else
42632 + /* paste */
42633 + result = 0;
42634 +
42635 + result += e->num_of_entries *
42636 + (sizeof(cde_unit_header) + sizeof(directory_entry_format));
42637 + for (i = 0; i < e->num_of_entries; ++i) {
42638 + const char *name;
42639 + int len;
42640 +
42641 + name = e->entry[i].name->name;
42642 + len = e->entry[i].name->len;
42643 + assert("nikita-2054", strlen(name) == len);
42644 + if (is_longname(name, len))
42645 + result += len + 1;
42646 + }
42647 + ((reiser4_item_data *) data)->length = result;
42648 + return result;
42649 +}
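+
+/* Worked example with hypothetical sizes: pasting two entries named "a"
+ (short) and a 40-character name (long, per is_longname()) costs
+ 2 * (sizeof(cde_unit_header) + sizeof(directory_entry_format))
+ plus 40 + 1 bytes for the long name's NUL-terminated body; a fresh
+ insert would add sizeof(cde_item_format) on top for the entry
+ counter. */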
42650 +
42651 +/* ->nr_units() method for this item plugin. */
42652 +pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
42653 +{
42654 + return units(coord);
42655 +}
42656 +
42657 +/* ->unit_key() method for this item plugin. */
42658 +reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
42659 + reiser4_key * key /* resulting key */ )
42660 +{
42661 + assert("nikita-1452", coord != NULL);
42662 + assert("nikita-1345", idx_of(coord) < units(coord));
42663 + assert("nikita-1346", key != NULL);
42664 +
42665 + item_key_by_coord(coord, key);
42666 + extract_key_from_de_id(extract_dir_id_from_key(key),
42667 + &header_at(coord, idx_of(coord))->hash, key);
42668 + return key;
42669 +}
42670 +
42671 +/* mergeable_cde(): implementation of ->mergeable() item method.
42672 +
42673 + Two directory items are mergeable iff they are from the same
42674 + directory. It is that simple.
42675 +
42676 +*/
42677 +int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
42678 + const coord_t * p2 /* coord of second item */ )
42679 +{
42680 + reiser4_key k1;
42681 + reiser4_key k2;
42682 +
42683 + assert("nikita-1339", p1 != NULL);
42684 + assert("nikita-1340", p2 != NULL);
42685 +
42686 + return
42687 + (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
42688 + (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
42689 + extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
42690 +
42691 +}
42692 +
42693 +/* ->max_key_inside() method for this item plugin. */
42694 +reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
42695 + reiser4_key * result /* resulting key */ )
42696 +{
42697 + assert("nikita-1342", coord != NULL);
42698 +
42699 + item_key_by_coord(coord, result);
42700 + set_key_ordering(result, get_key_ordering(reiser4_max_key()));
42701 + set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
42702 + set_key_offset(result, get_key_offset(reiser4_max_key()));
42703 + return result;
42704 +}
42705 +
42706 +/* @data contains data which is to be put into the tree */
42707 +int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
42708 + const reiser4_key * key /* key to check */ ,
42709 + const reiser4_item_data * data /* parameters of new
42710 + * item/unit being
42711 + * created */ )
42712 +{
42713 + reiser4_key item_key;
42714 +
42715 + /* FIXME-VS: do not rely on anything but iplug field of @data. Only
42716 + data->iplug is initialized */
42717 + assert("vs-457", data && data->iplug);
42718 +/* assert( "vs-553", data -> user == 0 );*/
42719 + item_key_by_coord(coord, &item_key);
42720 +
42721 + return (item_plugin_by_coord(coord) == data->iplug) &&
42722 + (extract_dir_id_from_key(&item_key) ==
42723 + extract_dir_id_from_key(key));
42724 +}
42725 +
42726 +#if REISER4_DEBUG
42727 +/* cde_check: ->check() method for compound directory items
42728 +
42729 + Used for debugging. Every item plugin should implement here the most
42730 + complete consistency check of the item that its author can
42731 + construct.
42732 +*/
42733 +int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
42734 + const char **error /* where to store error message */)
42735 +{
42736 + int i;
42737 + int result;
42738 + char *item_start;
42739 + char *item_end;
42740 + reiser4_key key;
42741 +
42742 + coord_t c;
42743 +
42744 + assert("nikita-1357", coord != NULL);
42745 + assert("nikita-1358", error != NULL);
42746 +
42747 + if (!ergo(coord->item_pos != 0,
42748 + is_dot_key(item_key_by_coord(coord, &key)))) {
42749 + *error = "CDE doesn't start with dot";
42750 + return -1;
42751 + }
42752 + item_start = item_body_by_coord(coord);
42753 + item_end = item_start + item_length_by_coord(coord);
42754 +
42755 + coord_dup(&c, coord);
42756 + result = 0;
42757 + for (i = 0; i < units(coord); ++i) {
42758 + directory_entry_format *entry;
42759 +
42760 + if ((char *)(header_at(coord, i) + 1) >
42761 + item_end - units(coord) * sizeof *entry) {
42762 + *error = "CDE header is out of bounds";
42763 + result = -1;
42764 + break;
42765 + }
42766 + entry = entry_at(coord, i);
42767 + if ((char *)entry < item_start + sizeof(cde_item_format)) {
42768 + *error = "CDE entry is too low";
42769 + result = -1;
42770 + break;
42771 + }
42772 + if ((char *)(entry + 1) > item_end) {
42773 + *error = "CDE entry is too high";
42774 + result = -1;
42775 + break;
42776 + }
42777 + }
42778 +
42779 + return result;
42780 +}
42781 +#endif
42782 +
42783 +/* ->init() method for this item plugin. */
42784 +int init_cde(coord_t * coord /* coord of item */ ,
42785 + coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
42786 + UNUSED_ARG)
42787 +{
42788 + put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
42789 + return 0;
42790 +}
42791 +
42792 +/* ->lookup() method for this item plugin. */
42793 +lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
42794 + lookup_bias bias /* search bias */ ,
42795 + coord_t * coord /* coord of item to lookup in */ )
42796 +{
42797 + cmp_t last_comp;
42798 + int pos;
42799 +
42800 + reiser4_key utmost_key;
42801 +
42802 + assert("nikita-1293", coord != NULL);
42803 + assert("nikita-1294", key != NULL);
42804 +
42805 + CHECKME(coord);
42806 +
42807 + if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
42808 + coord->unit_pos = 0;
42809 + coord->between = BEFORE_UNIT;
42810 + return CBK_COORD_NOTFOUND;
42811 + }
42812 + pos = find(coord, key, &last_comp);
42813 + if (pos >= 0) {
42814 + coord->unit_pos = (int)pos;
42815 + switch (last_comp) {
42816 + case EQUAL_TO:
42817 + coord->between = AT_UNIT;
42818 + return CBK_COORD_FOUND;
42819 + case GREATER_THAN:
42820 + coord->between = BEFORE_UNIT;
42821 + return RETERR(-ENOENT);
42822 + case LESS_THAN:
42823 + default:
42824 + impossible("nikita-1298", "Broken find");
42825 + return RETERR(-EIO);
42826 + }
42827 + } else {
42828 + coord->unit_pos = units(coord) - 1;
42829 + coord->between = AFTER_UNIT;
42830 + return (bias ==
42831 + FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
42832 + CBK_COORD_NOTFOUND;
42833 + }
42834 +}
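+
+/* A recap of the cases above: a key smaller than the item key lands
+ BEFORE_UNIT 0 as CBK_COORD_NOTFOUND; an exact hash match lands
+ AT_UNIT as CBK_COORD_FOUND; a key falling between units lands
+ BEFORE_UNIT with -ENOENT; a key past the last unit lands AFTER_UNIT,
+ reported as found only for FIND_MAX_NOT_MORE_THAN lookups. */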
42835 +
42836 +/* ->paste() method for this item plugin. */
42837 +int paste_cde(coord_t * coord /* coord of item */ ,
42838 + reiser4_item_data * data /* parameters of new unit being
42839 + * inserted */ ,
42840 + carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
42841 +{
42842 + struct cde_entry_data *e;
42843 + int result;
42844 + int i;
42845 +
42846 + CHECKME(coord);
42847 + e = (struct cde_entry_data *) data->data;
42848 +
42849 + result = 0;
42850 + for (i = 0; i < e->num_of_entries; ++i) {
42851 + int pos;
42852 + int phantom_size;
42853 +
42854 + phantom_size = data->length;
42855 + if (units(coord) == 0)
42856 + phantom_size -= sizeof(cde_item_format);
42857 +
42858 + result =
42859 + expand(coord, e->entry + i, phantom_size, &pos, data->arg);
42860 + if (result != 0)
42861 + break;
42862 + result = paste_entry(coord, e->entry + i, pos, data->arg);
42863 + if (result != 0)
42864 + break;
42865 + }
42866 + CHECKME(coord);
42867 + return result;
42868 +}
42869 +
42870 +/* amount of space occupied by all entries up to and including @idx,
42871 + both headers and bodies. */
42872 +static unsigned int part_size(const coord_t * coord /* coord of item */ ,
42873 + int idx /* index of unit */ )
42874 +{
42875 + assert("nikita-1299", coord != NULL);
42876 + assert("nikita-1300", idx < (int)units(coord));
42877 +
42878 + return sizeof(cde_item_format) +
42879 + (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
42880 + idx + 1) -
42881 + offset_of(coord, 0);
42882 +}
42883 +
42884 +/* how many units of @source, but not more than @want, can be merged
42885 + with the item in the @target node. If pend == append, we try to append
42886 + the first units of @source to the last item of @target. If pend ==
42887 + prepend, we try to prepend the last units of @source to the first item
42888 + of @target. The @target node has @free_space bytes of free space. The
42889 + total size of those units is returned via @size */
42890 +int can_shift_cde(unsigned free_space /* free space in item */ ,
42891 + coord_t * coord /* coord of source item */ ,
42892 + znode * target /* target node */ ,
42893 + shift_direction pend /* shift direction */ ,
42894 + unsigned *size /* resulting number of shifted bytes */ ,
42895 + unsigned want /* maximal number of bytes to shift */ )
42896 +{
42897 + int shift;
42898 +
42899 + CHECKME(coord);
42900 + if (want == 0) {
42901 + *size = 0;
42902 + return 0;
42903 + }
42904 +
42905 + /* pend == SHIFT_LEFT <==> shifting to the left */
42906 + if (pend == SHIFT_LEFT) {
42907 + for (shift = min((int)want - 1, units(coord)); shift >= 0;
42908 + --shift) {
42909 + *size = part_size(coord, shift);
42910 + if (target != NULL)
42911 + *size -= sizeof(cde_item_format);
42912 + if (*size <= free_space)
42913 + break;
42914 + }
42915 + shift = shift + 1;
42916 + } else {
42917 + int total_size;
42918 +
42919 + assert("nikita-1301", pend == SHIFT_RIGHT);
42920 +
42921 + total_size = item_length_by_coord(coord);
42922 + for (shift = units(coord) - want - 1; shift < units(coord) - 1;
42923 + ++shift) {
42924 + *size = total_size - part_size(coord, shift);
42925 + if (target == NULL)
42926 + *size += sizeof(cde_item_format);
42927 + if (*size <= free_space)
42928 + break;
42929 + }
42930 + shift = units(coord) - shift - 1;
42931 + }
42932 + if (shift == 0)
42933 + *size = 0;
42934 + CHECKME(coord);
42935 + return shift;
42936 +}
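+
+/* Worked example with hypothetical numbers: shifting left with want = 3
+ and free_space = 120. The loop measures part_size() of the first 3,
+ then 2, then 1 units (minus sizeof(cde_item_format) when @target is
+ not NULL, since the entry counter need not be copied) until the
+ result fits into 120 bytes; the return value is the number of units
+ that fit, with their byte count in *size. */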
42937 +
42938 +/* ->copy_units() method for this item plugin. */
42939 +void copy_units_cde(coord_t * target /* coord of target item */ ,
42940 + coord_t * source /* coord of source item */ ,
42941 + unsigned from /* starting unit */ ,
42942 + unsigned count /* how many units to copy */ ,
42943 + shift_direction where_is_free_space /* shift direction */ ,
42944 + unsigned free_space /* free space in item */ )
42945 +{
42946 + char *header_from;
42947 + char *header_to;
42948 +
42949 + char *entry_from;
42950 + char *entry_to;
42951 +
42952 + int pos_in_target;
42953 + int data_size;
42954 + int data_delta;
42955 + int i;
42956 +
42957 + assert("nikita-1303", target != NULL);
42958 + assert("nikita-1304", source != NULL);
42959 + assert("nikita-1305", (int)from < units(source));
42960 + assert("nikita-1307", (int)(from + count) <= units(source));
42961 +
42962 + if (where_is_free_space == SHIFT_LEFT) {
42963 + assert("nikita-1453", from == 0);
42964 + pos_in_target = units(target);
42965 + } else {
42966 + assert("nikita-1309", (int)(from + count) == units(source));
42967 + pos_in_target = 0;
42968 + memmove(item_body_by_coord(target),
42969 + (char *)item_body_by_coord(target) + free_space,
42970 + item_length_by_coord(target) - free_space);
42971 + }
42972 +
42973 + CHECKME(target);
42974 + CHECKME(source);
42975 +
42976 + /* expand @target */
42977 + data_size =
42978 + offset_of(source, (int)(from + count)) - offset_of(source,
42979 + (int)from);
42980 +
42981 + if (units(target) == 0)
42982 + free_space -= sizeof(cde_item_format);
42983 +
42984 + expand_item(target, pos_in_target, (int)count,
42985 + (int)(item_length_by_coord(target) - free_space),
42986 + (unsigned)data_size);
42987 +
42988 + /* copy first @count units of @source into @target */
42989 + data_delta =
42990 + offset_of(target, pos_in_target) - offset_of(source, (int)from);
42991 +
42992 + /* copy entries */
42993 + entry_from = (char *)entry_at(source, (int)from);
42994 + entry_to = (char *)entry_at(source, (int)(from + count));
42995 + memmove(entry_at(target, pos_in_target), entry_from,
42996 + (unsigned)(entry_to - entry_from));
42997 +
42998 + /* copy headers */
42999 + header_from = (char *)header_at(source, (int)from);
43000 + header_to = (char *)header_at(source, (int)(from + count));
43001 + memmove(header_at(target, pos_in_target), header_from,
43002 + (unsigned)(header_to - header_from));
43003 +
43004 + /* update offsets */
43005 + for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
43006 + adj_offset(target, i, data_delta);
43007 + CHECKME(target);
43008 + CHECKME(source);
43009 +}
43010 +
43011 +/* ->cut_units() method for this item plugin. */
43012 +int cut_units_cde(coord_t * coord /* coord of item */ ,
43013 + pos_in_node_t from /* start unit pos */ ,
43014 + pos_in_node_t to /* stop unit pos */ ,
43015 + struct carry_cut_data *cdata UNUSED_ARG,
43016 + reiser4_key * smallest_removed, reiser4_key * new_first)
43017 +{
43018 + char *header_from;
43019 + char *header_to;
43020 +
43021 + char *entry_from;
43022 + char *entry_to;
43023 +
43024 + int size;
43025 + int entry_delta;
43026 + int header_delta;
43027 + int i;
43028 +
43029 + unsigned count;
43030 +
43031 + CHECKME(coord);
43032 +
43033 + count = to - from + 1;
43034 +
43035 + assert("nikita-1454", coord != NULL);
43036 + assert("nikita-1455", (int)(from + count) <= units(coord));
43037 +
43038 + if (smallest_removed)
43039 + unit_key_by_coord(coord, smallest_removed);
43040 +
43041 + if (new_first) {
43042 + coord_t next;
43043 +
43044 + /* not everything is cut from item head */
43045 + assert("vs-1527", from == 0);
43046 + assert("vs-1528", to < units(coord) - 1);
43047 +
43048 + coord_dup(&next, coord);
43049 + next.unit_pos++;
43050 + unit_key_by_coord(&next, new_first);
43051 + }
43052 +
43053 + size = item_length_by_coord(coord);
43054 + if (count == (unsigned)units(coord)) {
43055 + return size;
43056 + }
43057 +
43058 + header_from = (char *)header_at(coord, (int)from);
43059 + header_to = (char *)header_at(coord, (int)(from + count));
43060 +
43061 + entry_from = (char *)entry_at(coord, (int)from);
43062 + entry_to = (char *)entry_at(coord, (int)(from + count));
43063 +
43064 + /* move headers */
43065 + memmove(header_from, header_to,
43066 + (unsigned)(address(coord, size) - header_to));
43067 +
43068 + header_delta = header_to - header_from;
43069 +
43070 + entry_from -= header_delta;
43071 + entry_to -= header_delta;
43072 + size -= header_delta;
43073 +
43074 + /* copy entries */
43075 + memmove(entry_from, entry_to,
43076 + (unsigned)(address(coord, size) - entry_to));
43077 +
43078 + entry_delta = entry_to - entry_from;
43079 + size -= entry_delta;
43080 +
43081 + /* update offsets */
43082 +
43083 + for (i = 0; i < (int)from; ++i)
43084 + adj_offset(coord, i, -header_delta);
43085 +
43086 + for (i = from; i < units(coord) - (int)count; ++i)
43087 + adj_offset(coord, i, -header_delta - entry_delta);
43088 +
43089 + put_unaligned(cpu_to_le16((__u16) units(coord) - count),
43090 + &formatted_at(coord)->num_of_entries);
43091 +
43092 + if (from == 0) {
43093 + /* entries were removed from the head - move the rest to the right */
43094 + memmove((char *)item_body_by_coord(coord) +
43095 + header_delta + entry_delta, item_body_by_coord(coord),
43096 + (unsigned)size);
43097 + if (REISER4_DEBUG)
43098 + memset(item_body_by_coord(coord), 0,
43099 + (unsigned)header_delta + entry_delta);
43100 + } else {
43101 + /* freed space is already at the end of item */
43102 + if (REISER4_DEBUG)
43103 + memset((char *)item_body_by_coord(coord) + size, 0,
43104 + (unsigned)header_delta + entry_delta);
43105 + }
43106 +
43107 + return header_delta + entry_delta;
43108 +}
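+
+/* Worked example with hypothetical numbers: cutting units 1..2 out of
+ 5 gives header_delta = 2 * sizeof(cde_unit_header) and entry_delta =
+ the total length of the two cut bodies. Surviving headers and bodies
+ are slid over the hole, offsets of units before @from shrink by
+ header_delta (only headers vanished in front of their bodies), the
+ rest shrink by header_delta + entry_delta, and the freed byte count
+ is returned to the caller. */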
43109 +
43110 +int kill_units_cde(coord_t * coord /* coord of item */ ,
43111 + pos_in_node_t from /* start unit pos */ ,
43112 + pos_in_node_t to /* stop unit pos */ ,
43113 + struct carry_kill_data *kdata UNUSED_ARG,
43114 + reiser4_key * smallest_removed, reiser4_key * new_first)
43115 +{
43116 + return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
43117 +}
43118 +
43119 +/* ->s.dir.extract_key() method for this item plugin. */
43120 +int extract_key_cde(const coord_t * coord /* coord of item */ ,
43121 + reiser4_key * key /* resulting key */ )
43122 +{
43123 + directory_entry_format *dent;
43124 +
43125 + assert("nikita-1155", coord != NULL);
43126 + assert("nikita-1156", key != NULL);
43127 +
43128 + dent = entry_at(coord, idx_of(coord));
43129 + return extract_key_from_id(&dent->id, key);
43130 +}
43131 +
43132 +int
43133 +update_key_cde(const coord_t * coord, const reiser4_key * key,
43134 + lock_handle * lh UNUSED_ARG)
43135 +{
43136 + directory_entry_format *dent;
43137 + obj_key_id obj_id;
43138 + int result;
43139 +
43140 + assert("nikita-2344", coord != NULL);
43141 + assert("nikita-2345", key != NULL);
43142 +
43143 + dent = entry_at(coord, idx_of(coord));
43144 + result = build_obj_key_id(key, &obj_id);
43145 + if (result == 0) {
43146 + dent->id = obj_id;
43147 + znode_make_dirty(coord->node);
43148 + }
43149 + return 0;
43150 +}
43151 +
43152 +/* ->s.dir.extract_name() method for this item plugin. */
43153 +char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
43154 +{
43155 + directory_entry_format *dent;
43156 +
43157 + assert("nikita-1157", coord != NULL);
43158 +
43159 + dent = entry_at(coord, idx_of(coord));
43160 + return extract_dent_name(coord, dent, buf);
43161 +}
43162 +
43163 +static int cde_bytes(int pasting, const reiser4_item_data * data)
43164 +{
43165 + int result;
43166 +
43167 + result = data->length;
43168 + if (!pasting)
43169 + result -= sizeof(cde_item_format);
43170 + return result;
43171 +}
43172 +
43173 +/* ->s.dir.add_entry() method for this item plugin */
43174 +int add_entry_cde(struct inode *dir /* directory object */ ,
43175 + coord_t * coord /* coord of item */ ,
43176 + lock_handle * lh /* lock handle for insertion */ ,
43177 + const struct dentry *name /* name to insert */ ,
43178 + reiser4_dir_entry_desc * dir_entry /* parameters of new
43179 + * directory entry */ )
43180 +{
43181 + reiser4_item_data data;
43182 + struct cde_entry entry;
43183 + struct cde_entry_data edata;
43184 + int result;
43185 +
43186 + assert("nikita-1656", coord->node == lh->node);
43187 + assert("nikita-1657", znode_is_write_locked(coord->node));
43188 +
43189 + edata.num_of_entries = 1;
43190 + edata.entry = &entry;
43191 +
43192 + entry.dir = dir;
43193 + entry.obj = dir_entry->obj;
43194 + entry.name = &name->d_name;
43195 +
43196 + data.data = (char *)&edata;
43197 + data.user = 0; /* &edata is not user space */
43198 + data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
43199 + data.arg = dir_entry;
43200 + assert("nikita-1302", data.iplug != NULL);
43201 +
43202 + result = is_dot_key(&dir_entry->key);
43203 + data.length = estimate_cde(result ? coord : NULL, &data);
43204 +
43205 + /* NOTE-NIKITA quota plugin? */
43206 + if (vfs_dq_alloc_space_nodirty(dir, cde_bytes(result, &data)))
43207 + return RETERR(-EDQUOT);
43208 +
43209 + if (result)
43210 + result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
43211 + else
43212 + result = reiser4_resize_item(coord, &data, &dir_entry->key,
43213 + lh, 0);
43214 + return result;
43215 +}
43216 +
43217 +/* ->s.dir.rem_entry() */
43218 +int rem_entry_cde(struct inode *dir /* directory of item */ ,
43219 + const struct qstr *name, coord_t * coord /* coord of item */ ,
43220 + lock_handle * lh UNUSED_ARG /* lock handle for
43221 + * removal */ ,
43222 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
43223 + * directory entry
43224 + * being removed */ )
43225 +{
43226 + coord_t shadow;
43227 + int result;
43228 + int length;
43229 + ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
43230 +
43231 + assert("nikita-2870", strlen(name->name) == name->len);
43232 + assert("nikita-2869",
43233 + !strcmp(name->name, extract_name_cde(coord, buf)));
43234 +
43235 + length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
43236 + if (is_longname(name->name, name->len))
43237 + length += name->len + 1;
43238 +
43239 + if (inode_get_bytes(dir) < length) {
43240 + warning("nikita-2628", "Dir is broken: %llu: %llu",
43241 + (unsigned long long)get_inode_oid(dir),
43242 + inode_get_bytes(dir));
43243 +
43244 + return RETERR(-EIO);
43245 + }
43246 +
43247 + /* cut_node() is supposed to take pointers to _different_
43248 + coords, because it will modify them without respect to
43249 + possible aliasing. To work around this, create a temporary
43250 + copy of @coord.
43251 + */
43252 + coord_dup(&shadow, coord);
43253 + result =
43254 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
43255 + if (result == 0) {
43256 + /* NOTE-NIKITA quota plugin? */
43257 + vfs_dq_free_space_nodirty(dir, length);
43258 + }
43259 + return result;
43260 +}
43261 +
43262 +/* ->s.dir.max_name_len() method for this item plugin */
43263 +int max_name_len_cde(const struct inode *dir /* directory */ )
43264 +{
43265 + return
43266 + reiser4_tree_by_inode(dir)->nplug->max_item_size() -
43267 + sizeof(directory_entry_format) - sizeof(cde_item_format) -
43268 + sizeof(cde_unit_header) - 2;
43269 +}
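+
+/* Worked example, assuming hypothetically that the node plugin reports
+ a 4000-byte maximal item: the longest representable name is 4000
+ minus the item counter, one unit header and one entry body, minus 2
+ further bytes (presumably the name's terminating NUL plus one byte
+ of slack). */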
43270 +
43271 +/* Make Linus happy.
43272 + Local variables:
43273 + c-indentation-style: "K&R"
43274 + mode-name: "LC"
43275 + c-basic-offset: 8
43276 + tab-width: 8
43277 + fill-column: 120
43278 + End:
43279 +*/
43280 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/cde.h linux-2.6.33/fs/reiser4/plugin/item/cde.h
43281 --- linux-2.6.33.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 01:00:00.000000000 +0100
43282 +++ linux-2.6.33/fs/reiser4/plugin/item/cde.h 2010-03-04 19:33:22.000000000 +0100
43283 @@ -0,0 +1,87 @@
43284 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
43285 +
43286 +/* Compound directory item. See cde.c for description. */
43287 +
43288 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
43289 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
43290 +
43291 +#include "../../forward.h"
43292 +#include "../../kassign.h"
43293 +#include "../../dformat.h"
43294 +
43295 +#include <linux/fs.h> /* for struct inode */
43296 +#include <linux/dcache.h> /* for struct dentry, etc */
43297 +
43298 +typedef struct cde_unit_header {
43299 + de_id hash;
43300 + d16 offset;
43301 +} cde_unit_header;
43302 +
43303 +typedef struct cde_item_format {
43304 + d16 num_of_entries;
43305 + cde_unit_header entry[0];
43306 +} cde_item_format;
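+
+/* The resulting on-disk picture (an illustration derived from the two
+ structs above, with n == num_of_entries):
+
+ num_of_entries | header[0] .. header[n-1] | body[0] .. body[n-1]
+
+ each header[i].offset locates body[i] relative to the start of the
+ item, and headers are kept sorted by hash. */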
43307 +
43308 +struct cde_entry {
43309 + const struct inode *dir;
43310 + const struct inode *obj;
43311 + const struct qstr *name;
43312 +};
43313 +
43314 +struct cde_entry_data {
43315 + int num_of_entries;
43316 + struct cde_entry *entry;
43317 +};
43318 +
43319 +/* plugin->item.b.* */
43320 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
43321 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
43322 + const reiser4_item_data *);
43323 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
43324 +pos_in_node_t nr_units_cde(const coord_t * coord);
43325 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
43326 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
43327 +void print_cde(const char *prefix, coord_t * coord);
43328 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
43329 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
43330 + coord_t * coord);
43331 +int paste_cde(coord_t * coord, reiser4_item_data * data,
43332 + carry_plugin_info * info UNUSED_ARG);
43333 +int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
43334 + shift_direction pend, unsigned *size, unsigned want);
43335 +void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
43336 + unsigned count, shift_direction where_is_free_space,
43337 + unsigned free_space);
43338 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43339 + struct carry_cut_data *, reiser4_key * smallest_removed,
43340 + reiser4_key * new_first);
43341 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43342 + struct carry_kill_data *, reiser4_key * smallest_removed,
43343 + reiser4_key * new_first);
43344 +void print_cde(const char *prefix, coord_t * coord);
43345 +int reiser4_check_cde(const coord_t * coord, const char **error);
43346 +
43347 +/* plugin->u.item.s.dir.* */
43348 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
43349 +int update_key_cde(const coord_t * coord, const reiser4_key * key,
43350 + lock_handle * lh);
43351 +char *extract_name_cde(const coord_t * coord, char *buf);
43352 +int add_entry_cde(struct inode *dir, coord_t * coord,
43353 + lock_handle * lh, const struct dentry *name,
43354 + reiser4_dir_entry_desc * entry);
43355 +int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
43356 + lock_handle * lh, reiser4_dir_entry_desc * entry);
43357 +int max_name_len_cde(const struct inode *dir);
43358 +
43359 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
43360 +#endif
43361 +
43362 +/* Make Linus happy.
43363 + Local variables:
43364 + c-indentation-style: "K&R"
43365 + mode-name: "LC"
43366 + c-basic-offset: 8
43367 + tab-width: 8
43368 + fill-column: 120
43369 + End:
43370 +*/
43371 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.33/fs/reiser4/plugin/item/ctail.c
43372 --- linux-2.6.33.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 01:00:00.000000000 +0100
43373 +++ linux-2.6.33/fs/reiser4/plugin/item/ctail.c 2010-03-04 19:33:22.000000000 +0100
43374 @@ -0,0 +1,1613 @@
43375 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
43376 +
43377 +/* ctails (aka "clustered tails") are items for cryptcompress objects */
43378 +
43379 +/* DESCRIPTION:
43380 +
43381 +Each cryptcompress object is stored on disk as a set of clusters sliced
43382 +into ctails.
43383 +
43384 +Internal on-disk structure:
43385 +
43386 + HEADER (1 byte) the disk cluster shift is stored here
43387 + BODY
43388 +*/
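+
+/* A minimal sketch of the arithmetic implied above (illustrative): with
+ cluster shift S stored in the one-byte header, a prepped disk cluster
+ covers 1 << S bytes of object data, and every byte of the item body
+ past the header is one unit -- see disk_cluster_size() and
+ nr_units_ctail() below. */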
43389 +
43390 +#include "../../forward.h"
43391 +#include "../../debug.h"
43392 +#include "../../dformat.h"
43393 +#include "../../kassign.h"
43394 +#include "../../key.h"
43395 +#include "../../coord.h"
43396 +#include "item.h"
43397 +#include "../node/node.h"
43398 +#include "../plugin.h"
43399 +#include "../object.h"
43400 +#include "../../znode.h"
43401 +#include "../../carry.h"
43402 +#include "../../tree.h"
43403 +#include "../../inode.h"
43404 +#include "../../super.h"
43405 +#include "../../context.h"
43406 +#include "../../page_cache.h"
43407 +#include "../cluster.h"
43408 +#include "../../flush.h"
43409 +#include "../../tree_walk.h"
43410 +
43411 +#include <linux/pagevec.h>
43412 +#include <linux/swap.h>
43413 +#include <linux/fs.h>
43414 +
43415 +/* return body of ctail item at @coord */
43416 +static ctail_item_format *ctail_formatted_at(const coord_t * coord)
43417 +{
43418 + assert("edward-60", coord != NULL);
43419 + return item_body_by_coord(coord);
43420 +}
43421 +
43422 +static int cluster_shift_by_coord(const coord_t * coord)
43423 +{
43424 + return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
43425 +}
43426 +
43427 +static inline void dclust_set_extension_shift(hint_t * hint)
43428 +{
43429 + assert("edward-1270",
43430 + item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
43431 + hint->ext_coord.extension.ctail.shift =
43432 + cluster_shift_by_coord(&hint->ext_coord.coord);
43433 +}
43434 +
43435 +static loff_t off_by_coord(const coord_t * coord)
43436 +{
43437 + reiser4_key key;
43438 + return get_key_offset(item_key_by_coord(coord, &key));
43439 +}
43440 +
43441 +int coord_is_unprepped_ctail(const coord_t * coord)
43442 +{
43443 + assert("edward-1233", coord != NULL);
43444 + assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
43445 + assert("edward-1235",
43446 + ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
43447 + nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
43448 +
43449 + return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
43450 +}
43451 +
43452 +static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
43453 +{
43454 + int shift;
43455 +
43456 + if (inode != NULL) {
43457 + shift = inode_cluster_shift(inode);
43458 + assert("edward-1236",
43459 + ergo(!coord_is_unprepped_ctail(coord),
43460 + shift == cluster_shift_by_coord(coord)));
43461 + } else {
43462 + assert("edward-1237", !coord_is_unprepped_ctail(coord));
43463 + shift = cluster_shift_by_coord(coord);
43464 + }
43465 + return off_by_coord(coord) >> shift;
43466 +}
43467 +
43468 +static int disk_cluster_size(const coord_t * coord)
43469 +{
43470 + assert("edward-1156",
43471 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
43472 + /* calculation of the disk cluster size
43473 + is meaningless if the ctail is unprepped */
43474 + assert("edward-1238", !coord_is_unprepped_ctail(coord));
43475 +
43476 + return 1 << cluster_shift_by_coord(coord);
43477 +}
43478 +
43479 +/* true if the key is of first disk cluster item */
43480 +static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
43481 +{
43482 + assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
43483 +
43484 + return coord_is_unprepped_ctail(coord) ||
43485 + ((get_key_offset(key) &
43486 + ((loff_t) disk_cluster_size(coord) - 1)) == 0);
43487 +}
43488 +
43489 +static char *first_unit(coord_t * coord)
43490 +{
43491 + /* FIXME: warning: pointer of type `void *' used in arithmetic */
43492 + return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
43493 +}
43494 +
43495 +/* plugin->u.item.b.max_key_inside :
43496 + tail_max_key_inside */
43497 +
43498 +/* plugin->u.item.b.can_contain_key */
43499 +int
43500 +can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
43501 + const reiser4_item_data * data)
43502 +{
43503 + reiser4_key item_key;
43504 +
43505 + if (item_plugin_by_coord(coord) != data->iplug)
43506 + return 0;
43507 +
43508 + item_key_by_coord(coord, &item_key);
43509 + if (get_key_locality(key) != get_key_locality(&item_key) ||
43510 + get_key_objectid(key) != get_key_objectid(&item_key))
43511 + return 0;
43512 + if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
43513 + get_key_offset(key))
43514 + return 0;
43515 + if (is_disk_cluster_key(key, coord))
43516 + return 0;
43517 + return 1;
43518 +}
43519 +
43520 +/* plugin->u.item.b.mergeable */
43521 +int mergeable_ctail(const coord_t * p1, const coord_t * p2)
43522 +{
43523 + reiser4_key key1, key2;
43524 +
43525 + assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
43526 + assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
43527 + UNIX_FILE_METADATA_ITEM_TYPE));
43528 +
43529 + if (item_id_by_coord(p2) != CTAIL_ID) {
43530 + /* second item is of another type */
43531 + return 0;
43532 + }
43533 +
43534 + item_key_by_coord(p1, &key1);
43535 + item_key_by_coord(p2, &key2);
43536 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
43537 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
43538 + get_key_type(&key1) != get_key_type(&key2)) {
43539 + /* items of different objects */
43540 + return 0;
43541 + }
43542 + if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
43543 + /* not adjacent items */
43544 + return 0;
43545 + if (is_disk_cluster_key(&key2, p2))
43546 + return 0;
43547 + return 1;
43548 +}
43549 +
43550 +/* plugin->u.item.b.nr_units */
43551 +pos_in_node_t nr_units_ctail(const coord_t * coord)
43552 +{
43553 + return (item_length_by_coord(coord) -
43554 + sizeof(ctail_formatted_at(coord)->cluster_shift));
43555 +}
43556 +
43557 +/* plugin->u.item.b.estimate:
43558 + estimate how much space is needed to insert/paste @data->length bytes
43559 + into ctail at @coord */
43560 +int estimate_ctail(const coord_t * coord /* coord of item */ ,
43561 + const reiser4_item_data *
43562 + data /* parameters for new item */ )
43563 +{
43564 + if (coord == NULL)
43565 + /* insert */
43566 + return (sizeof(ctail_item_format) + data->length);
43567 + else
43568 + /* paste */
43569 + return data->length;
43570 +}
43571 +
43572 +/* ->init() method for this item plugin. */
43573 +int init_ctail(coord_t * to /* coord of item */ ,
43574 + coord_t * from /* old_item */ ,
43575 + reiser4_item_data * data /* structure used for insertion */ )
43576 +{
43577 + int cluster_shift; /* cpu value to convert */
43578 +
43579 + if (data) {
43580 + assert("edward-463", data->length > sizeof(ctail_item_format));
43581 + cluster_shift = *((int *)(data->arg));
43582 + data->length -= sizeof(ctail_item_format);
43583 + } else {
43584 + assert("edward-464", from != NULL);
43585 + assert("edward-855", ctail_ok(from));
43586 + cluster_shift = (int)(cluster_shift_by_coord(from));
43587 + }
43588 + put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
43589 + assert("edward-856", ctail_ok(to));
43590 + return 0;
43591 +}
43592 +
43593 +/* plugin->u.item.b.lookup:
43594 + NULL: We are looking for item keys only */
43595 +
43596 +#if REISER4_DEBUG
43597 +int ctail_ok(const coord_t * coord)
43598 +{
43599 + return coord_is_unprepped_ctail(coord) ||
43600 + cluster_shift_ok(cluster_shift_by_coord(coord));
43601 +}
43602 +
43603 +/* plugin->u.item.b.check */
43604 +int check_ctail(const coord_t * coord, const char **error)
43605 +{
43606 + if (!ctail_ok(coord)) {
43607 + if (error)
43608 + *error = "bad cluster shift in ctail";
43609 + return 1;
43610 + }
43611 + return 0;
43612 +}
43613 +#endif
43614 +
43615 +/* plugin->u.item.b.paste */
43616 +int
43617 +paste_ctail(coord_t * coord, reiser4_item_data * data,
43618 + carry_plugin_info * info UNUSED_ARG)
43619 +{
43620 + unsigned old_nr_units;
43621 +
43622 + assert("edward-268", data->data != NULL);
43623 + /* copy only from kernel space */
43624 + assert("edward-66", data->user == 0);
43625 +
43626 + old_nr_units =
43627 + item_length_by_coord(coord) - sizeof(ctail_item_format) -
43628 + data->length;
43629 +
43630 + /* ctail items never get pasted in the middle */
43631 +
43632 + if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
43633 +
43634 + /* paste at the beginning when create new item */
43635 + assert("edward-450",
43636 + item_length_by_coord(coord) ==
43637 + data->length + sizeof(ctail_item_format));
43638 + assert("edward-451", old_nr_units == 0);
43639 + } else if (coord->unit_pos == old_nr_units - 1
43640 + && coord->between == AFTER_UNIT) {
43641 +
43642 + /* paste at the end */
43643 + coord->unit_pos++;
43644 + } else
43645 + impossible("edward-453", "bad paste position");
43646 +
43647 + memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
43648 +
43649 + assert("edward-857", ctail_ok(coord));
43650 +
43651 + return 0;
43652 +}
43653 +
43654 +/* plugin->u.item.b.fast_paste */
43655 +
43656 +/* plugin->u.item.b.can_shift
43657 + number of units is returned via return value, number of bytes via @size. For
43658 + ctail items they coincide */
43659 +int
43660 +can_shift_ctail(unsigned free_space, coord_t * source,
43661 + znode * target, shift_direction direction UNUSED_ARG,
43662 + unsigned *size /* number of bytes */ , unsigned want)
43663 +{
43664 + /* make sure that we do not want to shift more than we have */
43665 + assert("edward-68", want > 0 && want <= nr_units_ctail(source));
43666 +
43667 + *size = min(want, free_space);
43668 +
43669 + if (!target) {
43670 + /* new item will be created */
43671 + if (*size <= sizeof(ctail_item_format)) {
43672 + *size = 0;
43673 + return 0;
43674 + }
43675 + return *size - sizeof(ctail_item_format);
43676 + }
43677 + return *size;
43678 +}
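+
+/* Example with hypothetical numbers: want = 100 units and free_space =
+ 50 bytes yields *size = 50. Into an existing item all 50 bytes become
+ 50 shifted units; into a new item sizeof(ctail_item_format) bytes pay
+ for the header first, and nothing at all is shifted when even that
+ header does not fit. */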
43679 +
43680 +/* plugin->u.item.b.copy_units
43681 + cooperates with ->can_shift() */
43682 +void
43683 +copy_units_ctail(coord_t * target, coord_t * source,
43684 + unsigned from, unsigned count /* units */ ,
43685 + shift_direction where_is_free_space,
43686 + unsigned free_space /* bytes */ )
43687 +{
43688 + /* make sure that item @target is expanded already */
43689 + assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
43690 + assert("edward-70", free_space == count || free_space == count + 1);
43691 +
43692 + assert("edward-858", ctail_ok(source));
43693 +
43694 + if (where_is_free_space == SHIFT_LEFT) {
43695 + /* append the first @count bytes of @source to item @target:
43696 + this restriction came from ordinary tails */
43697 + assert("edward-71", from == 0);
43698 + assert("edward-860", ctail_ok(target));
43699 +
43700 + memcpy(first_unit(target) + nr_units_ctail(target) - count,
43701 + first_unit(source), count);
43702 + } else {
43703 + /* the target item has already been moved to the right */
43704 + reiser4_key key;
43705 +
43706 + assert("edward-72", nr_units_ctail(source) == from + count);
43707 +
43708 + if (free_space == count) {
43709 + init_ctail(target, source, NULL);
43710 + } else {
43711 + /* new item has been created */
43712 + assert("edward-862", ctail_ok(target));
43713 + }
43714 + memcpy(first_unit(target), first_unit(source) + from, count);
43715 +
43716 + assert("edward-863", ctail_ok(target));
43717 +
43718 + /* new units are inserted before the first unit of the item,
43719 + therefore we have to update the item key */
43720 + item_key_by_coord(source, &key);
43721 + set_key_offset(&key, get_key_offset(&key) + from);
43722 +
43723 + node_plugin_by_node(target->node)->update_item_key(target, &key,
43724 + NULL /*info */);
43725 + }
43726 +}
43727 +
43728 +/* plugin->u.item.b.create_hook */
43729 +int create_hook_ctail(const coord_t * coord, void *arg)
43730 +{
43731 + assert("edward-864", znode_is_loaded(coord->node));
43732 +
43733 + znode_set_convertible(coord->node);
43734 + return 0;
43735 +}
43736 +
43737 +/* plugin->u.item.b.kill_hook */
43738 +int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
43739 + pos_in_node_t count, carry_kill_data * kdata)
43740 +{
43741 + struct inode *inode;
43742 +
43743 + assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
43744 + assert("edward-291", znode_is_write_locked(coord->node));
43745 +
43746 + inode = kdata->inode;
43747 + if (inode) {
43748 + reiser4_key key;
43749 + struct cryptcompress_info * info;
43750 + cloff_t index;
43751 +
43752 + item_key_by_coord(coord, &key);
43753 + info = cryptcompress_inode_data(inode);
43754 + index = off_to_clust(get_key_offset(&key), inode);
43755 +
43756 + if (from == 0) {
43757 + info->trunc_index = index;
43758 + if (is_disk_cluster_key(&key, coord)) {
43759 + /*
43760 + * first item of disk cluster is to be killed
43761 + */
43762 + truncate_complete_page_cluster(
43763 + inode, index, kdata->params.truncate);
43764 + inode_sub_bytes(inode,
43765 + inode_cluster_size(inode));
43766 + }
43767 + }
43768 + }
43769 + return 0;
43770 +}
43771 +
43772 +/* for shift_hook_ctail(),
43773 + return true if the first disk cluster item has a dirty child
43774 +*/
43775 +static int ctail_convertible(const coord_t * coord)
43776 +{
43777 + int result;
43778 + reiser4_key key;
43779 + jnode *child = NULL;
43780 +
43781 + assert("edward-477", coord != NULL);
43782 + assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
43783 +
43784 + if (coord_is_unprepped_ctail(coord))
43785 + /* unprepped ctail should be converted */
43786 + return 1;
43787 +
43788 + item_key_by_coord(coord, &key);
43789 + child = jlookup(current_tree,
43790 + get_key_objectid(&key),
43791 + off_to_pg(off_by_coord(coord)));
43792 + if (!child)
43793 + return 0;
43794 + result = JF_ISSET(child, JNODE_DIRTY);
43795 + jput(child);
43796 + return result;
43797 +}
43798 +
43799 +/* FIXME-EDWARD */
43800 +/* plugin->u.item.b.shift_hook */
43801 +int shift_hook_ctail(const coord_t * item /* coord of item */ ,
43802 + unsigned from UNUSED_ARG /* start unit */ ,
43803 + unsigned count UNUSED_ARG /* stop unit */ ,
43804 + znode * old_node /* old parent */ )
43805 +{
43806 + assert("edward-479", item != NULL);
43807 + assert("edward-480", item->node != old_node);
43808 +
43809 + if (!znode_convertible(old_node) || znode_convertible(item->node))
43810 + return 0;
43811 + if (ctail_convertible(item))
43812 + znode_set_convertible(item->node);
43813 + return 0;
43814 +}
43815 +
43816 +static int
43817 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43818 + int cut, void *p, reiser4_key * smallest_removed,
43819 + reiser4_key * new_first)
43820 +{
43821 + pos_in_node_t count; /* number of units to cut */
43822 + char *item;
43823 +
43824 + count = to - from + 1;
43825 + item = item_body_by_coord(coord);
43826 +
43827 + assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
43828 +
43829 + if (smallest_removed) {
43830 + /* store smallest key removed */
43831 + item_key_by_coord(coord, smallest_removed);
43832 + set_key_offset(smallest_removed,
43833 + get_key_offset(smallest_removed) + from);
43834 + }
43835 +
43836 + if (new_first) {
43837 + assert("vs-1531", from == 0);
43838 +
43839 + item_key_by_coord(coord, new_first);
43840 + set_key_offset(new_first,
43841 + get_key_offset(new_first) + from + count);
43842 + }
43843 +
43844 + if (!cut)
43845 + kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
43846 +
43847 + if (from == 0) {
43848 + if (count != nr_units_ctail(coord)) {
43849 + /* part of the item is removed, so move the free space to the
43850 + beginning of the item and update the item key */
43851 + reiser4_key key;
43852 + memcpy(item + to + 1, item, sizeof(ctail_item_format));
43853 + item_key_by_coord(coord, &key);
43854 + set_key_offset(&key, get_key_offset(&key) + count);
43855 + node_plugin_by_node(coord->node)->update_item_key(coord,
43856 + &key,
43857 + NULL);
43858 + } else {
43859 + /* cut_units should not be called to cut everything */
43860 + assert("vs-1532", ergo(cut, 0));
43861 + /* the whole item is cut, so more than the amount of space
43862 + occupied by the units gets freed */
43863 + count += sizeof(ctail_item_format);
43864 + }
43865 + if (REISER4_DEBUG)
43866 + memset(item, 0, count);
43867 + } else if (REISER4_DEBUG)
43868 + memset(item + sizeof(ctail_item_format) + from, 0, count);
43869 + return count;
43870 +}
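+
+/* Worked example with hypothetical numbers: killing units 0..9 of a
+ 100-unit ctail copies the item header to just before the first
+ surviving unit, bumps the item key offset by 10, and reports 10
+ freed bytes; only when the whole item goes away does
+ sizeof(ctail_item_format) join the freed count. */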
43871 +
43872 +/* plugin->u.item.b.cut_units */
43873 +int
43874 +cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43875 + carry_cut_data * cdata, reiser4_key * smallest_removed,
43876 + reiser4_key * new_first)
43877 +{
43878 + return cut_or_kill_ctail_units(item, from, to, 1, NULL,
43879 + smallest_removed, new_first);
43880 +}
43881 +
43882 +/* plugin->u.item.b.kill_units */
43883 +int
43884 +kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43885 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
43886 + reiser4_key * new_first)
43887 +{
43888 + return cut_or_kill_ctail_units(item, from, to, 0, kdata,
43889 + smallest_removed, new_first);
43890 +}
43891 +
43892 +/* plugin->u.item.s.file.read */
43893 +int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
43894 +{
43895 + uf_coord_t *uf_coord;
43896 + coord_t *coord;
43897 +
43898 + uf_coord = &hint->ext_coord;
43899 + coord = &uf_coord->coord;
43900 + assert("edward-127", f->user == 0);
43901 + assert("edward-129", coord && coord->node);
43902 + assert("edward-130", coord_is_existing_unit(coord));
43903 + assert("edward-132", znode_is_loaded(coord->node));
43904 +
43905 + /* start read only from the beginning of ctail */
43906 + assert("edward-133", coord->unit_pos == 0);
43907 + /* read only whole ctails */
43908 + assert("edward-135", nr_units_ctail(coord) <= f->length);
43909 +
43910 + assert("edward-136", reiser4_schedulable());
43911 + assert("edward-886", ctail_ok(coord));
43912 +
43913 + if (f->data)
43914 + memcpy(f->data, (char *)first_unit(coord),
43915 + (size_t) nr_units_ctail(coord));
43916 +
43917 + dclust_set_extension_shift(hint);
43918 + mark_page_accessed(znode_page(coord->node));
43919 + move_flow_forward(f, nr_units_ctail(coord));
43920 +
43921 + return 0;
43922 +}
43923 +
43924 +/**
43925 + * Prepare a transform stream with plain text for page
43926 + * @page, taking synchronization issues into account.
43927 + */
43928 +static int ctail_read_disk_cluster(struct cluster_handle * clust,
43929 + struct inode * inode, struct page * page,
43930 + znode_lock_mode mode)
43931 +{
43932 + int result;
43933 +
43934 + assert("edward-1450", mode == ZNODE_READ_LOCK || mode == ZNODE_WRITE_LOCK);
43935 + assert("edward-671", clust->hint != NULL);
43936 + assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
43937 + assert("edward-672", cryptcompress_inode_ok(inode));
43938 + assert("edward-1527", PageLocked(page));
43939 +
43940 + unlock_page(page);
43941 +
43942 + /* set input stream */
43943 + result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
43944 + if (result) {
43945 + lock_page(page);
43946 + return result;
43947 + }
43948 + result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
43949 + lock_page(page);
43950 + if (result)
43951 + return result;
43952 + /*
43953 + * at this point we have locked position in the tree
43954 + */
43955 + assert("edward-1528", znode_is_any_locked(clust->hint->lh.node));
43956 +
43957 + if (page->mapping != inode->i_mapping) {
43958 + /* page was truncated */
43959 + reiser4_unset_hint(clust->hint);
43960 + reset_cluster_params(clust);
43961 + return AOP_TRUNCATED_PAGE;
43962 + }
43963 + if (PageUptodate(page)) {
43964 + /* disk cluster can be obsolete, don't use it! */
43965 + reiser4_unset_hint(clust->hint);
43966 + reset_cluster_params(clust);
43967 + return 0;
43968 + }
43969 + if (clust->dstat == FAKE_DISK_CLUSTER ||
43970 + clust->dstat == UNPR_DISK_CLUSTER ||
43971 + clust->dstat == TRNC_DISK_CLUSTER) {
43972 + /*
43973 + * this information about disk cluster will be valid
43974 + * as long as we keep the position in the tree locked
43975 + */
43976 + tfm_cluster_set_uptodate(&clust->tc);
43977 + return 0;
43978 + }
43979 + /* now prepare output stream.. */
43980 + result = grab_coa(&clust->tc, inode_compression_plugin(inode));
43981 + if (result)
43982 + return result;
43983 + /* ..and fill this with plain text */
43984 + result = reiser4_inflate_cluster(clust, inode);
43985 + if (result)
43986 + return result;
43987 + /*
43988 + * The stream is ready! It won't be obsolete as
43989 + * long as we keep last disk cluster item locked.
43990 + */
43991 + tfm_cluster_set_uptodate(&clust->tc);
43992 + return 0;
43993 +}
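+
+/* To summarize the synchronization dance above: the page is unlocked
+ while the disk cluster is looked up (the tree lookup may block), then
+ re-locked and re-validated -- it may have been truncated or brought
+ uptodate by somebody else in the meantime -- before the transform
+ stream is declared usable. */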
43994 +
43995 +/*
43996 + * fill one page with plain text.
43997 + */
43998 +int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
43999 + struct page *page, znode_lock_mode mode)
44000 +{
44001 + int ret;
44002 + unsigned cloff;
44003 + char *data;
44004 + size_t to_page;
44005 + struct tfm_cluster * tc = &clust->tc;
44006 +
44007 + assert("edward-212", PageLocked(page));
44008 +
44009 + if (unlikely(page->mapping != inode->i_mapping))
44010 + return AOP_TRUNCATED_PAGE;
44011 + if (PageUptodate(page))
44012 + goto exit;
44013 + to_page = pbytes(page_index(page), inode);
44014 + if (to_page == 0) {
44015 + zero_user(page, 0, PAGE_CACHE_SIZE);
44016 + SetPageUptodate(page);
44017 + goto exit;
44018 + }
44019 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
44020 + clust->index = pg_to_clust(page->index, inode);
44021 +
44022 + /* this will unlock/lock the page */
44023 + ret = ctail_read_disk_cluster(clust, inode, page, mode);
44024 +
44025 + assert("edward-212", PageLocked(page));
44026 + if (ret)
44027 + return ret;
44028 +
44029 + /* refresh bytes */
44030 + to_page = pbytes(page_index(page), inode);
44031 + if (to_page == 0) {
44032 + zero_user(page, 0, PAGE_CACHE_SIZE);
44033 + SetPageUptodate(page);
44034 + goto exit;
44035 + }
44036 + }
44037 + if (PageUptodate(page))
44038 + /* somebody else has filled it already */
44039 + goto exit;
44040 +
44041 + assert("edward-119", tfm_cluster_is_uptodate(tc));
44042 + assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
44043 +
44044 + switch (clust->dstat) {
44045 + case UNPR_DISK_CLUSTER:
44046 + BUG_ON(1);
44047 + case TRNC_DISK_CLUSTER:
44048 + /*
44049 + * Race with truncate!
44050 + * We resolve it in favour of the last one (the only way,
44051 + * as in this case plain text is unrecoverable)
44052 + */
44053 + case FAKE_DISK_CLUSTER:
44054 + /* fill the page by zeroes */
44055 + zero_user(page, 0, PAGE_CACHE_SIZE);
44056 + SetPageUptodate(page);
44057 + break;
44058 + case PREP_DISK_CLUSTER:
44059 + /* fill page by transformed stream with plain text */
44060 + assert("edward-1058", !PageUptodate(page));
44061 + assert("edward-120", tc->len <= inode_cluster_size(inode));
44062 +
44063 + /* page index in this logical cluster */
44064 + cloff = pg_to_off_to_cloff(page->index, inode);
44065 +
44066 + data = kmap(page);
44067 + memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
44068 + memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page);
44069 + flush_dcache_page(page);
44070 + kunmap(page);
44071 + SetPageUptodate(page);
44072 + break;
44073 + default:
44074 + impossible("edward-1169", "bad disk cluster state");
44075 + }
44076 + exit:
44077 + return 0;
44078 +}
44079 +
44080 +/* plugin->u.item.s.file.readpage */
44081 +int readpage_ctail(void *vp, struct page *page)
44082 +{
44083 + int result;
44084 + hint_t * hint;
44085 + struct cluster_handle * clust = vp;
44086 +
44087 + assert("edward-114", clust != NULL);
44088 + assert("edward-115", PageLocked(page));
44089 + assert("edward-116", !PageUptodate(page));
44090 + assert("edward-118", page->mapping && page->mapping->host);
44091 + assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
44092 +
44093 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
44094 + if (hint == NULL) {
44095 + unlock_page(page);
44096 + return RETERR(-ENOMEM);
44097 + }
44098 + clust->hint = hint;
44099 + result = load_file_hint(clust->file, hint);
44100 + if (result) {
44101 + kfree(hint);
44102 + unlock_page(page);
44103 + return result;
44104 + }
44105 + assert("vs-25", hint->ext_coord.lh == &hint->lh);
44106 +
44107 + result = do_readpage_ctail(page->mapping->host, clust, page,
44108 + ZNODE_READ_LOCK);
44109 + assert("edward-213", PageLocked(page));
44110 + assert("edward-1163", ergo(!result, PageUptodate(page)));
44111 +
44112 + unlock_page(page);
44113 + done_lh(&hint->lh);
44114 + hint->ext_coord.valid = 0;
44115 + save_file_hint(clust->file, hint);
44116 + kfree(hint);
44117 + tfm_cluster_clr_uptodate(&clust->tc);
44118 +
44119 + return result;
44120 +}
44121 +
44122 +/* Helper function for ->readpages() */
44123 +static int ctail_read_page_cluster(struct cluster_handle * clust,
44124 + struct inode *inode)
44125 +{
44126 + int i;
44127 + int result;
44128 + assert("edward-779", clust != NULL);
44129 + assert("edward-1059", clust->win == NULL);
44130 + assert("edward-780", inode != NULL);
44131 +
44132 + result = prepare_page_cluster(inode, clust, READ_OP);
44133 + if (result)
44134 + return result;
44135 +
44136 + assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
44137 +
44138 + for (i = 0; i < clust->nr_pages; i++) {
44139 + struct page *page = clust->pages[i];
44140 + lock_page(page);
44141 + result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
44142 + unlock_page(page);
44143 + if (result)
44144 + break;
44145 + }
44146 + tfm_cluster_clr_uptodate(&clust->tc);
44147 + put_page_cluster(clust, inode, READ_OP);
44148 + return result;
44149 +}
44150 +
44151 +/* filler for read_cache_pages() */
44152 +static int ctail_readpages_filler(void * data, struct page * page)
44153 +{
44154 + int ret = 0;
44155 + struct cluster_handle * clust = data;
44156 + struct inode * inode = clust->file->f_dentry->d_inode;
44157 +
44158 + assert("edward-1525", page->mapping == inode->i_mapping);
44159 +
44160 + if (PageUptodate(page)) {
44161 + unlock_page(page);
44162 + return 0;
44163 + }
44164 + if (pbytes(page_index(page), inode) == 0) {
44165 + zero_user(page, 0, PAGE_CACHE_SIZE);
44166 + SetPageUptodate(page);
44167 + unlock_page(page);
44168 + return 0;
44169 + }
44170 + move_cluster_forward(clust, inode, page->index);
44171 + unlock_page(page);
44172 + /*
44173 + * read the whole page cluster
44174 + */
44175 + ret = ctail_read_page_cluster(clust, inode);
44176 +
44177 + assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
44178 + return ret;
44179 +}
44180 +
44181 +/*
44182 + * We populate a bit more than the upper readahead suggests:
44183 + * with each nominated page we read the whole page cluster
44184 + * this page belongs to.
44185 + */
44186 +int readpages_ctail(struct file *file, struct address_space *mapping,
44187 + struct list_head *pages)
44188 +{
44189 + int ret = 0;
44190 + hint_t *hint;
44191 + struct cluster_handle clust;
44192 + struct inode *inode = mapping->host;
44193 +
44194 + assert("edward-1521", inode == file->f_dentry->d_inode);
44195 +
44196 + cluster_init_read(&clust, NULL);
44197 + clust.file = file;
44198 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
44199 + if (hint == NULL) {
44200 + warning("vs-28", "failed to allocate hint");
44201 + ret = RETERR(-ENOMEM);
44202 + goto exit1;
44203 + }
44204 + clust.hint = hint;
44205 + ret = load_file_hint(clust.file, hint);
44206 + if (ret) {
44207 + warning("edward-1522", "failed to load hint");
44208 + goto exit2;
44209 + }
44210 + assert("vs-26", hint->ext_coord.lh == &hint->lh);
44211 + ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
44212 + if (ret) {
44213 + warning("edward-1523", "failed to alloc pgset");
44214 + goto exit3;
44215 + }
44216 + ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
44217 +
44218 + assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
44219 + exit3:
44220 + done_lh(&hint->lh);
44221 + save_file_hint(file, hint);
44222 + hint->ext_coord.valid = 0;
44223 + exit2:
44224 + kfree(hint);
44225 + exit1:
44226 + put_cluster_handle(&clust);
44227 + return ret;
44228 +}
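/*
 * The filler above promotes each nominated page to its whole page cluster.
 * With clusters of 2^k pages the page-to-cluster mapping is plain shift
 * arithmetic; a standalone sketch under that assumption (the values are
 * illustrative):
 */

    #include <stdio.h>

    int main(void)
    {
            unsigned shift = 2;                        /* 4 pages per cluster */
            unsigned long page_index = 13;
            unsigned long clust = page_index >> shift; /* cluster 3 */
            unsigned long first = clust << shift;      /* pages 12..15 */

            printf("page %lu -> cluster %lu (pages %lu..%lu)\n",
                   page_index, clust, first, first + (1ul << shift) - 1);
            return 0;
    }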
44229 +
44230 +/*
44231 + plugin->u.item.s.file.append_key
44232 + key of the first item of the next disk cluster
44233 +*/
44234 +reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
44235 +{
44236 + assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
44237 + assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
44238 +
44239 + item_key_by_coord(coord, key);
44240 + set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
44241 + << cluster_shift_by_coord(coord));
44242 + return key;
44243 +}
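/*
 * The append key above is plain shift arithmetic: the byte offset just past
 * the current disk cluster is (cluster index + 1) << cluster_shift. A worked
 * sketch with illustrative values (not the reiser4 key API):
 */

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            unsigned cluster_shift = 16;    /* 64KiB disk clusters */
            uint64_t clust = 5;             /* current cluster index */
            uint64_t next_off = (clust + 1) << cluster_shift;

            /* first byte not covered by cluster 5: 6 * 65536 = 393216 */
            printf("append key offset: %llu\n", (unsigned long long)next_off);
            return 0;
    }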
44244 +
44245 +static int insert_unprepped_ctail(struct cluster_handle * clust,
44246 + struct inode *inode)
44247 +{
44248 + int result;
44249 + char buf[UCTAIL_NR_UNITS];
44250 + reiser4_item_data data;
44251 + reiser4_key key;
44252 + int shift = (int)UCTAIL_SHIFT;
44253 +
44254 + memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
44255 + result = key_by_inode_cryptcompress(inode,
44256 + clust_to_off(clust->index, inode),
44257 + &key);
44258 + if (result)
44259 + return result;
44260 + data.user = 0;
44261 + data.iplug = item_plugin_by_id(CTAIL_ID);
44262 + data.arg = &shift;
44263 + data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
44264 + data.data = buf;
44265 +
44266 + result = insert_by_coord(&clust->hint->ext_coord.coord,
44267 + &data, &key, clust->hint->ext_coord.lh, 0);
44268 + return result;
44269 +}
44270 +
44271 +static int
44272 +insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
44273 + int cluster_shift)
44274 +{
44275 + int result;
44276 + carry_pool *pool;
44277 + carry_level *lowest_level;
44278 + reiser4_item_data *data;
44279 + carry_op *op;
44280 +
44281 + pool =
44282 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
44283 + sizeof(*data));
44284 + if (IS_ERR(pool))
44285 + return PTR_ERR(pool);
44286 + lowest_level = (carry_level *) (pool + 1);
44287 + init_carry_level(lowest_level, pool);
44288 + data = (reiser4_item_data *) (lowest_level + 3);
44289 +
44290 + assert("edward-466", coord->between == AFTER_ITEM
44291 + || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
44292 + || coord->between == EMPTY_NODE
44293 + || coord->between == BEFORE_UNIT);
44294 +
44295 + if (coord->between == AFTER_UNIT) {
44296 + coord->unit_pos = 0;
44297 + coord->between = AFTER_ITEM;
44298 + }
44299 + op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
44300 + 0 /* operate directly on coord -> node */);
44301 + if (IS_ERR(op) || (op == NULL)) {
44302 + done_carry_pool(pool);
44303 + return RETERR(op ? PTR_ERR(op) : -EIO);
44304 + }
44305 + data->user = 0;
44306 + data->iplug = item_plugin_by_id(CTAIL_ID);
44307 + data->arg = &cluster_shift;
44308 +
44309 + data->length = 0;
44310 + data->data = NULL;
44311 +
44312 + op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
44313 + op->u.insert_flow.insert_point = coord;
44314 + op->u.insert_flow.flow = f;
44315 + op->u.insert_flow.data = data;
44316 + op->u.insert_flow.new_nodes = 0;
44317 +
44318 + lowest_level->track_type = CARRY_TRACK_CHANGE;
44319 + lowest_level->tracked = lh;
44320 +
44321 + result = reiser4_carry(lowest_level, NULL);
44322 + done_carry_pool(pool);
44323 +
44324 + return result;
44325 +}
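/*
 * Note the allocation pattern above: the carry pool, three carry levels and
 * the item data are carved out of a single allocation by pointer arithmetic.
 * A minimal userspace sketch of the same trick (the struct names are
 * placeholders, not reiser4 types):
 */

    #include <stdlib.h>

    struct pool { long p; };
    struct level { long l; };
    struct item_data { long d; };

    int main(void)
    {
            /* one malloc for pool + 3 levels + item data */
            struct pool *pool = malloc(sizeof(*pool) +
                                       3 * sizeof(struct level) +
                                       sizeof(struct item_data));
            struct level *lowest;
            struct item_data *data;

            if (!pool)
                    return 1;
            lowest = (struct level *)(pool + 1);     /* right after the pool */
            data = (struct item_data *)(lowest + 3); /* after the 3 levels */
            data->d = 0;   /* all three live in, and are freed with, one block */
            free(pool);
            return 0;
    }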
44326 +
44327 +/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
44328 +static int insert_cryptcompress_flow_in_place(coord_t * coord,
44329 + lock_handle * lh, flow_t * f,
44330 + int cluster_shift)
44331 +{
44332 + int ret;
44333 + coord_t pos;
44334 + lock_handle lock;
44335 +
44336 + assert("edward-484",
44337 + coord->between == AT_UNIT || coord->between == AFTER_ITEM);
44338 + assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
44339 +
44340 + coord_dup(&pos, coord);
44341 + pos.unit_pos = 0;
44342 + pos.between = AFTER_ITEM;
44343 +
44344 + init_lh(&lock);
44345 + copy_lh(&lock, lh);
44346 +
44347 + ret = insert_cryptcompress_flow(&pos, &lock, f, cluster_shift);
44348 + done_lh(&lock);
44349 + assert("edward-1347", znode_is_write_locked(lh->node));
44350 + assert("edward-1228", !ret);
44351 + return ret;
44352 +}
44353 +
44354 +/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
44355 +static int overwrite_ctail(coord_t * coord, flow_t * f)
44356 +{
44357 + unsigned count;
44358 +
44359 + assert("edward-269", f->user == 0);
44360 + assert("edward-270", f->data != NULL);
44361 + assert("edward-271", f->length > 0);
44362 + assert("edward-272", coord_is_existing_unit(coord));
44363 + assert("edward-273", coord->unit_pos == 0);
44364 + assert("edward-274", znode_is_write_locked(coord->node));
44365 + assert("edward-275", reiser4_schedulable());
44366 + assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
44367 + assert("edward-1243", ctail_ok(coord));
44368 +
44369 + count = nr_units_ctail(coord);
44370 +
44371 + if (count > f->length)
44372 + count = f->length;
44373 + memcpy(first_unit(coord), f->data, count);
44374 + move_flow_forward(f, count);
44375 + coord->unit_pos += count;
44376 + return 0;
44377 +}
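/*
 * overwrite_ctail() consumes the flow in item-sized bites: copy
 * min(units in the item, bytes left in the flow), then advance the flow.
 * The same consumption step as a standalone sketch (the flow struct is a
 * stand-in for reiser4's flow_t):
 */

    #include <string.h>

    struct sketch_flow {
            const char *data;   /* unconsumed bytes */
            size_t length;      /* how many are left */
    };

    static void overwrite_one(char *item, size_t item_units,
                              struct sketch_flow *f)
    {
            size_t count = item_units < f->length ? item_units : f->length;

            memcpy(item, f->data, count);
            f->data += count;   /* move_flow_forward() equivalent */
            f->length -= count;
    }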
44378 +
44379 +/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
44380 + cut ctail (part or whole) starting from next unit position */
44381 +static int cut_ctail(coord_t * coord)
44382 +{
44383 + coord_t stop;
44384 +
44385 + assert("edward-435", coord->between == AT_UNIT &&
44386 + coord->item_pos < coord_num_items(coord) &&
44387 + coord->unit_pos <= coord_num_units(coord));
44388 +
44389 + if (coord->unit_pos == coord_num_units(coord))
44390 + /* nothing to cut */
44391 + return 0;
44392 + coord_dup(&stop, coord);
44393 + stop.unit_pos = coord_last_unit_pos(coord);
44394 +
44395 + return cut_node_content(coord, &stop, NULL, NULL, NULL);
44396 +}
44397 +
44398 +int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
44399 + struct inode * inode)
44400 +{
44401 + int result;
44402 + assert("edward-1244", inode != NULL);
44403 + assert("edward-1245", clust->hint != NULL);
44404 + assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
44405 + assert("edward-1247", clust->reserved == 1);
44406 +
44407 + result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
44408 + if (cbk_errored(result))
44409 + return result;
44410 + assert("edward-1249", result == CBK_COORD_NOTFOUND);
44411 + assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
44412 +
44413 + assert("edward-1295",
44414 + clust->hint->ext_coord.lh->node ==
44415 + clust->hint->ext_coord.coord.node);
44416 +
44417 + coord_set_between_clusters(&clust->hint->ext_coord.coord);
44418 +
44419 + result = insert_unprepped_ctail(clust, inode);
44420 + all_grabbed2free();
44421 +
44422 + assert("edward-1251", !result);
44423 + assert("edward-1252", cryptcompress_inode_ok(inode));
44424 + assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
44425 + assert("edward-1254",
44426 + reiser4_clustered_blocks(reiser4_get_current_sb()));
44427 + assert("edward-1255",
44428 + znode_convertible(clust->hint->ext_coord.coord.node));
44429 +
44430 + return result;
44431 +}
44432 +
44433 +static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
44434 +{
44435 + int result = 0;
44436 + struct convert_item_info * info;
44437 +
44438 + assert("edward-468", pos != NULL);
44439 + assert("edward-469", pos->sq != NULL);
44440 + assert("edward-845", item_convert_data(pos) != NULL);
44441 +
44442 + info = item_convert_data(pos);
44443 + assert("edward-679", info->flow.data != NULL);
44444 +
44445 + switch (mode) {
44446 + case CRC_APPEND_ITEM:
44447 + assert("edward-1229", info->flow.length != 0);
44448 + assert("edward-1256",
44449 + cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
44450 + result =
44451 + insert_cryptcompress_flow_in_place(&pos->coord,
44452 + &pos->lock,
44453 + &info->flow,
44454 + info->cluster_shift);
44455 + break;
44456 + case CRC_OVERWRITE_ITEM:
44457 + assert("edward-1230", info->flow.length != 0);
44458 + overwrite_ctail(&pos->coord, &info->flow);
44459 + if (info->flow.length != 0)
44460 + break;
44461 + case CRC_CUT_ITEM:
44462 + assert("edward-1231", info->flow.length == 0);
44463 + result = cut_ctail(&pos->coord);
44464 + break;
44465 + default:
44466 + result = RETERR(-EIO);
44467 + impossible("edward-244", "bad convert mode");
44468 + }
44469 + return result;
44470 +}
44471 +
44472 +/* plugin->u.item.f.scan */
44473 +int scan_ctail(flush_scan * scan)
44474 +{
44475 + int result = 0;
44476 + struct page *page;
44477 + struct inode *inode;
44478 + jnode *node = scan->node;
44479 +
44480 + assert("edward-227", scan->node != NULL);
44481 + assert("edward-228", jnode_is_cluster_page(scan->node));
44482 + assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
44483 +
44484 + page = jnode_page(node);
44485 + inode = page->mapping->host;
44486 +
44487 + if (!reiser4_scanning_left(scan))
44488 + return result;
44489 + if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
44490 + znode_make_dirty(scan->parent_lock.node);
44491 +
44492 + if (!znode_convertible(scan->parent_lock.node)) {
44493 + if (JF_ISSET(scan->node, JNODE_DIRTY))
44494 + znode_set_convertible(scan->parent_lock.node);
44495 + else {
44496 + warning("edward-681",
44497 + "cluster page is already processed");
44498 + return -EAGAIN;
44499 + }
44500 + }
44501 + return result;
44502 +}
44503 +
44504 +/* Returns true if convert data should be attached; pos->child is then set */
44505 +static int should_attach_convert_idata(flush_pos_t * pos)
44506 +{
44507 + int result;
44508 + assert("edward-431", pos != NULL);
44509 + assert("edward-432", pos->child == NULL);
44510 + assert("edward-619", znode_is_write_locked(pos->coord.node));
44511 + assert("edward-470",
44512 + item_plugin_by_coord(&pos->coord) ==
44513 + item_plugin_by_id(CTAIL_ID));
44514 +
44515 + /* check for leftmost child */
44516 + utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
44517 +
44518 + if (!pos->child)
44519 + return 0;
44520 + spin_lock_jnode(pos->child);
44521 + result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
44522 + pos->child->atom == ZJNODE(pos->coord.node)->atom);
44523 + spin_unlock_jnode(pos->child);
44524 + if (!result && pos->child) {
44525 +		/* the existing child is not to be attached; release it */
44526 + jput(pos->child);
44527 + pos->child = NULL;
44528 + }
44529 + return result;
44530 +}
44531 +
44532 +/**
44533 + * Collect all needed information about the object here,
44534 + * as the in-memory inode can be evicted from memory
44535 + * before the disk update completes.
44536 + */
44537 +static int init_convert_data_ctail(struct convert_item_info * idata,
44538 + struct inode *inode)
44539 +{
44540 + assert("edward-813", idata != NULL);
44541 + assert("edward-814", inode != NULL);
44542 +
44543 + idata->cluster_shift = inode_cluster_shift(inode);
44544 + idata->d_cur = DC_FIRST_ITEM;
44545 + idata->d_next = DC_INVALID_STATE;
44546 +
44547 + return 0;
44548 +}
44549 +
44550 +static int alloc_item_convert_data(struct convert_info * sq)
44551 +{
44552 + assert("edward-816", sq != NULL);
44553 + assert("edward-817", sq->itm == NULL);
44554 +
44555 + sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
44556 + if (sq->itm == NULL)
44557 + return RETERR(-ENOMEM);
44558 + return 0;
44559 +}
44560 +
44561 +static void free_item_convert_data(struct convert_info * sq)
44562 +{
44563 + assert("edward-818", sq != NULL);
44564 + assert("edward-819", sq->itm != NULL);
44565 + assert("edward-820", sq->iplug != NULL);
44566 +
44567 + kfree(sq->itm);
44568 + sq->itm = NULL;
44569 + return;
44570 +}
44571 +
44572 +static int alloc_convert_data(flush_pos_t * pos)
44573 +{
44574 + assert("edward-821", pos != NULL);
44575 + assert("edward-822", pos->sq == NULL);
44576 +
44577 + pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
44578 + if (!pos->sq)
44579 + return RETERR(-ENOMEM);
44580 + memset(pos->sq, 0, sizeof(*pos->sq));
44581 + cluster_init_write(&pos->sq->clust, NULL);
44582 + return 0;
44583 +}
44584 +
44585 +void free_convert_data(flush_pos_t * pos)
44586 +{
44587 + struct convert_info *sq;
44588 +
44589 + assert("edward-823", pos != NULL);
44590 + assert("edward-824", pos->sq != NULL);
44591 +
44592 + sq = pos->sq;
44593 + if (sq->itm)
44594 + free_item_convert_data(sq);
44595 + put_cluster_handle(&sq->clust);
44596 + kfree(pos->sq);
44597 + pos->sq = NULL;
44598 + return;
44599 +}
44600 +
44601 +static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
44602 +{
44603 + struct convert_info *sq;
44604 +
44605 + assert("edward-825", pos != NULL);
44606 + assert("edward-826", pos->sq != NULL);
44607 + assert("edward-827", item_convert_data(pos) != NULL);
44608 + assert("edward-828", inode != NULL);
44609 +
44610 + sq = pos->sq;
44611 +
44612 + memset(sq->itm, 0, sizeof(*sq->itm));
44613 +
44614 + /* iplug->init_convert_data() */
44615 + return init_convert_data_ctail(sq->itm, inode);
44616 +}
44617 +
44618 +/* create and attach disk cluster info used by the 'convert' phase of
44619 +   flush squalloc() */
44620 +static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
44621 +{
44622 + int ret = 0;
44623 + struct convert_item_info *info;
44624 + struct cluster_handle *clust;
44625 + file_plugin *fplug = inode_file_plugin(inode);
44626 + compression_plugin *cplug = inode_compression_plugin(inode);
44627 +
44628 + assert("edward-248", pos != NULL);
44629 + assert("edward-249", pos->child != NULL);
44630 + assert("edward-251", inode != NULL);
44631 + assert("edward-682", cryptcompress_inode_ok(inode));
44632 + assert("edward-252",
44633 + fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
44634 + assert("edward-473",
44635 + item_plugin_by_coord(&pos->coord) ==
44636 + item_plugin_by_id(CTAIL_ID));
44637 +
44638 + if (!pos->sq) {
44639 + ret = alloc_convert_data(pos);
44640 + if (ret)
44641 + return ret;
44642 + }
44643 + clust = &pos->sq->clust;
44644 + ret = grab_coa(&clust->tc, cplug);
44645 + if (ret)
44646 + goto err;
44647 + ret = set_cluster_by_page(clust,
44648 + jnode_page(pos->child),
44649 + MAX_CLUSTER_NRPAGES);
44650 + if (ret)
44651 + goto err;
44652 +
44653 + assert("edward-829", pos->sq != NULL);
44654 + assert("edward-250", item_convert_data(pos) == NULL);
44655 +
44656 + pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
44657 +
44658 + ret = alloc_item_convert_data(pos->sq);
44659 + if (ret)
44660 + goto err;
44661 + ret = init_item_convert_data(pos, inode);
44662 + if (ret)
44663 + goto err;
44664 + info = item_convert_data(pos);
44665 +
44666 + ret = checkout_logical_cluster(clust, pos->child, inode);
44667 + if (ret)
44668 + goto err;
44669 +
44670 + reiser4_deflate_cluster(clust, inode);
44671 + inc_item_convert_count(pos);
44672 +
44673 + /* prepare flow for insertion */
44674 + fplug->flow_by_inode(inode,
44675 + (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
44676 + 0 /* kernel space */ ,
44677 + clust->tc.len,
44678 + clust_to_off(clust->index, inode),
44679 + WRITE_OP, &info->flow);
44680 + jput(pos->child);
44681 + return 0;
44682 + err:
44683 + jput(pos->child);
44684 + free_convert_data(pos);
44685 + return ret;
44686 +}
44687 +
44688 +/* clean up disk cluster info */
44689 +static void detach_convert_idata(struct convert_info * sq)
44690 +{
44691 + struct convert_item_info *info;
44692 +
44693 + assert("edward-253", sq != NULL);
44694 + assert("edward-840", sq->itm != NULL);
44695 +
44696 + info = sq->itm;
44697 + assert("edward-1212", info->flow.length == 0);
44698 +
44699 + free_item_convert_data(sq);
44700 + return;
44701 +}
44702 +
44703 +/* plugin->u.item.f.utmost_child */
44704 +
44705 +/* This function sets the leftmost child of the first cluster item,
44706 +   if such a child exists, and NULL otherwise.
44707 + NOTE-EDWARD: Do not call this for RIGHT_SIDE */
44708 +
44709 +int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
44710 +{
44711 + reiser4_key key;
44712 +
44713 + item_key_by_coord(coord, &key);
44714 +
44715 + assert("edward-257", coord != NULL);
44716 + assert("edward-258", child != NULL);
44717 + assert("edward-259", side == LEFT_SIDE);
44718 + assert("edward-260",
44719 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
44720 +
44721 + if (!is_disk_cluster_key(&key, coord))
44722 + *child = NULL;
44723 + else
44724 + *child = jlookup(current_tree,
44725 + get_key_objectid(item_key_by_coord
44726 + (coord, &key)),
44727 + off_to_pg(get_key_offset(&key)));
44728 + return 0;
44729 +}
44730 +
44731 +/* Returns true if @p2 is the next item after @p1
44732 +   in the _same_ disk cluster.
44733 +   A disk cluster is a set of items. If ->clustered() != NULL,
44734 +   the whole disk cluster should be read/modified with each item.
44735 +*/
44736 +
44737 +/* Go rightward and check for the next disk cluster item; set
44738 + * d_next to DC_CHAINED_ITEM if such an item exists.
44739 + * If the current position is the last item, go to the right
44740 + * neighbor. Skip empty nodes. Note that a right neighbor may
44741 + * not be in the slum because of races; if so, make it dirty
44742 + * and convertible.
44743 + */
44744 +static int next_item_dc_stat(flush_pos_t * pos)
44745 +{
44746 + int ret = 0;
44747 + int stop = 0;
44748 + znode *cur;
44749 + coord_t coord;
44750 + lock_handle lh;
44751 + lock_handle right_lock;
44752 +
44753 + assert("edward-1232", !node_is_empty(pos->coord.node));
44754 + assert("edward-1014",
44755 + pos->coord.item_pos < coord_num_items(&pos->coord));
44756 + assert("edward-1015", chaining_data_present(pos));
44757 + assert("edward-1017",
44758 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
44759 +
44760 + item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
44761 +
44762 + if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
44763 + return ret;
44764 + if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
44765 + return ret;
44766 +
44767 +	/* Check the next slum item.
44768 +	 * Note that it cannot be killed by a concurrent truncate,
44769 +	 * as truncate would need the lock held by us.
44770 +	 */
44771 + init_lh(&right_lock);
44772 + cur = pos->coord.node;
44773 +
44774 + while (!stop) {
44775 + init_lh(&lh);
44776 + ret = reiser4_get_right_neighbor(&lh,
44777 + cur,
44778 + ZNODE_WRITE_LOCK,
44779 + GN_CAN_USE_UPPER_LEVELS);
44780 + if (ret)
44781 + break;
44782 + ret = zload(lh.node);
44783 + if (ret) {
44784 + done_lh(&lh);
44785 + break;
44786 + }
44787 + coord_init_before_first_item(&coord, lh.node);
44788 +
44789 + if (node_is_empty(lh.node)) {
44790 + znode_make_dirty(lh.node);
44791 + znode_set_convertible(lh.node);
44792 + stop = 0;
44793 + } else if (same_disk_cluster(&pos->coord, &coord)) {
44794 +
44795 + item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
44796 +
44797 + if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
44798 + /*
44799 + warning("edward-1024",
44800 + "next slum item mergeable, "
44801 + "but znode %p isn't dirty\n",
44802 + lh.node);
44803 + */
44804 + znode_make_dirty(lh.node);
44805 + }
44806 + if (!znode_convertible(lh.node)) {
44807 + /*
44808 + warning("edward-1272",
44809 + "next slum item mergeable, "
44810 + "but znode %p isn't convertible\n",
44811 + lh.node);
44812 + */
44813 + znode_set_convertible(lh.node);
44814 + }
44815 + stop = 1;
44816 + } else
44817 + stop = 1;
44818 + zrelse(lh.node);
44819 + done_lh(&right_lock);
44820 + copy_lh(&right_lock, &lh);
44821 + done_lh(&lh);
44822 + cur = right_lock.node;
44823 + }
44824 + done_lh(&right_lock);
44825 +
44826 + if (ret == -E_NO_NEIGHBOR)
44827 + ret = 0;
44828 + return ret;
44829 +}
44830 +
44831 +static int
44832 +assign_convert_mode(struct convert_item_info * idata,
44833 + cryptcompress_write_mode_t * mode)
44834 +{
44835 + int result = 0;
44836 +
44837 + assert("edward-1025", idata != NULL);
44838 +
44839 + if (idata->flow.length) {
44840 + /* append or overwrite */
44841 + switch (idata->d_cur) {
44842 + case DC_FIRST_ITEM:
44843 + case DC_CHAINED_ITEM:
44844 + *mode = CRC_OVERWRITE_ITEM;
44845 + break;
44846 + case DC_AFTER_CLUSTER:
44847 + *mode = CRC_APPEND_ITEM;
44848 + break;
44849 + default:
44850 + impossible("edward-1018", "wrong current item state");
44851 + }
44852 + } else {
44853 + /* cut or invalidate */
44854 + switch (idata->d_cur) {
44855 + case DC_FIRST_ITEM:
44856 + case DC_CHAINED_ITEM:
44857 + *mode = CRC_CUT_ITEM;
44858 + break;
44859 + case DC_AFTER_CLUSTER:
44860 + result = 1;
44861 + break;
44862 + default:
44863 + impossible("edward-1019", "wrong current item state");
44864 + }
44865 + }
44866 + return result;
44867 +}
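/*
 * The mode assignment above is a small decision table over (bytes left in
 * the flow, state of the current item). Restated as a standalone sketch
 * (the enum names are illustrative):
 */

    enum sketch_state { FIRST, CHAINED, AFTER };
    enum sketch_mode  { OVERWRITE, APPEND, CUT, DONE };

    static enum sketch_mode pick_mode(unsigned long flow_len,
                                      enum sketch_state s)
    {
            if (flow_len)   /* data left: overwrite existing items, */
                    return s == AFTER ? APPEND : OVERWRITE; /* append past them */
            return s == AFTER ? DONE : CUT; /* no data left: cut leftovers */
    }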
44868 +
44869 +/* plugin->u.item.f.convert */
44870 +/* write ctail in guessed mode */
44871 +int convert_ctail(flush_pos_t * pos)
44872 +{
44873 + int result;
44874 + int nr_items;
44875 + cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
44876 +
44877 + assert("edward-1020", pos != NULL);
44878 + assert("edward-1213", coord_num_items(&pos->coord) != 0);
44879 + assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
44880 + assert("edward-1258", ctail_ok(&pos->coord));
44881 + assert("edward-261", pos->coord.node != NULL);
44882 +
44883 + nr_items = coord_num_items(&pos->coord);
44884 + if (!chaining_data_present(pos)) {
44885 + if (should_attach_convert_idata(pos)) {
44886 + /* attach convert item info */
44887 + struct inode *inode;
44888 +
44889 + assert("edward-264", pos->child != NULL);
44890 + assert("edward-265", jnode_page(pos->child) != NULL);
44891 + assert("edward-266",
44892 + jnode_page(pos->child)->mapping != NULL);
44893 +
44894 + inode = jnode_page(pos->child)->mapping->host;
44895 +
44896 + assert("edward-267", inode != NULL);
44897 +
44898 +			/* attach item convert info via the child, then put the child */
44899 + result = attach_convert_idata(pos, inode);
44900 + pos->child = NULL;
44901 + if (result == -E_REPEAT) {
44902 +				/* jnode became clean, or there are no dirty
44903 +				   pages (nothing to update in the disk cluster) */
44904 + warning("edward-1021",
44905 + "convert_ctail: nothing to attach");
44906 + return 0;
44907 + }
44908 + if (result != 0)
44909 + return result;
44910 + } else
44911 + /* unconvertible */
44912 + return 0;
44913 + } else {
44914 + /* use old convert info */
44915 +
44916 + struct convert_item_info *idata;
44917 +
44918 + idata = item_convert_data(pos);
44919 +
44920 + result = assign_convert_mode(idata, &mode);
44921 + if (result) {
44922 + /* disk cluster is over,
44923 + nothing to update anymore */
44924 + detach_convert_idata(pos->sq);
44925 + return 0;
44926 + }
44927 + }
44928 +
44929 + assert("edward-433", chaining_data_present(pos));
44930 + assert("edward-1022",
44931 + pos->coord.item_pos < coord_num_items(&pos->coord));
44932 +
44933 + /* check if next item is of current disk cluster */
44934 + result = next_item_dc_stat(pos);
44935 + if (result) {
44936 + detach_convert_idata(pos->sq);
44937 + return result;
44938 + }
44939 + result = do_convert_ctail(pos, mode);
44940 + if (result) {
44941 + detach_convert_idata(pos->sq);
44942 + return result;
44943 + }
44944 + switch (mode) {
44945 + case CRC_CUT_ITEM:
44946 + assert("edward-1214", item_convert_data(pos)->flow.length == 0);
44947 + assert("edward-1215",
44948 + coord_num_items(&pos->coord) == nr_items ||
44949 + coord_num_items(&pos->coord) == nr_items - 1);
44950 + if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
44951 + break;
44952 + if (coord_num_items(&pos->coord) != nr_items) {
44953 + /* the item was killed, no more chained items */
44954 + detach_convert_idata(pos->sq);
44955 + if (!node_is_empty(pos->coord.node))
44956 + /* make sure the next item will be scanned */
44957 + coord_init_before_item(&pos->coord);
44958 + break;
44959 + }
44960 + case CRC_APPEND_ITEM:
44961 + assert("edward-434", item_convert_data(pos)->flow.length == 0);
44962 + detach_convert_idata(pos->sq);
44963 + break;
44964 + case CRC_OVERWRITE_ITEM:
44965 + if (coord_is_unprepped_ctail(&pos->coord)) {
44966 +			/* convert unprepped ctail to prepped one */
44967 + assert("edward-1259",
44968 + cluster_shift_ok(item_convert_data(pos)->
44969 + cluster_shift));
44970 + put_unaligned((d8)item_convert_data(pos)->cluster_shift,
44971 + &ctail_formatted_at(&pos->coord)->
44972 + cluster_shift);
44973 + }
44974 + break;
44975 + }
44976 + return result;
44977 +}
44978 +
44979 +/* Make Linus happy.
44980 + Local variables:
44981 + c-indentation-style: "K&R"
44982 + mode-name: "LC"
44983 + c-basic-offset: 8
44984 + tab-width: 8
44985 + fill-column: 120
44986 + End:
44987 +*/
44988 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.33/fs/reiser4/plugin/item/ctail.h
44989 --- linux-2.6.33.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 01:00:00.000000000 +0100
44990 +++ linux-2.6.33/fs/reiser4/plugin/item/ctail.h 2010-03-04 19:33:22.000000000 +0100
44991 @@ -0,0 +1,102 @@
44992 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44993 +
44994 +/* Ctail items are fragments (or bodies) of a special type that provide
44995 +   optimal storage of encrypted and/or compressed files. */
44996 +
44997 +
44998 +#if !defined( __FS_REISER4_CTAIL_H__ )
44999 +#define __FS_REISER4_CTAIL_H__
45000 +
45001 +/* Disk format of ctail item */
45002 +typedef struct ctail_item_format {
45003 + /* packed shift;
45004 + if its value is different from UCTAIL_SHIFT (see below), then
45005 + size of disk cluster is calculated as (1 << cluster_shift) */
45006 + d8 cluster_shift;
45007 + /* ctail body */
45008 + d8 body[0];
45009 +} __attribute__ ((packed)) ctail_item_format;
45010 +
45011 +/* "Unprepped" disk cluster is represented by a single ctail item
45012 + with the following "magic" attributes: */
45013 +/* "magic" cluster_shift */
45014 +#define UCTAIL_SHIFT 0xff
45015 +/* How many units unprepped ctail item has */
45016 +#define UCTAIL_NR_UNITS 1
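/*
 * A sketch of how these constants combine: a prepped item carries a real
 * cluster_shift, from which the disk cluster size follows, while the
 * UCTAIL_SHIFT magic marks an unprepped single-unit item (the shift value
 * below is illustrative):
 */

    #include <stdio.h>

    int main(void)
    {
            unsigned char shift = 16;       /* as read from ctail_item_format */

            if (shift == 0xff)              /* UCTAIL_SHIFT */
                    printf("unprepped: single unit, no real cluster size\n");
            else
                    printf("disk cluster size: %lu bytes\n", 1ul << shift);
            return 0;
    }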
45017 +
45018 +/* The following is a set of various item states in a disk cluster.
45019 +   A disk cluster is a set of items whose keys belong to the interval
45020 + [dc_key , dc_key + disk_cluster_size - 1] */
45021 +typedef enum {
45022 + DC_INVALID_STATE = 0,
45023 + DC_FIRST_ITEM = 1,
45024 + DC_CHAINED_ITEM = 2,
45025 + DC_AFTER_CLUSTER = 3
45026 +} dc_item_stat;
45027 +
45028 +/* ctail-specific extension.
45029 +   In particular this describes parameters of the disk cluster an item belongs to */
45030 +struct ctail_coord_extension {
45031 + int shift; /* this contains cluster_shift extracted from
45032 + ctail_item_format (above), or UCTAIL_SHIFT
45033 + (the last one is the "magic" of unprepped disk clusters)*/
45034 + int dsize; /* size of a prepped disk cluster */
45035 + int ncount; /* count of nodes occupied by a disk cluster */
45036 +};
45037 +
45038 +struct cut_list;
45039 +
45040 +/* plugin->item.b.* */
45041 +int can_contain_key_ctail(const coord_t *, const reiser4_key *,
45042 + const reiser4_item_data *);
45043 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
45044 +pos_in_node_t nr_units_ctail(const coord_t * coord);
45045 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
45046 +void print_ctail(const char *prefix, coord_t * coord);
45047 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
45048 +
45049 +int paste_ctail(coord_t * coord, reiser4_item_data * data,
45050 + carry_plugin_info * info UNUSED_ARG);
45051 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
45052 +int can_shift_ctail(unsigned free_space, coord_t * coord,
45053 + znode * target, shift_direction pend, unsigned *size,
45054 + unsigned want);
45055 +void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
45056 + unsigned count, shift_direction where_is_free_space,
45057 + unsigned free_space);
45058 +int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45059 + carry_cut_data *, reiser4_key * smallest_removed,
45060 + reiser4_key * new_first);
45061 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45062 + carry_kill_data *, reiser4_key * smallest_removed,
45063 + reiser4_key * new_first);
45064 +int ctail_ok(const coord_t * coord);
45065 +int check_ctail(const coord_t * coord, const char **error);
45066 +
45067 +/* plugin->u.item.s.* */
45068 +int read_ctail(struct file *, flow_t *, hint_t *);
45069 +int readpage_ctail(void *, struct page *);
45070 +int readpages_ctail(struct file *, struct address_space *, struct list_head *);
45071 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
45072 +int create_hook_ctail(const coord_t * coord, void *arg);
45073 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
45074 + carry_kill_data *);
45075 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
45076 +
45077 +/* plugin->u.item.f */
45078 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
45079 +int scan_ctail(flush_scan *);
45080 +int convert_ctail(flush_pos_t *);
45081 +size_t inode_scaled_cluster_size(struct inode *);
45082 +
45083 +#endif /* __FS_REISER4_CTAIL_H__ */
45084 +
45085 +/* Make Linus happy.
45086 + Local variables:
45087 + c-indentation-style: "K&R"
45088 + mode-name: "LC"
45089 + c-basic-offset: 8
45090 + tab-width: 8
45091 + fill-column: 120
45092 + End:
45093 +*/
45094 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/extent.c linux-2.6.33/fs/reiser4/plugin/item/extent.c
45095 --- linux-2.6.33.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 01:00:00.000000000 +0100
45096 +++ linux-2.6.33/fs/reiser4/plugin/item/extent.c 2010-03-04 19:33:22.000000000 +0100
45097 @@ -0,0 +1,197 @@
45098 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45099 +
45100 +#include "item.h"
45101 +#include "../../key.h"
45102 +#include "../../super.h"
45103 +#include "../../carry.h"
45104 +#include "../../inode.h"
45105 +#include "../../page_cache.h"
45106 +#include "../../flush.h"
45107 +#include "../object.h"
45108 +
45109 +/* prepare a reiser4_item_data structure; it is used to put extent units into the tree */
45110 +/* Audited by: green(2002.06.13) */
45111 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
45112 + int nr_extents)
45113 +{
45114 + data->data = ext_unit;
45115 + /* data->data is kernel space */
45116 + data->user = 0;
45117 + data->length = sizeof(reiser4_extent) * nr_extents;
45118 + data->arg = NULL;
45119 + data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
45120 + return data;
45121 +}
45122 +
45123 +/* how many bytes are addressed by the first @nr extents of the extent item */
45124 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
45125 +{
45126 + pos_in_node_t i;
45127 + reiser4_block_nr blocks;
45128 + reiser4_extent *ext;
45129 +
45130 + ext = item_body_by_coord(coord);
45131 + assert("vs-263", nr <= nr_units_extent(coord));
45132 +
45133 + blocks = 0;
45134 + for (i = 0; i < nr; i++, ext++) {
45135 + blocks += extent_get_width(ext);
45136 + }
45137 +
45138 + return blocks * current_blocksize;
45139 +}
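/*
 * Worked example of the calculation above: an item holding extents of
 * widths 3 and 5 with a 4096-byte block size addresses (3 + 5) * 4096 =
 * 32768 bytes. As a standalone sketch:
 */

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t widths[] = { 3, 5 };
            uint64_t blocks = 0, blocksize = 4096;
            int i;

            for (i = 0; i < 2; i++)
                    blocks += widths[i];
            printf("%llu bytes\n", (unsigned long long)(blocks * blocksize));
            return 0;
    }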
45140 +
45141 +extent_state state_of_extent(reiser4_extent * ext)
45142 +{
45143 + switch ((int)extent_get_start(ext)) {
45144 + case 0:
45145 + return HOLE_EXTENT;
45146 + case 1:
45147 + return UNALLOCATED_EXTENT;
45148 + default:
45149 + break;
45150 + }
45151 + return ALLOCATED_EXTENT;
45152 +}
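/*
 * The extent state above is encoded entirely in the start block number:
 * 0 means a hole, 1 means unallocated, anything else is an allocated
 * extent starting at that block. The same classification as a standalone
 * sketch:
 */

    enum sketch_extent_state { S_HOLE, S_UNALLOCATED, S_ALLOCATED };

    static enum sketch_extent_state classify(unsigned long long start)
    {
            if (start == 0)
                    return S_HOLE;        /* no blocks; reads back as zeroes */
            if (start == 1)
                    return S_UNALLOCATED; /* dirty data, blocks not yet chosen */
            return S_ALLOCATED;           /* real extent starting at @start */
    }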
45153 +
45154 +int extent_is_unallocated(const coord_t * item)
45155 +{
45156 + assert("jmacd-5133", item_is_extent(item));
45157 +
45158 + return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
45159 +}
45160 +
45161 +/* set extent's start and width */
45162 +void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
45163 + reiser4_block_nr width)
45164 +{
45165 + extent_set_start(ext, start);
45166 + extent_set_width(ext, width);
45167 +}
45168 +
45169 +/**
45170 + * reiser4_replace_extent - replace extent and paste 1 or 2 after it
45171 + * @h: replace handle carrying the coord and lock handle of the
45172 + *     extent unit to be overwritten, the replacement extent
45173 + *     (@h->overwrite), the 1 or 2 new extents to paste after it
45174 + *     (@h->new_extents) and the paste key (@h->paste_key)
45175 + * @return_inserted_position: controls what @h->coord and @h->lh are
45176 + *     set to on return (see below)
45177 + *
45178 + * Overwrites one extent and pastes 1 or 2 more extents after the
45179 + * overwritten one. If @return_inserted_position is 1, @h->coord and
45180 + * @h->lh are returned set to the first of the newly inserted units;
45181 + * if it is 0, they are returned set to the extent which was
45182 + * overwritten.
45183 + */
45184 +int reiser4_replace_extent(struct replace_handle *h,
45185 + int return_inserted_position)
45186 +{
45187 + int result;
45188 + znode *orig_znode;
45189 + /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
45190 +
45191 + assert("vs-990", coord_is_existing_unit(h->coord));
45192 + assert("vs-1375", znode_is_write_locked(h->coord->node));
45193 + assert("vs-1426", extent_get_width(&h->overwrite) != 0);
45194 + assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
45195 + assert("vs-1427", ergo(h->nr_new_extents == 2,
45196 + extent_get_width(&h->new_extents[1]) != 0));
45197 +
45198 + /* compose structure for paste */
45199 + init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
45200 +
45201 + coord_dup(&h->coord_after, h->coord);
45202 + init_lh(&h->lh_after);
45203 + copy_lh(&h->lh_after, h->lh);
45204 + reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
45205 + reiser4_tap_monitor(&h->watch);
45206 +
45207 + ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
45208 + orig_znode = h->coord->node;
45209 +
45210 +#if REISER4_DEBUG
45211 + /* make sure that key is set properly */
45212 + unit_key_by_coord(h->coord, &h->tmp);
45213 + set_key_offset(&h->tmp,
45214 + get_key_offset(&h->tmp) +
45215 + extent_get_width(&h->overwrite) * current_blocksize);
45216 + assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
45217 +#endif
45218 +
45219 + /* set insert point after unit to be replaced */
45220 + h->coord->between = AFTER_UNIT;
45221 +
45222 + result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
45223 + &h->paste_key, &h->item, h->flags);
45224 + if (!result) {
45225 + /* now we have to replace the unit after which new units were
45226 + inserted. Its position is tracked by @watch */
45227 + reiser4_extent *ext;
45228 + znode *node;
45229 +
45230 + node = h->coord_after.node;
45231 + if (node != orig_znode) {
45232 + coord_clear_iplug(&h->coord_after);
45233 + result = zload(node);
45234 + }
45235 +
45236 + if (likely(!result)) {
45237 + ext = extent_by_coord(&h->coord_after);
45238 +
45239 + assert("vs-987", znode_is_loaded(node));
45240 + assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
45241 +
45242 + /* overwrite extent unit */
45243 + memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
45244 + znode_make_dirty(node);
45245 +
45246 + if (node != orig_znode)
45247 + zrelse(node);
45248 +
45249 + if (return_inserted_position == 0) {
45250 + /* coord and lh are to be set to overwritten
45251 + extent */
45252 + assert("vs-1662",
45253 + WITH_DATA(node, !memcmp(&h->overwrite,
45254 + extent_by_coord(
45255 + &h->coord_after),
45256 + sizeof(reiser4_extent))));
45257 +
45258 + *h->coord = h->coord_after;
45259 + done_lh(h->lh);
45260 + copy_lh(h->lh, &h->lh_after);
45261 + } else {
45262 + /* h->coord and h->lh are to be set to first of
45263 + inserted units */
45264 + assert("vs-1663",
45265 + WITH_DATA(h->coord->node,
45266 + !memcmp(&h->new_extents[0],
45267 + extent_by_coord(h->coord),
45268 + sizeof(reiser4_extent))));
45269 + assert("vs-1664", h->lh->node == h->coord->node);
45270 + }
45271 + }
45272 + }
45273 + reiser4_tap_done(&h->watch);
45274 +
45275 + return result;
45276 +}
45277 +
45278 +lock_handle *znode_lh(znode *node)
45279 +{
45280 + assert("vs-1371", znode_is_write_locked(node));
45281 + assert("vs-1372", znode_is_wlocked_once(node));
45282 + return list_entry(node->lock.owners.next, lock_handle, owners_link);
45283 +}
45284 +
45285 +/*
45286 + * Local variables:
45287 + * c-indentation-style: "K&R"
45288 + * mode-name: "LC"
45289 + * c-basic-offset: 8
45290 + * tab-width: 8
45291 + * fill-column: 79
45292 + * scroll-step: 1
45293 + * End:
45294 + */
45295 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.33/fs/reiser4/plugin/item/extent_file_ops.c
45296 --- linux-2.6.33.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 01:00:00.000000000 +0100
45297 +++ linux-2.6.33/fs/reiser4/plugin/item/extent_file_ops.c 2010-03-04 19:33:22.000000000 +0100
45298 @@ -0,0 +1,1453 @@
45299 +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45300 +
45301 +#include "item.h"
45302 +#include "../../inode.h"
45303 +#include "../../page_cache.h"
45304 +#include "../object.h"
45305 +
45306 +#include <linux/quotaops.h>
45307 +#include <linux/swap.h>
45308 +
45309 +static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
45310 +{
45311 + reiser4_extent *ext;
45312 +
45313 + ext = (reiser4_extent *) (zdata(node) + offset);
45314 + return ext;
45315 +}
45316 +
45317 +/**
45318 + * check_uf_coord - verify coord extension
45319 + * @uf_coord: coord extension to verify
45320 + * @key: expected key, or NULL
45321 + *
45322 + * Makes sure that all fields of @uf_coord are set properly. If @key is
45323 + * specified, check whether @uf_coord corresponds to it.
45324 + */
45325 +static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
45326 +{
45327 +#if REISER4_DEBUG
45328 + const coord_t *coord;
45329 + const struct extent_coord_extension *ext_coord;
45330 + reiser4_extent *ext;
45331 +
45332 + coord = &uf_coord->coord;
45333 + ext_coord = &uf_coord->extension.extent;
45334 + ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
45335 +
45336 + assert("",
45337 + WITH_DATA(coord->node,
45338 + (uf_coord->valid == 1 &&
45339 + coord_is_iplug_set(coord) &&
45340 + item_is_extent(coord) &&
45341 + ext_coord->nr_units == nr_units_extent(coord) &&
45342 + ext == extent_by_coord(coord) &&
45343 + ext_coord->width == extent_get_width(ext) &&
45344 + coord->unit_pos < ext_coord->nr_units &&
45345 + ext_coord->pos_in_unit < ext_coord->width &&
45346 + memcmp(ext, &ext_coord->extent,
45347 + sizeof(reiser4_extent)) == 0)));
45348 + if (key) {
45349 + reiser4_key coord_key;
45350 +
45351 + unit_key_by_coord(&uf_coord->coord, &coord_key);
45352 + set_key_offset(&coord_key,
45353 + get_key_offset(&coord_key) +
45354 + (uf_coord->extension.extent.
45355 + pos_in_unit << PAGE_CACHE_SHIFT));
45356 + assert("", keyeq(key, &coord_key));
45357 + }
45358 +#endif
45359 +}
45360 +
45361 +static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
45362 +{
45363 + check_uf_coord(uf_coord, NULL);
45364 +
45365 + return ext_by_offset(uf_coord->coord.node,
45366 + uf_coord->extension.extent.ext_offset);
45367 +}
45368 +
45369 +#if REISER4_DEBUG
45370 +
45371 +/**
45372 + * offset_is_in_unit - check whether an offset falls inside an extent unit
45373 + * @coord: coord of the extent unit
45374 + * @off: file offset to check
45375 + *
45376 + * Returns 1 if offset @off is inside of the extent unit pointed to by
45377 + * @coord, 0 otherwise.
45378 + */
45379 +static int offset_is_in_unit(const coord_t *coord, loff_t off)
45380 +{
45381 + reiser4_key unit_key;
45382 + __u64 unit_off;
45383 + reiser4_extent *ext;
45384 +
45385 + ext = extent_by_coord(coord);
45386 +
45387 + unit_key_extent(coord, &unit_key);
45388 + unit_off = get_key_offset(&unit_key);
45389 + if (off < unit_off)
45390 + return 0;
45391 + if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
45392 + return 0;
45393 + return 1;
45394 +}
45395 +
45396 +static int
45397 +coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
45398 +{
45399 + reiser4_key item_key;
45400 +
45401 + assert("vs-771", coord_is_existing_unit(coord));
45402 + assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
45403 + assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
45404 +
45405 + return offset_is_in_unit(coord, get_key_offset(key));
45406 +}
45407 +
45408 +#endif
45409 +
45410 +/**
45411 + * can_append - check whether @key continues the item at @coord
45412 + * @key: key to check
45413 + * @coord: coord of an extent item
45414 + *
45415 + * Returns 1 if @key is equal to the append key of the item @coord is set to.
45416 + */
45417 +static int can_append(const reiser4_key *key, const coord_t *coord)
45418 +{
45419 + reiser4_key append_key;
45420 +
45421 + return keyeq(key, append_key_extent(coord, &append_key));
45422 +}
45423 +
45424 +/**
45425 + * append_hole - append the last file item with a hole extent
45426 + * @coord: coord of the last extent item of the file
45427 + * @lh: lock handle of the twig node
45428 + * @key: key of the first byte which must become addressed by the item
45429 + *
45430 + */
45431 +static int append_hole(coord_t *coord, lock_handle *lh,
45432 + const reiser4_key *key)
45433 +{
45434 + reiser4_key append_key;
45435 + reiser4_block_nr hole_width;
45436 + reiser4_extent *ext, new_ext;
45437 + reiser4_item_data idata;
45438 +
45439 + /* last item of file may have to be appended with hole */
45440 + assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
45441 + assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
45442 +
45443 + /* key of first byte which is not addressed by this extent */
45444 + append_key_extent(coord, &append_key);
45445 +
45446 + assert("", keyle(&append_key, key));
45447 +
45448 + /*
45449 + * extent item has to be appended with hole. Calculate length of that
45450 + * hole
45451 + */
45452 + hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
45453 + current_blocksize - 1) >> current_blocksize_bits);
45454 + assert("vs-954", hole_width > 0);
45455 +
45456 + /* set coord after last unit */
45457 + coord_init_after_item_end(coord);
45458 +
45459 + /* get last extent in the item */
45460 + ext = extent_by_coord(coord);
45461 + if (state_of_extent(ext) == HOLE_EXTENT) {
45462 + /*
45463 + * last extent of a file is hole extent. Widen that extent by
45464 + * @hole_width blocks. Note that we do not worry about
45465 + * overflowing - extent width is 64 bits
45466 + */
45467 + reiser4_set_extent(ext, HOLE_EXTENT_START,
45468 + extent_get_width(ext) + hole_width);
45469 + znode_make_dirty(coord->node);
45470 + return 0;
45471 + }
45472 +
45473 + /* append last item of the file with hole extent unit */
45474 + assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
45475 + state_of_extent(ext) == UNALLOCATED_EXTENT));
45476 +
45477 + reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45478 + init_new_extent(&idata, &new_ext, 1);
45479 + return insert_into_item(coord, lh, &append_key, &idata, 0);
45480 +}
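/*
 * The hole width above is a ceiling division: the byte gap rounded up to
 * whole blocks via (gap + blocksize - 1) >> blocksize_bits. A worked
 * sketch with illustrative numbers:
 */

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            unsigned bits = 12;                 /* 4096-byte blocks */
            uint64_t gap = 10000;               /* bytes the hole must cover */
            uint64_t width = (gap + (1ull << bits) - 1) >> bits;

            printf("hole width: %llu blocks\n", /* 3 blocks = 12288 bytes */
                   (unsigned long long)width);
            return 0;
    }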
45481 +
45482 +/**
45483 + * check_jnodes - verify that jnodes fall within the twig node's key range
45484 + * @twig: longterm locked twig node
45485 + * @key: key of the first jnode
45486 + * @count: number of jnodes to be checked
45487 + */
45488 +static void check_jnodes(znode *twig, const reiser4_key *key, int count)
45489 +{
45490 +#if REISER4_DEBUG
45491 + coord_t c;
45492 + reiser4_key node_key, jnode_key;
45493 +
45494 + jnode_key = *key;
45495 +
45496 + assert("", twig != NULL);
45497 + assert("", znode_get_level(twig) == TWIG_LEVEL);
45498 + assert("", znode_is_write_locked(twig));
45499 +
45500 + zload(twig);
45501 + /* get the smallest key in twig node */
45502 + coord_init_first_unit(&c, twig);
45503 + unit_key_by_coord(&c, &node_key);
45504 + assert("", keyle(&node_key, &jnode_key));
45505 +
45506 + coord_init_last_unit(&c, twig);
45507 + unit_key_by_coord(&c, &node_key);
45508 + if (item_plugin_by_coord(&c)->s.file.append_key)
45509 + item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
45510 + set_key_offset(&jnode_key,
45511 + get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
45512 + assert("", keylt(&jnode_key, &node_key));
45513 + zrelse(twig);
45514 +#endif
45515 +}
45516 +
45517 +/**
45518 + * append_last_extent - append last file item
45519 + * @uf_coord: coord to start insertion from
45520 + * @key: key of the first byte to be appended
45521 + * @jnodes: array of jnodes, @count: number of jnodes in the array
45522 + *
45523 + * There is already at least one extent item of the file in the tree. Append
45524 + * the last of them with an unallocated extent unit of width @count. Assign
45525 + * fake block numbers to the jnodes corresponding to the inserted extent.
45526 + */
45527 +static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45528 + jnode **jnodes, int count)
45529 +{
45530 + int result;
45531 + reiser4_extent new_ext;
45532 + reiser4_item_data idata;
45533 + coord_t *coord;
45534 + struct extent_coord_extension *ext_coord;
45535 + reiser4_extent *ext;
45536 + reiser4_block_nr block;
45537 + jnode *node;
45538 + int i;
45539 +
45540 + coord = &uf_coord->coord;
45541 + ext_coord = &uf_coord->extension.extent;
45542 + ext = ext_by_ext_coord(uf_coord);
45543 +
45544 + /* check correctness of position in the item */
45545 + assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
45546 + assert("vs-1311", coord->between == AFTER_UNIT);
45547 + assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
45548 +
45549 + if (!can_append(key, coord)) {
45550 + /* hole extent has to be inserted */
45551 + result = append_hole(coord, uf_coord->lh, key);
45552 + uf_coord->valid = 0;
45553 + return result;
45554 + }
45555 +
45556 + if (count == 0)
45557 + return 0;
45558 +
45559 + assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
45560 +
45561 + result = vfs_dq_alloc_block_nodirty(mapping_jnode(jnodes[0])->host,
45562 + count);
45563 + BUG_ON(result != 0);
45564 +
45565 + switch (state_of_extent(ext)) {
45566 + case UNALLOCATED_EXTENT:
45567 + /*
45568 + * last extent unit of the file is unallocated one. Increase
45569 + * its width by @count
45570 + */
45571 + reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
45572 + extent_get_width(ext) + count);
45573 + znode_make_dirty(coord->node);
45574 +
45575 + /* update coord extension */
45576 + ext_coord->width += count;
45577 + ON_DEBUG(extent_set_width
45578 + (&uf_coord->extension.extent.extent,
45579 + ext_coord->width));
45580 + break;
45581 +
45582 + case HOLE_EXTENT:
45583 + case ALLOCATED_EXTENT:
45584 + /*
45585 + * last extent unit of the file is either hole or allocated
45586 + * one. Append one unallocated extent of width @count
45587 + */
45588 + reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45589 + init_new_extent(&idata, &new_ext, 1);
45590 + result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
45591 + uf_coord->valid = 0;
45592 + if (result)
45593 + return result;
45594 + break;
45595 +
45596 + default:
45597 + return RETERR(-EIO);
45598 + }
45599 +
45600 + /*
45601 + * make sure that we hold long term locked twig node containing all
45602 + * jnodes we are about to capture
45603 + */
45604 + check_jnodes(uf_coord->lh->node, key, count);
45605 +
45606 + /*
45607 + * assign fake block numbers to all jnodes. FIXME: make sure whether
45608 + * twig node containing inserted extent item is locked
45609 + */
45610 + block = fake_blocknr_unformatted(count);
45611 + for (i = 0; i < count; i ++, block ++) {
45612 + node = jnodes[i];
45613 + spin_lock_jnode(node);
45614 + JF_SET(node, JNODE_CREATED);
45615 + jnode_set_block(node, &block);
45616 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45617 + BUG_ON(result != 0);
45618 + jnode_make_dirty_locked(node);
45619 + spin_unlock_jnode(node);
45620 + }
45621 + return count;
45622 +}
45623 +
45624 +/**
45625 + * insert_first_hole - insert hole extent into tree
45626 + * @coord: coord to insert the hole item at
45627 + * @lh: lock handle
45628 + * @key: key of the first byte of the write
45629 + *
45630 + * Inserts a hole extent from file offset 0 up to @key.
45631 + */
45632 +static int insert_first_hole(coord_t *coord, lock_handle *lh,
45633 + const reiser4_key *key)
45634 +{
45635 + reiser4_extent new_ext;
45636 + reiser4_item_data idata;
45637 + reiser4_key item_key;
45638 + reiser4_block_nr hole_width;
45639 +
45640 + /* @coord must be set for inserting of new item */
45641 + assert("vs-711", coord_is_between_items(coord));
45642 +
45643 + item_key = *key;
45644 + set_key_offset(&item_key, 0ull);
45645 +
45646 + hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
45647 + current_blocksize_bits);
45648 + assert("vs-710", hole_width > 0);
45649 +
45650 + /* compose body of hole extent and insert item into tree */
45651 + reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45652 + init_new_extent(&idata, &new_ext, 1);
45653 + return insert_extent_by_coord(coord, &idata, &item_key, lh);
45654 +}
45655 +
45656 +
45657 +/**
45658 + * insert_first_extent - insert first file item
45659 + * @uf_coord: coord to start insertion from
45660 + * @key: key of the first byte of data to be written
45661 + * @jnodes: array of jnodes
45662 + * @count: number of jnodes in the array
45663 + * @inode: inode of file
45664 + *
45665 + * There are no items of file @inode in the tree yet. Insert an unallocated
45666 + * extent of width @count, or a hole extent if the write does not start at
45667 + * the beginning of the file. Assign fake block numbers to the jnodes of the
45668 + * inserted extent. Returns the number of jnodes or an error code.
45669 + */
45670 +static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45671 + jnode **jnodes, int count,
45672 + struct inode *inode)
45673 +{
45674 + int result;
45675 + int i;
45676 + reiser4_extent new_ext;
45677 + reiser4_item_data idata;
45678 + reiser4_block_nr block;
45679 + struct unix_file_info *uf_info;
45680 + jnode *node;
45681 +
45682 + /* first extent insertion starts at leaf level */
45683 + assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
45684 + assert("vs-711", coord_is_between_items(&uf_coord->coord));
45685 +
45686 + if (get_key_offset(key) != 0) {
45687 + result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
45688 + uf_coord->valid = 0;
45689 + uf_info = unix_file_inode_data(inode);
45690 +
45691 + /*
45692 + * first item insertion is only possible when writing to empty
45693 + * file or performing tail conversion
45694 + */
45695 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
45696 + (reiser4_inode_get_flag(inode,
45697 + REISER4_PART_MIXED) &&
45698 + reiser4_inode_get_flag(inode,
45699 + REISER4_PART_IN_CONV))));
45700 + /* if file was empty - update its state */
45701 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
45702 + uf_info->container = UF_CONTAINER_EXTENTS;
45703 + return result;
45704 + }
45705 +
45706 + if (count == 0)
45707 + return 0;
45708 +
45709 + result = vfs_dq_alloc_block_nodirty(mapping_jnode(jnodes[0])->host,
45710 + count);
45711 + BUG_ON(result != 0);
45712 +
45713 + /*
45714 + * prepare for tree modification: compose body of item and item data
45715 + * structure needed for insertion
45716 + */
45717 + reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45718 + init_new_extent(&idata, &new_ext, 1);
45719 +
45720 + /* insert extent item into the tree */
45721 + result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
45722 + uf_coord->lh);
45723 + if (result)
45724 + return result;
45725 +
45726 + /*
45727 + * make sure that we hold long term locked twig node containing all
45728 + * jnodes we are about to capture
45729 + */
45730 + check_jnodes(uf_coord->lh->node, key, count);
45731 + /*
45732 + * assign fake block numbers to all jnodes, capture and mark them dirty
45733 + */
45734 + block = fake_blocknr_unformatted(count);
45735 + for (i = 0; i < count; i ++, block ++) {
45736 + node = jnodes[i];
45737 + spin_lock_jnode(node);
45738 + JF_SET(node, JNODE_CREATED);
45739 + jnode_set_block(node, &block);
45740 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45741 + BUG_ON(result != 0);
45742 + jnode_make_dirty_locked(node);
45743 + spin_unlock_jnode(node);
45744 + }
45745 +
45746 + /*
45747 +	 * invalidate the coordinate: a re-search must be performed to
45748 +	 * continue, because the write will continue on the twig level
45749 + */
45750 + uf_coord->valid = 0;
45751 + return count;
45752 +}
45753 +
45754 +/**
45755 + * plug_hole - replace hole extent with unallocated and holes
45756 + * @uf_coord: coord of the hole extent unit
45757 + * @key: key of the byte to be overwritten
45758 + * @how: set to a code recording how the hole was plugged
45759 + *
45760 + * Creates an unallocated extent of width 1 within a hole. In the worst
45761 + * case two additional extents can be created: the remaining parts of
45762 + * the hole to the left and to the right of the new unit.
45763 + */
45764 +static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
45765 +{
45766 + struct replace_handle rh;
45767 + reiser4_extent *ext;
45768 + reiser4_block_nr width, pos_in_unit;
45769 + coord_t *coord;
45770 + struct extent_coord_extension *ext_coord;
45771 + int return_inserted_position;
45772 +
45773 + check_uf_coord(uf_coord, key);
45774 +
45775 + rh.coord = coord_by_uf_coord(uf_coord);
45776 + rh.lh = uf_coord->lh;
45777 + rh.flags = 0;
45778 +
45779 + coord = coord_by_uf_coord(uf_coord);
45780 + ext_coord = ext_coord_by_uf_coord(uf_coord);
45781 + ext = ext_by_ext_coord(uf_coord);
45782 +
45783 + width = ext_coord->width;
45784 + pos_in_unit = ext_coord->pos_in_unit;
45785 +
45786 + *how = 0;
45787 + if (width == 1) {
45788 + reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
45789 + znode_make_dirty(coord->node);
45790 + /* update uf_coord */
45791 + ON_DEBUG(ext_coord->extent = *ext);
45792 + *how = 1;
45793 + return 0;
45794 + } else if (pos_in_unit == 0) {
45795 + /* we deal with first element of extent */
45796 + if (coord->unit_pos) {
45797 + /* there is an extent to the left */
45798 + if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
45799 + /*
45800 + * left neighboring unit is an unallocated
45801 + * extent. Increase its width and decrease
45802 + * width of hole
45803 + */
45804 + extent_set_width(ext - 1,
45805 + extent_get_width(ext - 1) + 1);
45806 + extent_set_width(ext, width - 1);
45807 + znode_make_dirty(coord->node);
45808 +
45809 + /* update coord extension */
45810 + coord->unit_pos--;
45811 + ext_coord->width = extent_get_width(ext - 1);
45812 + ext_coord->pos_in_unit = ext_coord->width - 1;
45813 + ext_coord->ext_offset -= sizeof(reiser4_extent);
45814 + ON_DEBUG(ext_coord->extent =
45815 + *extent_by_coord(coord));
45816 + *how = 2;
45817 + return 0;
45818 + }
45819 + }
45820 + /* extent for replace */
45821 + reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
45822 + /* extent to be inserted */
45823 + reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
45824 + width - 1);
45825 + rh.nr_new_extents = 1;
45826 +
45827 +		/* have reiser4_replace_extent return with @coord and
45828 +		   @uf_coord->lh set to the unit which was replaced */
45829 + return_inserted_position = 0;
45830 + *how = 3;
45831 + } else if (pos_in_unit == width - 1) {
45832 + /* we deal with last element of extent */
45833 + if (coord->unit_pos < nr_units_extent(coord) - 1) {
45834 + /* there is an extent unit to the right */
45835 + if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
45836 + /*
45837 + * right neighboring unit is an unallocated
45838 + * extent. Increase its width and decrease
45839 + * width of hole
45840 + */
45841 + extent_set_width(ext + 1,
45842 + extent_get_width(ext + 1) + 1);
45843 + extent_set_width(ext, width - 1);
45844 + znode_make_dirty(coord->node);
45845 +
45846 + /* update coord extension */
45847 + coord->unit_pos++;
45848 + ext_coord->width = extent_get_width(ext + 1);
45849 + ext_coord->pos_in_unit = 0;
45850 + ext_coord->ext_offset += sizeof(reiser4_extent);
45851 + ON_DEBUG(ext_coord->extent =
45852 + *extent_by_coord(coord));
45853 + *how = 4;
45854 + return 0;
45855 + }
45856 + }
45857 + /* extent for replace */
45858 + reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
45859 + /* extent to be inserted */
45860 + reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45861 + 1);
45862 + rh.nr_new_extents = 1;
45863 +
45864 +		/* have reiser4_replace_extent return with @coord and
45865 +		   @uf_coord->lh set to the unit which was inserted */
45866 + return_inserted_position = 1;
45867 + *how = 5;
45868 + } else {
45869 + /* extent for replace */
45870 + reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
45871 + pos_in_unit);
45872 + /* extents to be inserted */
45873 + reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45874 + 1);
45875 + reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
45876 + width - pos_in_unit - 1);
45877 + rh.nr_new_extents = 2;
45878 +
45879 +		/* have reiser4_replace_extent return with @coord and
45880 +		   @uf_coord->lh set to the first of the units which were inserted */
45881 + return_inserted_position = 1;
45882 + *how = 6;
45883 + }
45884 + unit_key_by_coord(coord, &rh.paste_key);
45885 + set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
45886 + extent_get_width(&rh.overwrite) * current_blocksize);
45887 +
45888 + uf_coord->valid = 0;
45889 + return reiser4_replace_extent(&rh, return_inserted_position);
45890 +}
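+
+/*
+ * A worked example of the last plug_hole() case above (*how == 6),
+ * assuming a hole unit of width 10 and pos_in_unit 3: the single unit
+ *
+ *	[HOLE, width 10]
+ *
+ * is replaced by three units
+ *
+ *	[HOLE, width 3][UNALLOCATED, width 1][HOLE, width 6]
+ *
+ * which is the "two additional extents" worst case mentioned in the
+ * comment at the top of the function.
+ */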
45891 +
45892 +/**
45893 + * overwrite_one_block - assign a real or fake block number to a jnode
45894 + * @uf_coord: coordinate of the extent unit @node is covered by
45895 + * @key: key of the block @node corresponds to
45896 + * @node: jnode to assign a block number to
45897 + * @hole_plugged: optional; set to 1 if a hole extent gets plugged
45898 + *
45899 + * If @node corresponds to a hole extent, create an unallocated extent for it
45900 + * and assign a fake block number. If @node corresponds to an allocated
45901 + * extent, assign it the matching block number of the extent.
45901 + */
45902 +static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
45903 + jnode *node, int *hole_plugged)
45904 +{
45905 + int result;
45906 + struct extent_coord_extension *ext_coord;
45907 + reiser4_extent *ext;
45908 + reiser4_block_nr block;
45909 + int how;
45910 +
45911 + assert("vs-1312", uf_coord->coord.between == AT_UNIT);
45912 +
45913 + result = 0;
45914 + ext_coord = ext_coord_by_uf_coord(uf_coord);
45915 + ext = ext_by_ext_coord(uf_coord);
45916 + assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
45917 +
45918 + switch (state_of_extent(ext)) {
45919 + case ALLOCATED_EXTENT:
45920 + block = extent_get_start(ext) + ext_coord->pos_in_unit;
45921 + break;
45922 +
45923 + case HOLE_EXTENT:
45924 + result = vfs_dq_alloc_block_nodirty(mapping_jnode(node)->host,
45925 + 1);
45926 + BUG_ON(result != 0);
45927 + result = plug_hole(uf_coord, key, &how);
45928 + if (result)
45929 + return result;
45930 + block = fake_blocknr_unformatted(1);
45931 + if (hole_plugged)
45932 + *hole_plugged = 1;
45933 + JF_SET(node, JNODE_CREATED);
45934 + break;
45935 +
45936 + default:
45937 + return RETERR(-EIO);
45938 + }
45939 +
45940 + jnode_set_block(node, &block);
45941 + return 0;
45942 +}
45943 +
45944 +/**
45945 + * move_coord - move coordinate forward
45946 + * @uf_coord: extended coordinate to advance
45947 + *
45948 + * Move coordinate one data block pointer forward. Return 1 if the coord was
45949 + * already set to the last block pointer or is invalid, 0 otherwise.
45950 + */
45951 +static int move_coord(uf_coord_t *uf_coord)
45952 +{
45953 + struct extent_coord_extension *ext_coord;
45954 +
45955 + if (uf_coord->valid == 0)
45956 + return 1;
45957 + ext_coord = &uf_coord->extension.extent;
45958 + ext_coord->pos_in_unit ++;
45959 + if (ext_coord->pos_in_unit < ext_coord->width)
45960 + /* coordinate moved within the unit */
45961 + return 0;
45962 +
45963 + /* end of unit is reached. Try to move to next unit */
45964 + ext_coord->pos_in_unit = 0;
45965 + uf_coord->coord.unit_pos ++;
45966 + if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
45967 + /* coordinate moved to next unit */
45968 + ext_coord->ext_offset += sizeof(reiser4_extent);
45969 + ext_coord->width =
45970 + extent_get_width(ext_by_offset
45971 + (uf_coord->coord.node,
45972 + ext_coord->ext_offset));
45973 + ON_DEBUG(ext_coord->extent =
45974 + *ext_by_offset(uf_coord->coord.node,
45975 + ext_coord->ext_offset));
45976 + return 0;
45977 + }
45978 + /* end of item is reached */
45979 + uf_coord->valid = 0;
45980 + return 1;
45981 +}
45982 +
45983 +/**
45984 + * overwrite_extent - assign block numbers to jnodes of an extent
45985 + * @uf_coord: coordinate of the first extent unit to overwrite
45986 + * @key: key of the first jnode in @jnodes
45987 + * @jnodes: array of jnodes to capture and make dirty
45988 + * @count: number of jnodes in @jnodes
45989 + * @plugged_hole: optional; set to 1 if a hole extent gets plugged
45990 + *
45991 + * Returns number of handled jnodes.
45988 + */
45989 +static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45990 + jnode **jnodes, int count, int *plugged_hole)
45991 +{
45992 + int result;
45993 + reiser4_key k;
45994 + int i;
45995 + jnode *node;
45996 +
45997 + k = *key;
45998 + for (i = 0; i < count; i ++) {
45999 + node = jnodes[i];
46000 + if (*jnode_get_block(node) == 0) {
46001 + result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
46002 + if (result)
46003 + return result;
46004 + }
46005 + /*
46006 + * make sure that we hold long term locked twig node containing
46007 + * all jnodes we are about to capture
46008 + */
46009 + check_jnodes(uf_coord->lh->node, &k, 1);
46010 + /*
46011 +		 * capture the jnode and mark it dirty; its block number was
46012 +		 * assigned by overwrite_one_block() above or was already set
46013 + */
46014 + spin_lock_jnode(node);
46015 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
46016 + BUG_ON(result != 0);
46017 + jnode_make_dirty_locked(node);
46018 + spin_unlock_jnode(node);
46019 +
46020 + if (uf_coord->valid == 0)
46021 + return i + 1;
46022 +
46023 + check_uf_coord(uf_coord, &k);
46024 +
46025 + if (move_coord(uf_coord)) {
46026 + /*
46027 + * failed to move to the next node pointer. Either end
46028 +			 * of file or end of twig node is reached. In the latter
46029 +			 * case we might go to the right neighbor.
46030 + */
46031 + uf_coord->valid = 0;
46032 + return i + 1;
46033 + }
46034 + set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
46035 + }
46036 +
46037 + return count;
46038 +}
46039 +
46040 +/**
46041 + * reiser4_update_extent - update extent item to point to one jnode
46042 + * @inode: inode the jnode belongs to
46043 + * @node: jnode to create or overwrite an extent pointer for
46044 + * @pos: offset in the file corresponding to @node
46045 + * @plugged_hole: optional; set to 1 if a hole extent gets plugged
46046 + *
46047 + */
46048 +int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
46049 + int *plugged_hole)
46050 +{
46051 + int result;
46052 + znode *loaded;
46053 + uf_coord_t uf_coord;
46054 + coord_t *coord;
46055 + lock_handle lh;
46056 + reiser4_key key;
46057 +
46058 + assert("", reiser4_lock_counters()->d_refs == 0);
46059 +
46060 + key_by_inode_and_offset_common(inode, pos, &key);
46061 +
46062 + init_uf_coord(&uf_coord, &lh);
46063 + coord = &uf_coord.coord;
46064 + result = find_file_item_nohint(coord, &lh, &key,
46065 + ZNODE_WRITE_LOCK, inode);
46066 + if (IS_CBKERR(result)) {
46067 + assert("", reiser4_lock_counters()->d_refs == 0);
46068 + return result;
46069 + }
46070 +
46071 + result = zload(coord->node);
46072 + BUG_ON(result != 0);
46073 + loaded = coord->node;
46074 +
46075 + if (coord->between == AFTER_UNIT) {
46076 + /*
46077 + * append existing extent item with unallocated extent of width
46078 + * nr_jnodes
46079 + */
46080 + init_coord_extension_extent(&uf_coord,
46081 + get_key_offset(&key));
46082 + result = append_last_extent(&uf_coord, &key,
46083 + &node, 1);
46084 + } else if (coord->between == AT_UNIT) {
46085 + /*
46086 + * overwrite
46087 +		 * not optimal yet. Will be optimized if the new write shows a
46088 +		 * performance win.
46089 + */
46090 + init_coord_extension_extent(&uf_coord,
46091 + get_key_offset(&key));
46092 + result = overwrite_extent(&uf_coord, &key,
46093 + &node, 1, plugged_hole);
46094 + } else {
46095 + /*
46096 + * there are no items of this file in the tree yet. Create
46097 + * first item of the file inserting one unallocated extent of
46098 + * width nr_jnodes
46099 + */
46100 + result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
46101 + }
46102 + assert("", result == 1 || result < 0);
46103 + zrelse(loaded);
46104 + done_lh(&lh);
46105 + assert("", reiser4_lock_counters()->d_refs == 0);
46106 + return (result == 1) ? 0 : result;
46107 +}
46108 +
46109 +/**
46110 + * update_extents - update extent items to point to a set of jnodes
46111 + * @file: file the jnodes belong to
46112 + * @inode: inode of @file
46113 + * @jnodes: array of jnodes to create or overwrite extent pointers for
46114 + * @count: number of jnodes in @jnodes; 0 is the expanding truncate case
46115 + * @pos: file offset to start at (used only when @count is 0)
46115 + *
46116 + */
46117 +static int update_extents(struct file *file, struct inode *inode,
46118 + jnode **jnodes, int count, loff_t pos)
46119 +{
46120 + struct hint hint;
46121 + reiser4_key key;
46122 + int result;
46123 + znode *loaded;
46124 +
46125 + result = load_file_hint(file, &hint);
46126 + BUG_ON(result != 0);
46127 +
46128 + if (count != 0)
46129 + /*
46130 + * count == 0 is special case: expanding truncate
46131 + */
46132 + pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
46133 + key_by_inode_and_offset_common(inode, pos, &key);
46134 +
46135 + assert("", reiser4_lock_counters()->d_refs == 0);
46136 +
46137 + do {
46138 + result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
46139 + if (IS_CBKERR(result)) {
46140 + assert("", reiser4_lock_counters()->d_refs == 0);
46141 + return result;
46142 + }
46143 +
46144 + result = zload(hint.ext_coord.coord.node);
46145 + BUG_ON(result != 0);
46146 + loaded = hint.ext_coord.coord.node;
46147 +
46148 + if (hint.ext_coord.coord.between == AFTER_UNIT) {
46149 + /*
46150 + * append existing extent item with unallocated extent
46151 + * of width nr_jnodes
46152 + */
46153 + if (hint.ext_coord.valid == 0)
46154 + /* NOTE: get statistics on this */
46155 + init_coord_extension_extent(&hint.ext_coord,
46156 + get_key_offset(&key));
46157 + result = append_last_extent(&hint.ext_coord, &key,
46158 + jnodes, count);
46159 + } else if (hint.ext_coord.coord.between == AT_UNIT) {
46160 + /*
46161 + * overwrite
46162 +			 * not optimal yet. Will be optimized if the new write
46163 +			 * shows a performance win.
46164 + */
46165 + if (hint.ext_coord.valid == 0)
46166 + /* NOTE: get statistics on this */
46167 + init_coord_extension_extent(&hint.ext_coord,
46168 + get_key_offset(&key));
46169 + result = overwrite_extent(&hint.ext_coord, &key,
46170 + jnodes, count, NULL);
46171 + } else {
46172 + /*
46173 + * there are no items of this file in the tree
46174 + * yet. Create first item of the file inserting one
46175 +			 * unallocated extent of width nr_jnodes
46176 + */
46177 + result = insert_first_extent(&hint.ext_coord, &key,
46178 + jnodes, count, inode);
46179 + }
46180 + zrelse(loaded);
46181 + if (result < 0) {
46182 + done_lh(hint.ext_coord.lh);
46183 + break;
46184 + }
46185 +
46186 + jnodes += result;
46187 + count -= result;
46188 + set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
46189 +
46190 + /* seal and unlock znode */
46191 + if (hint.ext_coord.valid)
46192 + reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
46193 + else
46194 + reiser4_unset_hint(&hint);
46195 +
46196 + } while (count > 0);
46197 +
46198 + save_file_hint(file, &hint);
46199 + assert("", reiser4_lock_counters()->d_refs == 0);
46200 + return result;
46201 +}
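+
+/*
+ * Loop contract above, for reference: append_last_extent(),
+ * overwrite_extent() and insert_first_extent() return the number of
+ * passed jnodes they managed to handle (or a negative error), and the
+ * loop advances @jnodes, @count and the key offset accordingly before
+ * re-searching, so one call may take several search-modify iterations.
+ */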
46202 +
46203 +/**
46204 + * write_extent_reserve_space - reserve space for extent write operation
46205 + * @inode:
46206 + *
46207 + * Estimates and reserves space which may be required for writing
46208 + * WRITE_GRANULARITY pages of file.
46209 + */
46210 +static int write_extent_reserve_space(struct inode *inode)
46211 +{
46212 + __u64 count;
46213 + reiser4_tree *tree;
46214 +
46215 +	/*
46216 +	 * to write WRITE_GRANULARITY pages to a file by extents we have to
46217 +	 * reserve disk space for:
46218 +	 *
46219 +	 * 1. find_file_item may have to insert an empty node into the tree
46220 +	 * (an empty leaf node between two extent items). This requires 1
46221 +	 * block plus the number of blocks necessary to insert an internal
46222 +	 * item into the twig level.
46223 +	 *
46224 +	 * 2. each written page may need 1 block plus the number of blocks
46225 +	 * necessary to insert or paste to an extent item.
46226 +	 *
46227 +	 * 3. stat data update
46228 +	 */
46230 + tree = reiser4_tree_by_inode(inode);
46231 + count = estimate_one_insert_item(tree) +
46232 + WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
46233 + estimate_one_insert_item(tree);
46234 + grab_space_enable();
46235 + return reiser4_grab_space(count, 0 /* flags */);
46236 +}
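+
+/*
+ * A worked example of the estimate above, under the assumption that
+ * WRITE_GRANULARITY is 32 and that both estimate helpers return E for
+ * the current tree height: the reservation is
+ *
+ *	E + 32 * (1 + E) + E = 34 * E + 32
+ *
+ * blocks, i.e. it grows linearly with the per-insertion estimate, which
+ * itself depends only on the tree height at the time of the call.
+ */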
46237 +
46238 +/*
46239 + * filemap_copy_from_user no longer exists in generic code, because it
46240 + * is deadlocky (copying from user while holding the page lock is bad).
46241 + * As a temporary fix for reiser4, just define it here.
46242 + */
46243 +static inline size_t
46244 +filemap_copy_from_user(struct page *page, unsigned long offset,
46245 + const char __user *buf, unsigned bytes)
46246 +{
46247 + char *kaddr;
46248 + int left;
46249 +
46250 + kaddr = kmap_atomic(page, KM_USER0);
46251 + left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
46252 + kunmap_atomic(kaddr, KM_USER0);
46253 +
46254 + if (left != 0) {
46255 + /* Do it the slow way */
46256 + kaddr = kmap(page);
46257 + left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
46258 + kunmap(page);
46259 + }
46260 + return bytes - left;
46261 +}
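+
+/*
+ * Note on the fallback above: __copy_from_user_inatomic_nocache() may
+ * copy fewer than @bytes if the user page is not resident, because page
+ * faults are disabled under kmap_atomic(); the function then retries
+ * with a sleeping kmap() + __copy_from_user_nocache(). Callers such as
+ * reiser4_write_extent() below pre-fault the user buffer with
+ * fault_in_pages_readable() to make the atomic fast path likely.
+ */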
46262 +
46263 +/**
46264 + * reiser4_write_extent - write method of extent item plugin
46265 + * @file: file to write to
46266 + * @buf: address of user-space buffer
46267 + * @count: number of bytes to write
46268 + * @pos: position in file to write to
46269 + *
46270 + */
46271 +ssize_t reiser4_write_extent(struct file *file, struct inode * inode,
46272 + const char __user *buf, size_t count, loff_t *pos)
46273 +{
46274 + int have_to_update_extent;
46275 + int nr_pages, nr_dirty;
46276 + struct page *page;
46277 + jnode *jnodes[WRITE_GRANULARITY + 1];
46278 + unsigned long index;
46279 + unsigned long end;
46280 + int i;
46281 + int to_page, page_off;
46282 + size_t left, written;
46283 + int result = 0;
46284 +
46285 + if (write_extent_reserve_space(inode))
46286 + return RETERR(-ENOSPC);
46287 +
46288 + if (count == 0) {
46289 + /* truncate case */
46290 + update_extents(file, inode, jnodes, 0, *pos);
46291 + return 0;
46292 + }
46293 +
46294 + BUG_ON(get_current_context()->trans->atom != NULL);
46295 +
46296 + left = count;
46297 + index = *pos >> PAGE_CACHE_SHIFT;
46298 + /* calculate number of pages which are to be written */
46299 + end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
46300 + nr_pages = end - index + 1;
46301 + nr_dirty = 0;
46302 + assert("", nr_pages <= WRITE_GRANULARITY + 1);
46303 +
46304 + /* get pages and jnodes */
46305 + for (i = 0; i < nr_pages; i ++) {
46306 + page = find_or_create_page(inode->i_mapping, index + i,
46307 + reiser4_ctx_gfp_mask_get());
46308 + if (page == NULL) {
46309 + nr_pages = i;
46310 + result = RETERR(-ENOMEM);
46311 + goto out;
46312 + }
46313 +
46314 + jnodes[i] = jnode_of_page(page);
46315 + if (IS_ERR(jnodes[i])) {
46316 + unlock_page(page);
46317 + page_cache_release(page);
46318 + nr_pages = i;
46319 + result = RETERR(-ENOMEM);
46320 + goto out;
46321 + }
46322 + /* prevent jnode and page from disconnecting */
46323 + JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
46324 + unlock_page(page);
46325 + }
46326 +
46327 + BUG_ON(get_current_context()->trans->atom != NULL);
46328 +
46329 + have_to_update_extent = 0;
46330 +
46331 + page_off = (*pos & (PAGE_CACHE_SIZE - 1));
46332 + for (i = 0; i < nr_pages; i ++) {
46333 + to_page = PAGE_CACHE_SIZE - page_off;
46334 + if (to_page > left)
46335 + to_page = left;
46336 + page = jnode_page(jnodes[i]);
46337 + if (page_offset(page) < inode->i_size &&
46338 + !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
46339 + /*
46340 +			 * the above is not optimal for a partial write to the
46341 +			 * last page of the file when the file size is not at a
46342 +			 * page boundary
46343 + */
46344 + lock_page(page);
46345 + if (!PageUptodate(page)) {
46346 + result = readpage_unix_file(NULL, page);
46347 + BUG_ON(result != 0);
46348 + /* wait for read completion */
46349 + lock_page(page);
46350 + BUG_ON(!PageUptodate(page));
46351 + } else
46352 + result = 0;
46353 + unlock_page(page);
46354 + }
46355 +
46356 + BUG_ON(get_current_context()->trans->atom != NULL);
46357 + fault_in_pages_readable(buf, to_page);
46358 + BUG_ON(get_current_context()->trans->atom != NULL);
46359 +
46360 + lock_page(page);
46361 + if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
46362 + zero_user_segments(page, 0, page_off,
46363 + page_off + to_page,
46364 + PAGE_CACHE_SIZE);
46365 +
46366 + written = filemap_copy_from_user(page, page_off, buf, to_page);
46367 + if (unlikely(written != to_page)) {
46368 + unlock_page(page);
46369 + result = RETERR(-EFAULT);
46370 + break;
46371 + }
46372 +
46373 + flush_dcache_page(page);
46374 + set_page_dirty_notag(page);
46375 + unlock_page(page);
46376 + nr_dirty++;
46377 +
46378 + mark_page_accessed(page);
46379 + SetPageUptodate(page);
46380 +
46381 + if (jnodes[i]->blocknr == 0)
46382 + have_to_update_extent ++;
46383 +
46384 + page_off = 0;
46385 + buf += to_page;
46386 + left -= to_page;
46387 + BUG_ON(get_current_context()->trans->atom != NULL);
46388 + }
46389 +
46390 + if (have_to_update_extent) {
46391 + update_extents(file, inode, jnodes, nr_dirty, *pos);
46392 + } else {
46393 + for (i = 0; i < nr_dirty; i ++) {
46394 + int ret;
46395 + spin_lock_jnode(jnodes[i]);
46396 + ret = reiser4_try_capture(jnodes[i],
46397 + ZNODE_WRITE_LOCK, 0);
46398 + BUG_ON(ret != 0);
46399 + jnode_make_dirty_locked(jnodes[i]);
46400 + spin_unlock_jnode(jnodes[i]);
46401 + }
46402 + }
46403 +out:
46404 + for (i = 0; i < nr_pages; i ++) {
46405 + page_cache_release(jnode_page(jnodes[i]));
46406 + JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
46407 + jput(jnodes[i]);
46408 + }
46409 +
46410 +	/* the only errors handled so far are ENOMEM and
46411 +	   EFAULT on copy_from_user */
46412 +
46413 + return (count - left) ? (count - left) : result;
46414 +}
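+
+/*
+ * Return convention of reiser4_write_extent(), for reference: if any
+ * bytes were copied, the (possibly short) byte count is returned and a
+ * late error is dropped; only when nothing was written does the caller
+ * see -ENOMEM or -EFAULT. A caller sketch, with invented names:
+ *
+ *	ssize_t n = reiser4_write_extent(file, inode, buf, count, &pos);
+ *	if (n < 0)
+ *		return n;
+ *	pos += n;	(n may be less than count on a partial fault)
+ */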
46415 +
46416 +int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
46417 + struct page *page)
46418 +{
46419 + jnode *j;
46420 + struct address_space *mapping;
46421 + unsigned long index;
46422 + oid_t oid;
46423 + reiser4_block_nr block;
46424 +
46425 + mapping = page->mapping;
46426 + oid = get_inode_oid(mapping->host);
46427 + index = page->index;
46428 +
46429 + switch (state_of_extent(ext)) {
46430 + case HOLE_EXTENT:
46431 + /*
46432 + * it is possible to have hole page with jnode, if page was
46433 + * eflushed previously.
46434 + */
46435 + j = jfind(mapping, index);
46436 + if (j == NULL) {
46437 + zero_user(page, 0, PAGE_CACHE_SIZE);
46438 + SetPageUptodate(page);
46439 + unlock_page(page);
46440 + return 0;
46441 + }
46442 + spin_lock_jnode(j);
46443 + if (!jnode_page(j)) {
46444 + jnode_attach_page(j, page);
46445 + } else {
46446 + BUG_ON(jnode_page(j) != page);
46447 + assert("vs-1504", jnode_page(j) == page);
46448 + }
46449 + block = *jnode_get_io_block(j);
46450 + spin_unlock_jnode(j);
46451 + if (block == 0) {
46452 + zero_user(page, 0, PAGE_CACHE_SIZE);
46453 + SetPageUptodate(page);
46454 + unlock_page(page);
46455 + jput(j);
46456 + return 0;
46457 + }
46458 + break;
46459 +
46460 + case ALLOCATED_EXTENT:
46461 + j = jnode_of_page(page);
46462 + if (IS_ERR(j))
46463 + return PTR_ERR(j);
46464 + if (*jnode_get_block(j) == 0) {
46465 + reiser4_block_nr blocknr;
46466 +
46467 + blocknr = extent_get_start(ext) + pos;
46468 + jnode_set_block(j, &blocknr);
46469 + } else
46470 + assert("vs-1403",
46471 + j->blocknr == extent_get_start(ext) + pos);
46472 + break;
46473 +
46474 + case UNALLOCATED_EXTENT:
46475 + j = jfind(mapping, index);
46476 + assert("nikita-2688", j);
46477 + assert("vs-1426", jnode_page(j) == NULL);
46478 +
46479 + spin_lock_jnode(j);
46480 + jnode_attach_page(j, page);
46481 + spin_unlock_jnode(j);
46482 + break;
46483 +
46484 + default:
46485 + warning("vs-957", "wrong extent\n");
46486 + return RETERR(-EIO);
46487 + }
46488 +
46489 + BUG_ON(j == 0);
46490 + reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
46491 + jput(j);
46492 + return 0;
46493 +}
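+
+/*
+ * Summary of the cases handled above, for reference:
+ *
+ *	HOLE_EXTENT        - page is zero-filled unless an eflushed jnode
+ *	                     with a real block exists for it
+ *	ALLOCATED_EXTENT   - block number is extent start + pos
+ *	UNALLOCATED_EXTENT - jnode must already exist; the page is
+ *	                     attached to it
+ *
+ * Every case that falls through to the end submits the page for READ
+ * via reiser4_page_io().
+ */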
46494 +
46495 +/* Implements plugin->u.item.s.file.read operation for extent items. */
46496 +int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
46497 +{
46498 + int result;
46499 + struct page *page;
46500 + unsigned long cur_page, next_page;
46501 + unsigned long page_off, count;
46502 + struct address_space *mapping;
46503 + loff_t file_off;
46504 + uf_coord_t *uf_coord;
46505 + coord_t *coord;
46506 + struct extent_coord_extension *ext_coord;
46507 + unsigned long nr_pages;
46508 + char *kaddr;
46509 +
46510 + assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
46511 + assert("vs-572", flow->user == 1);
46512 + assert("vs-1351", flow->length > 0);
46513 +
46514 + uf_coord = &hint->ext_coord;
46515 +
46516 + check_uf_coord(uf_coord, NULL);
46517 + assert("vs-33", uf_coord->lh == &hint->lh);
46518 +
46519 + coord = &uf_coord->coord;
46520 + assert("vs-1119", znode_is_rlocked(coord->node));
46521 + assert("vs-1120", znode_is_loaded(coord->node));
46522 + assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
46523 +
46524 + mapping = file->f_dentry->d_inode->i_mapping;
46525 + ext_coord = &uf_coord->extension.extent;
46526 +
46527 + /* offset in a file to start read from */
46528 + file_off = get_key_offset(&flow->key);
46529 + /* offset within the page to start read from */
46530 + page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
46531 + /* bytes which can be read from the page which contains file_off */
46532 + count = PAGE_CACHE_SIZE - page_off;
46533 +
46534 + /* index of page containing offset read is to start from */
46535 + cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
46536 + next_page = cur_page;
46537 + /* number of pages flow spans over */
46538 + nr_pages =
46539 + ((file_off + flow->length + PAGE_CACHE_SIZE -
46540 + 1) >> PAGE_CACHE_SHIFT) - cur_page;
46541 +
46542 +	/* we start with the twig node read locked. However, we do not want to
46543 +	   keep that lock for the whole time readahead works. So, set a seal and
46544 +	   release the twig node. */
46545 + reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
46546 + /* &hint->lh is done-ed */
46547 +
46548 + do {
46549 + reiser4_txn_restart_current();
46550 + page = read_mapping_page(mapping, cur_page, file);
46551 + if (IS_ERR(page))
46552 + return PTR_ERR(page);
46553 + lock_page(page);
46554 + if (!PageUptodate(page)) {
46555 + unlock_page(page);
46556 + page_cache_release(page);
46557 + warning("jmacd-97178", "extent_read: page is not up to date");
46558 + return RETERR(-EIO);
46559 + }
46560 + mark_page_accessed(page);
46561 + unlock_page(page);
46562 +
46563 + /* If users can be writing to this page using arbitrary virtual
46564 + addresses, take care about potential aliasing before reading
46565 + the page on the kernel side.
46566 + */
46567 + if (mapping_writably_mapped(mapping))
46568 + flush_dcache_page(page);
46569 +
46570 + assert("nikita-3034", reiser4_schedulable());
46571 +
46572 + /* number of bytes which are to be read from the page */
46573 + if (count > flow->length)
46574 + count = flow->length;
46575 +
46576 + result = fault_in_pages_writeable(flow->data, count);
46577 + if (result) {
46578 + page_cache_release(page);
46579 + return RETERR(-EFAULT);
46580 + }
46581 +
46582 + kaddr = kmap_atomic(page, KM_USER0);
46583 + result = __copy_to_user_inatomic(flow->data,
46584 + kaddr + page_off, count);
46585 + kunmap_atomic(kaddr, KM_USER0);
46586 + if (result != 0) {
46587 + kaddr = kmap(page);
46588 + result = __copy_to_user(flow->data, kaddr + page_off, count);
46589 + kunmap(page);
46590 + if (unlikely(result))
46591 + return RETERR(-EFAULT);
46592 + }
46593 +
46594 + page_cache_release(page);
46595 +
46596 + /* increase key (flow->key), update user area pointer (flow->data) */
46597 + move_flow_forward(flow, count);
46598 +
46599 + page_off = 0;
46600 + cur_page ++;
46601 + count = PAGE_CACHE_SIZE;
46602 + nr_pages--;
46603 + } while (flow->length);
46604 +
46605 + return 0;
46606 +}
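+
+/*
+ * A worked example of the nr_pages computation above, assuming 4096-byte
+ * pages: a read of length 5000 starting at file_off 3000 touches bytes
+ * 3000..7999, i.e. pages 0 and 1, and indeed
+ *
+ *	((3000 + 5000 + 4096 - 1) >> 12) - (3000 >> 12) = 2 - 0 = 2
+ */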
46607 +
46608 +/*
46609 + plugin->s.file.readpage
46610 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
46611 + or
46612 +  filemap_fault->reiser4_readpage->readpage_unix_file->readpage_extent
46613 +
46614 +  At the beginning: coord->node is read locked, zloaded, page is locked, and
46615 +  coord is set to an existing unit inside the extent item (coord need not match page->index)
46616 +*/
46617 +int reiser4_readpage_extent(void *vp, struct page *page)
46618 +{
46619 + uf_coord_t *uf_coord = vp;
46620 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
46621 + ON_DEBUG(reiser4_key key);
46622 +
46623 + assert("vs-1040", PageLocked(page));
46624 + assert("vs-1050", !PageUptodate(page));
46625 + assert("vs-1039", page->mapping && page->mapping->host);
46626 +
46627 + assert("vs-1044", znode_is_loaded(coord->node));
46628 + assert("vs-758", item_is_extent(coord));
46629 + assert("vs-1046", coord_is_existing_unit(coord));
46630 + assert("vs-1045", znode_is_rlocked(coord->node));
46631 + assert("vs-1047",
46632 + page->mapping->host->i_ino ==
46633 + get_key_objectid(item_key_by_coord(coord, &key)));
46634 + check_uf_coord(uf_coord, NULL);
46635 +
46636 + return reiser4_do_readpage_extent(
46637 + ext_by_ext_coord(uf_coord),
46638 + uf_coord->extension.extent.pos_in_unit, page);
46639 +}
46640 +
46641 +/**
46642 + * get_block_address_extent
46643 + * @coord:
46644 + * @block:
46645 + * @result:
46646 + *
46647 + *
46648 + */
46649 +int get_block_address_extent(const coord_t *coord, sector_t block,
46650 + sector_t *result)
46651 +{
46652 + reiser4_extent *ext;
46653 +
46654 + if (!coord_is_existing_unit(coord))
46655 + return RETERR(-EINVAL);
46656 +
46657 + ext = extent_by_coord(coord);
46658 +
46659 + if (state_of_extent(ext) != ALLOCATED_EXTENT)
46660 + /* FIXME: bad things may happen if it is unallocated extent */
46661 + *result = 0;
46662 + else {
46663 + reiser4_key key;
46664 +
46665 + unit_key_by_coord(coord, &key);
46666 + assert("vs-1645",
46667 + block >= get_key_offset(&key) >> current_blocksize_bits);
46668 + assert("vs-1646",
46669 + block <
46670 + (get_key_offset(&key) >> current_blocksize_bits) +
46671 + extent_get_width(ext));
46672 + *result =
46673 + extent_get_start(ext) + (block -
46674 + (get_key_offset(&key) >>
46675 + current_blocksize_bits));
46676 + }
46677 + return 0;
46678 +}
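+
+/*
+ * A worked example, assuming 4096-byte blocks: for an allocated extent
+ * starting at disk block 1000 whose unit key offset is 40960 (file
+ * block 10), asking for file block 12 gives
+ *
+ *	*result = 1000 + (12 - 40960 / 4096) = 1002
+ */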
46679 +
46680 +/*
46681 + plugin->u.item.s.file.append_key
46682 +  key of the first byte next after the last byte addressed by this extent
46683 +*/
46684 +reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
46685 +{
46686 + item_key_by_coord(coord, key);
46687 + set_key_offset(key,
46688 + get_key_offset(key) + reiser4_extent_size(coord,
46689 + nr_units_extent
46690 + (coord)));
46691 +
46692 + assert("vs-610", get_key_offset(key)
46693 + && (get_key_offset(key) & (current_blocksize - 1)) == 0);
46694 + return key;
46695 +}
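+
+/*
+ * Example: for an extent item with key offset 8192 that addresses 3
+ * blocks of 4096 bytes, the append key offset is 8192 + 3 * 4096 =
+ * 20480, the first byte past the bytes addressed by the item, which the
+ * assertion above checks to be non-zero and block-aligned.
+ */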
46696 +
46697 +/* plugin->u.item.s.file.init_coord_extension */
46698 +void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
46699 +{
46700 + coord_t *coord;
46701 + struct extent_coord_extension *ext_coord;
46702 + reiser4_key key;
46703 + loff_t offset;
46704 +
46705 + assert("vs-1295", uf_coord->valid == 0);
46706 +
46707 + coord = &uf_coord->coord;
46708 + assert("vs-1288", coord_is_iplug_set(coord));
46709 + assert("vs-1327", znode_is_loaded(coord->node));
46710 +
46711 + if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
46712 + return;
46713 +
46714 + ext_coord = &uf_coord->extension.extent;
46715 + ext_coord->nr_units = nr_units_extent(coord);
46716 + ext_coord->ext_offset =
46717 + (char *)extent_by_coord(coord) - zdata(coord->node);
46718 + ext_coord->width = extent_get_width(extent_by_coord(coord));
46719 + ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
46720 + uf_coord->valid = 1;
46721 +
46722 + /* pos_in_unit is the only uninitialized field in extended coord */
46723 + if (coord->between == AFTER_UNIT) {
46724 + assert("vs-1330",
46725 + coord->unit_pos == nr_units_extent(coord) - 1);
46726 +
46727 + ext_coord->pos_in_unit = ext_coord->width - 1;
46728 + } else {
46729 + /* AT_UNIT */
46730 + unit_key_by_coord(coord, &key);
46731 + offset = get_key_offset(&key);
46732 +
46733 + assert("vs-1328", offset <= lookuped);
46734 + assert("vs-1329",
46735 + lookuped <
46736 + offset + ext_coord->width * current_blocksize);
46737 + ext_coord->pos_in_unit =
46738 + ((lookuped - offset) >> current_blocksize_bits);
46739 + }
46740 +}
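+
+/*
+ * A worked example of the AT_UNIT branch above, assuming 4096-byte
+ * blocks: if the unit key offset is 16384 and the lookup offset is
+ * 27000, then
+ *
+ *	pos_in_unit = (27000 - 16384) >> 12 = 2
+ *
+ * i.e. the offset falls into the third block of the unit.
+ */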
46741 +
46742 +/*
46743 + * Local variables:
46744 + * c-indentation-style: "K&R"
46745 + * mode-name: "LC"
46746 + * c-basic-offset: 8
46747 + * tab-width: 8
46748 + * fill-column: 79
46749 + * scroll-step: 1
46750 + * End:
46751 + */
46752 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.33/fs/reiser4/plugin/item/extent_flush_ops.c
46753 --- linux-2.6.33.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 01:00:00.000000000 +0100
46754 +++ linux-2.6.33/fs/reiser4/plugin/item/extent_flush_ops.c 2010-03-04 19:33:22.000000000 +0100
46755 @@ -0,0 +1,1028 @@
46756 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46757 +
46758 +#include "item.h"
46759 +#include "../../tree.h"
46760 +#include "../../jnode.h"
46761 +#include "../../super.h"
46762 +#include "../../flush.h"
46763 +#include "../../carry.h"
46764 +#include "../object.h"
46765 +
46766 +#include <linux/pagemap.h>
46767 +
46768 +static reiser4_block_nr extent_unit_start(const coord_t * item);
46769 +
46770 +/* Return either first or last extent (depending on @side) of the item
46771 + @coord is set to. Set @pos_in_unit either to first or to last block
46772 + of extent. */
46773 +static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
46774 + reiser4_block_nr * pos_in_unit)
46775 +{
46776 + reiser4_extent *ext;
46777 +
46778 + if (side == LEFT_SIDE) {
46779 + /* get first extent of item */
46780 + ext = extent_item(coord);
46781 + *pos_in_unit = 0;
46782 + } else {
46783 + /* get last extent of item and last position within it */
46784 + assert("vs-363", side == RIGHT_SIDE);
46785 + ext = extent_item(coord) + coord_last_unit_pos(coord);
46786 + *pos_in_unit = extent_get_width(ext) - 1;
46787 + }
46788 +
46789 + return ext;
46790 +}
46791 +
46792 +/* item_plugin->f.utmost_child */
46793 +/* Return the child. Coord is set to extent item. Find jnode corresponding
46794 + either to first or to last unformatted node pointed by the item */
46795 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
46796 +{
46797 + reiser4_extent *ext;
46798 + reiser4_block_nr pos_in_unit;
46799 +
46800 + ext = extent_utmost_ext(coord, side, &pos_in_unit);
46801 +
46802 + switch (state_of_extent(ext)) {
46803 + case HOLE_EXTENT:
46804 + *childp = NULL;
46805 + return 0;
46806 + case ALLOCATED_EXTENT:
46807 + case UNALLOCATED_EXTENT:
46808 + break;
46809 + default:
46810 + /* this should never happen */
46811 + assert("vs-1417", 0);
46812 + }
46813 +
46814 + {
46815 + reiser4_key key;
46816 + reiser4_tree *tree;
46817 + unsigned long index;
46818 +
46819 + if (side == LEFT_SIDE) {
46820 + /* get key of first byte addressed by the extent */
46821 + item_key_by_coord(coord, &key);
46822 + } else {
46823 +			/* get key of the byte next after the last byte addressed by the extent */
46824 + append_key_extent(coord, &key);
46825 + }
46826 +
46827 + assert("vs-544",
46828 + (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
46829 + /* index of first or last (depending on @side) page addressed
46830 + by the extent */
46831 + index =
46832 + (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
46833 + if (side == RIGHT_SIDE)
46834 + index--;
46835 +
46836 + tree = coord->node->zjnode.tree;
46837 + *childp = jlookup(tree, get_key_objectid(&key), index);
46838 + }
46839 +
46840 + return 0;
46841 +}
46842 +
46843 +/* item_plugin->f.utmost_child_real_block */
46844 +/* Return the child's block, if allocated. */
46845 +int
46846 +utmost_child_real_block_extent(const coord_t * coord, sideof side,
46847 + reiser4_block_nr * block)
46848 +{
46849 + reiser4_extent *ext;
46850 +
46851 + ext = extent_by_coord(coord);
46852 +
46853 + switch (state_of_extent(ext)) {
46854 + case ALLOCATED_EXTENT:
46855 + *block = extent_get_start(ext);
46856 + if (side == RIGHT_SIDE)
46857 + *block += extent_get_width(ext) - 1;
46858 + break;
46859 + case HOLE_EXTENT:
46860 + case UNALLOCATED_EXTENT:
46861 + *block = 0;
46862 + break;
46863 + default:
46864 + /* this should never happen */
46865 + assert("vs-1418", 0);
46866 + }
46867 +
46868 + return 0;
46869 +}
46870 +
46871 +/* item_plugin->f.scan */
46872 +/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
46873 + This scan continues, advancing the parent coordinate, until either it encounters a
46874 + formatted child or it finishes scanning this node.
46875 +
46876 + If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
46877 +   not sure this last property (same atom) is enforced, but it should be the case since
46878 + one atom must write the parent and the others must read the parent, thus fusing?). In
46879 + any case, the code below asserts this case for unallocated extents. Unallocated
46880 + extents are thus optimized because we can skip to the endpoint when scanning.
46881 +
46882 +   Control then returns to the caller of reiser4_scan_extent, which handles these
46883 +   terminating conditions, e.g., by loading the next twig.
46884 +*/
46885 +int reiser4_scan_extent(flush_scan * scan)
46886 +{
46887 + coord_t coord;
46888 + jnode *neighbor;
46889 + unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
46890 + reiser4_block_nr unit_start;
46891 + __u64 oid;
46892 + reiser4_key key;
46893 + int ret = 0, allocated, incr;
46894 + reiser4_tree *tree;
46895 +
46896 + if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
46897 + scan->stop = 1;
46898 + return 0; /* Race with truncate, this node is already
46899 + * truncated. */
46900 + }
46901 +
46902 + coord_dup(&coord, &scan->parent_coord);
46903 +
46904 + assert("jmacd-1404", !reiser4_scan_finished(scan));
46905 + assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
46906 + assert("jmacd-1406", jnode_is_unformatted(scan->node));
46907 +
46908 + /* The scan_index variable corresponds to the current page index of the
46909 + unformatted block scan position. */
46910 + scan_index = index_jnode(scan->node);
46911 +
46912 + assert("jmacd-7889", item_is_extent(&coord));
46913 +
46914 + repeat:
46915 + /* objectid of file */
46916 + oid = get_key_objectid(item_key_by_coord(&coord, &key));
46917 +
46918 + allocated = !extent_is_unallocated(&coord);
46919 + /* Get the values of this extent unit: */
46920 + unit_index = extent_unit_index(&coord);
46921 + unit_width = extent_unit_width(&coord);
46922 + unit_start = extent_unit_start(&coord);
46923 +
46924 + assert("jmacd-7187", unit_width > 0);
46925 + assert("jmacd-7188", scan_index >= unit_index);
46926 + assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
46927 +
46928 + /* Depending on the scan direction, we set different maximum values for scan_index
46929 + (scan_max) and the number of nodes that would be passed if the scan goes the
46930 + entire way (scan_dist). Incr is an integer reflecting the incremental
46931 + direction of scan_index. */
46932 + if (reiser4_scanning_left(scan)) {
46933 + scan_max = unit_index;
46934 + scan_dist = scan_index - unit_index;
46935 + incr = -1;
46936 + } else {
46937 + scan_max = unit_index + unit_width - 1;
46938 + scan_dist = scan_max - unit_index;
46939 + incr = +1;
46940 + }
46941 +
46942 + tree = coord.node->zjnode.tree;
46943 +
46944 + /* If the extent is allocated we have to check each of its blocks. If the extent
46945 + is unallocated we can skip to the scan_max. */
46946 + if (allocated) {
46947 + do {
46948 + neighbor = jlookup(tree, oid, scan_index);
46949 + if (neighbor == NULL)
46950 + goto stop_same_parent;
46951 +
46952 + if (scan->node != neighbor
46953 + && !reiser4_scan_goto(scan, neighbor)) {
46954 + /* @neighbor was jput() by reiser4_scan_goto */
46955 + goto stop_same_parent;
46956 + }
46957 +
46958 + ret = scan_set_current(scan, neighbor, 1, &coord);
46959 + if (ret != 0) {
46960 + goto exit;
46961 + }
46962 +
46963 + /* reference to @neighbor is stored in @scan, no need
46964 + to jput(). */
46965 + scan_index += incr;
46966 +
46967 + } while (incr + scan_max != scan_index);
46968 +
46969 + } else {
46970 + /* Optimized case for unallocated extents, skip to the end. */
46971 + neighbor = jlookup(tree, oid, scan_max /*index */ );
46972 + if (neighbor == NULL) {
46973 + /* Race with truncate */
46974 + scan->stop = 1;
46975 + ret = 0;
46976 + goto exit;
46977 + }
46978 +
46979 + assert("zam-1043",
46980 + reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
46981 +
46982 + ret = scan_set_current(scan, neighbor, scan_dist, &coord);
46983 + if (ret != 0) {
46984 + goto exit;
46985 + }
46986 + }
46987 +
46988 + if (coord_sideof_unit(&coord, scan->direction) == 0
46989 + && item_is_extent(&coord)) {
46990 + /* Continue as long as there are more extent units. */
46991 +
46992 + scan_index =
46993 + extent_unit_index(&coord) +
46994 + (reiser4_scanning_left(scan) ?
46995 + extent_unit_width(&coord) - 1 : 0);
46996 + goto repeat;
46997 + }
46998 +
46999 + if (0) {
47000 + stop_same_parent:
47001 +
47002 + /* If we are scanning left and we stop in the middle of an allocated
47003 +		   extent, we know the preceder immediately. */
47004 + /* middle of extent is (scan_index - unit_index) != 0. */
47005 + if (reiser4_scanning_left(scan) &&
47006 + (scan_index - unit_index) != 0) {
47007 + /* FIXME(B): Someone should step-through and verify that this preceder
47008 + calculation is indeed correct. */
47009 + /* @unit_start is starting block (number) of extent
47010 + unit. Flush stopped at the @scan_index block from
47011 + the beginning of the file, which is (scan_index -
47012 + unit_index) block within extent.
47013 + */
47014 + if (unit_start) {
47015 + /* skip preceder update when we are at hole */
47016 + scan->preceder_blk =
47017 + unit_start + scan_index - unit_index;
47018 + check_preceder(scan->preceder_blk);
47019 + }
47020 + }
47021 +
47022 + /* In this case, we leave coord set to the parent of scan->node. */
47023 + scan->stop = 1;
47024 +
47025 + } else {
47026 + /* In this case, we are still scanning, coord is set to the next item which is
47027 + either off-the-end of the node or not an extent. */
47028 + assert("jmacd-8912", scan->stop == 0);
47029 + assert("jmacd-7812",
47030 + (coord_is_after_sideof_unit(&coord, scan->direction)
47031 + || !item_is_extent(&coord)));
47032 + }
47033 +
47034 + ret = 0;
47035 + exit:
47036 + return ret;
47037 +}
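+
+/*
+ * A worked example of the preceder calculation above: when scanning
+ * left stops at scan_index 23 inside an allocated extent whose first
+ * file block is unit_index 20 and whose first disk block is unit_start
+ * 500, the preceder hint becomes 500 + 23 - 20 = 503.
+ */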
47038 +
47039 +/* ask block allocator for some blocks */
47040 +static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
47041 + reiser4_block_nr wanted_count,
47042 + reiser4_block_nr *first_allocated,
47043 + reiser4_block_nr *allocated,
47044 + block_stage_t block_stage)
47045 +{
47046 + *allocated = wanted_count;
47047 + preceder->max_dist = 0; /* scan whole disk, if needed */
47048 +
47049 + /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
47050 + preceder->block_stage = block_stage;
47051 +
47052 + /* FIXME: we do not handle errors here now */
47053 + check_me("vs-420",
47054 + reiser4_alloc_blocks(preceder, first_allocated, allocated,
47055 + BA_PERMANENT) == 0);
47056 + /* update flush_pos's preceder to last allocated block number */
47057 + preceder->blk = *first_allocated + *allocated - 1;
47058 +}
47059 +
47060 +/* when, at flush time, an unallocated extent is to be replaced with an allocated one, it may happen that one
47061 +   unallocated extent will have to be replaced with a set of allocated extents. In this case insert_into_item will be
47062 +   called, which may have to add new nodes to the tree. Space for that is taken from the inviolable reserve (5%). */
47063 +static reiser4_block_nr reserve_replace(void)
47064 +{
47065 + reiser4_block_nr grabbed, needed;
47066 +
47067 + grabbed = get_current_context()->grabbed_blocks;
47068 + needed = estimate_one_insert_into_item(current_tree);
47069 + check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
47070 + return grabbed;
47071 +}
47072 +
47073 +static void free_replace_reserved(reiser4_block_nr grabbed)
47074 +{
47075 + reiser4_context *ctx;
47076 +
47077 + ctx = get_current_context();
47078 + grabbed2free(ctx, get_super_private(ctx->super),
47079 + ctx->grabbed_blocks - grabbed);
47080 +}
47081 +
47082 +/* Block offset of first block addressed by unit */
47083 +__u64 extent_unit_index(const coord_t * item)
47084 +{
47085 + reiser4_key key;
47086 +
47087 + assert("vs-648", coord_is_existing_unit(item));
47088 + unit_key_by_coord(item, &key);
47089 + return get_key_offset(&key) >> current_blocksize_bits;
47090 +}
47091 +
47092 +/* AUDIT shouldn't return value be of reiser4_block_nr type?
47093 + Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
47094 +__u64 extent_unit_width(const coord_t * item)
47095 +{
47096 + assert("vs-649", coord_is_existing_unit(item));
47097 + return width_by_coord(item);
47098 +}
47099 +
47100 +/* Starting block location of this unit */
47101 +static reiser4_block_nr extent_unit_start(const coord_t * item)
47102 +{
47103 + return extent_get_start(extent_by_coord(item));
47104 +}
47105 +
47106 +/**
47107 + * split_allocated_extent -
47108 + * @coord:
47109 + * @pos_in_unit:
47110 + *
47111 + * replace allocated extent with two allocated extents
47112 + */
47113 +static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
47114 +{
47115 + int result;
47116 + struct replace_handle *h;
47117 + reiser4_extent *ext;
47118 + reiser4_block_nr grabbed;
47119 +
47120 + ext = extent_by_coord(coord);
47121 + assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
47122 + assert("vs-1411", extent_get_width(ext) > pos_in_unit);
47123 +
47124 + h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
47125 + if (h == NULL)
47126 + return RETERR(-ENOMEM);
47127 + h->coord = coord;
47128 + h->lh = znode_lh(coord->node);
47129 + h->pkey = &h->key;
47130 + unit_key_by_coord(coord, h->pkey);
47131 + set_key_offset(h->pkey,
47132 + (get_key_offset(h->pkey) +
47133 + pos_in_unit * current_blocksize));
47134 + reiser4_set_extent(&h->overwrite, extent_get_start(ext),
47135 + pos_in_unit);
47136 + reiser4_set_extent(&h->new_extents[0],
47137 + extent_get_start(ext) + pos_in_unit,
47138 + extent_get_width(ext) - pos_in_unit);
47139 + h->nr_new_extents = 1;
47140 + h->flags = COPI_DONT_SHIFT_LEFT;
47141 + h->paste_key = h->key;
47142 +
47143 + /* reserve space for extent unit paste, @grabbed is reserved before */
47144 + grabbed = reserve_replace();
47145 + result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
47146 + extent */);
47147 + /* restore reserved */
47148 + free_replace_reserved(grabbed);
47149 + kfree(h);
47150 + return result;
47151 +}
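+
+/*
+ * Example: splitting the allocated extent [start 100, width 8] at
+ * pos_in_unit 3 overwrites it with [start 100, width 3] and pastes
+ * [start 103, width 5] right after it, leaving the mapping unchanged.
+ */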
47152 +
47153 +/* replace extent @ext by extent @replace. Try to merge @replace with the previous extent of the item (if there is
47154 +   one). Return 1 if it succeeded, 0 otherwise */
47155 +static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
47156 + reiser4_extent *replace)
47157 +{
47158 + assert("vs-1415", extent_by_coord(coord) == ext);
47159 +
47160 + if (coord->unit_pos == 0
47161 + || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
47162 +		/* left neighbor either does not exist or is not an allocated extent */
47163 + return 0;
47164 + if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
47165 + extent_get_start(replace))
47166 + return 0;
47167 +
47168 + /* we can glue, widen previous unit */
47169 + extent_set_width(ext - 1,
47170 + extent_get_width(ext - 1) + extent_get_width(replace));
47171 +
47172 + if (extent_get_width(ext) != extent_get_width(replace)) {
47173 + /* make current extent narrower */
47174 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
47175 + extent_set_start(ext,
47176 + extent_get_start(ext) +
47177 + extent_get_width(replace));
47178 + extent_set_width(ext,
47179 + extent_get_width(ext) -
47180 + extent_get_width(replace));
47181 + } else {
47182 + /* current extent completely glued with its left neighbor, remove it */
47183 + coord_t from, to;
47184 +
47185 + coord_dup(&from, coord);
47186 + from.unit_pos = nr_units_extent(coord) - 1;
47187 + coord_dup(&to, &from);
47188 +
47189 +		/* currently, cutting from an extent can cut either from the beginning or from the end. Move the
47190 +		   place which got freed after unit removal to the end of the item */
47191 + memmove(ext, ext + 1,
47192 + (from.unit_pos -
47193 + coord->unit_pos) * sizeof(reiser4_extent));
47194 + /* wipe part of item which is going to be cut, so that node_check will not be confused */
47195 + cut_node_content(&from, &to, NULL, NULL, NULL);
47196 + }
47197 + znode_make_dirty(coord->node);
47198 + /* move coord back */
47199 + coord->unit_pos--;
47200 + return 1;
47201 +}
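+
+/*
+ * Example: with left neighbor [start 100, width 4] and @replace =
+ * [start 104, width 2] standing in for the head of @ext =
+ * [start 300, width 5], the neighbor grows to [start 100, width 6] and
+ * @ext narrows to [start 302, width 3]; had the widths been equal, @ext
+ * would have been removed instead.
+ */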
47202 +
47203 +/**
47204 + * conv_extent - replace extent with up to two extents
47205 + * @coord: coordinate of extent to be replaced
47206 + * @replace: extent to overwrite the one @coord is set to
47207 + *
47208 + * Overwrites the extent @coord is set to and pastes one extent unit after
47209 + * the overwritten one if @replace is shorter than the initial extent
47210 + */
47211 +static int conv_extent(coord_t *coord, reiser4_extent *replace)
47212 +{
47213 + int result;
47214 + struct replace_handle *h;
47215 + reiser4_extent *ext;
47216 + reiser4_block_nr start, width, new_width;
47217 + reiser4_block_nr grabbed;
47218 + extent_state state;
47219 +
47220 + ext = extent_by_coord(coord);
47221 + state = state_of_extent(ext);
47222 + start = extent_get_start(ext);
47223 + width = extent_get_width(ext);
47224 + new_width = extent_get_width(replace);
47225 +
47226 + assert("vs-1458", (state == UNALLOCATED_EXTENT ||
47227 + state == ALLOCATED_EXTENT));
47228 + assert("vs-1459", width >= new_width);
47229 +
47230 + if (try_to_merge_with_left(coord, ext, replace)) {
47231 + /* merged @replace with left neighbor. Current unit is either
47232 + removed or narrowed */
47233 + return 0;
47234 + }
47235 +
47236 + if (width == new_width) {
47237 + /* replace current extent with @replace */
47238 + *ext = *replace;
47239 + znode_make_dirty(coord->node);
47240 + return 0;
47241 + }
47242 +
47243 + h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
47244 + if (h == NULL)
47245 + return RETERR(-ENOMEM);
47246 + h->coord = coord;
47247 + h->lh = znode_lh(coord->node);
47248 + h->pkey = &h->key;
47249 + unit_key_by_coord(coord, h->pkey);
47250 + set_key_offset(h->pkey,
47251 + (get_key_offset(h->pkey) + new_width * current_blocksize));
47252 + h->overwrite = *replace;
47253 +
47254 + /* replace @ext with @replace and padding extent */
47255 + reiser4_set_extent(&h->new_extents[0],
47256 + (state == ALLOCATED_EXTENT) ?
47257 + (start + new_width) :
47258 + UNALLOCATED_EXTENT_START,
47259 + width - new_width);
47260 + h->nr_new_extents = 1;
47261 + h->flags = COPI_DONT_SHIFT_LEFT;
47262 + h->paste_key = h->key;
47263 +
47264 + /* reserve space for extent unit paste, @grabbed is reserved before */
47265 + grabbed = reserve_replace();
47266 + result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
47267 + extent */);
47268 +
47269 + /* restore reserved */
47270 + free_replace_reserved(grabbed);
47271 + kfree(h);
47272 + return result;
47273 +}
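+
+/*
+ * Example: converting the unallocated extent [width 7] with @replace =
+ * [start 200, width 7] overwrites the unit in place; with @replace =
+ * [start 200, width 4] the unit becomes [start 200, width 4] followed
+ * by a pasted unallocated extent of width 3.
+ */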
47274 +
47275 +/**
47276 + * assign_real_blocknrs
47277 + * @flush_pos:
47278 + * @oid: objectid of file jnodes to assign block number to belongs to
47279 + * @index: index of the first jnode in the range
47280 + * @count: number of jnodes to assign block numbers to
47281 + * @first: start of allocated block range
47282 + *
47283 + * Assigns block numbers to each of @count jnodes. Index of first jnode is
47284 + * @index. Jnodes are looked up with jlookup.
47285 + */
47286 +static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
47287 + unsigned long index, reiser4_block_nr count,
47288 + reiser4_block_nr first)
47289 +{
47290 + unsigned long i;
47291 + reiser4_tree *tree;
47292 + txn_atom *atom;
47293 + int nr;
47294 +
47295 + atom = atom_locked_by_fq(flush_pos->fq);
47296 + assert("vs-1468", atom);
47297 + BUG_ON(atom == NULL);
47298 +
47299 + nr = 0;
47300 + tree = current_tree;
47301 + for (i = 0; i < count; ++i, ++index) {
47302 + jnode *node;
47303 +
47304 + node = jlookup(tree, oid, index);
47305 + assert("", node != NULL);
47306 + BUG_ON(node == NULL);
47307 +
47308 + spin_lock_jnode(node);
47309 + assert("", !jnode_is_flushprepped(node));
47310 + assert("vs-1475", node->atom == atom);
47311 + assert("vs-1476", atomic_read(&node->x_count) > 0);
47312 +
47313 + JF_CLR(node, JNODE_FLUSH_RESERVED);
47314 + jnode_set_block(node, &first);
47315 + unformatted_make_reloc(node, flush_pos->fq);
47316 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
47317 + FQ_LIST, 0));
47318 + spin_unlock_jnode(node);
47319 + first++;
47320 +
47321 + atomic_dec(&node->x_count);
47322 + nr ++;
47323 + }
47324 +
47325 + spin_unlock_atom(atom);
47326 + return;
47327 +}
47328 +
47329 +/**
47330 + * make_node_ovrwr - assign node to overwrite set
47331 + * @jnodes: overwrite set list head
47332 + * @node: jnode to belong to overwrite set
47333 + *
47334 + * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes
47335 + * which is an accumulator for nodes before they get to overwrite set list of
47336 + * atom.
47337 + */
47338 +static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
47339 +{
47340 + spin_lock_jnode(node);
47341 +
47342 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
47343 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
47344 +
47345 + JF_SET(node, JNODE_OVRWR);
47346 + list_move_tail(&node->capture_link, jnodes);
47347 + ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
47348 +
47349 + spin_unlock_jnode(node);
47350 +}
47351 +
47352 +/**
47353 + * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
47354 + * @flush_pos: flush position
47355 + * @oid: objectid of file jnodes belong to
47356 + * @index: starting index
47357 + * @width: extent width
47358 + *
47359 + * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
47360 + * overwrite set. Starting from the one with index @index. If end of slum is
47361 + * detected (node is not found or flushprepped) - stop iterating and set flush
47362 + * position's state to POS_INVALID.
47363 + */
47364 +static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
47365 + unsigned long index, reiser4_block_nr width)
47366 +{
47367 + unsigned long i;
47368 + reiser4_tree *tree;
47369 + jnode *node;
47370 + txn_atom *atom;
47371 + LIST_HEAD(jnodes);
47372 +
47373 + tree = current_tree;
47374 +
47375 + atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
47376 + assert("vs-1478", atom);
47377 +
47378 + for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
47379 + node = jlookup(tree, oid, index);
47380 + if (!node) {
47381 + flush_pos->state = POS_INVALID;
47382 + break;
47383 + }
47384 + if (jnode_check_flushprepped(node)) {
47385 + flush_pos->state = POS_INVALID;
47386 + atomic_dec(&node->x_count);
47387 + break;
47388 + }
47389 + if (node->atom != atom) {
47390 + flush_pos->state = POS_INVALID;
47391 + atomic_dec(&node->x_count);
47392 + break;
47393 + }
47394 + make_node_ovrwr(&jnodes, node);
47395 + atomic_dec(&node->x_count);
47396 + }
47397 +
47398 + list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
47399 + spin_unlock_atom(atom);
47400 +}
47401 +
47402 +/**
47403 + * allocated_extent_slum_size
47404 + * @flush_pos:
47405 + * @oid:
47406 + * @index:
47407 + * @count:
47408 + *
47409 + *
47410 + */
47411 +static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
47412 + unsigned long index, unsigned long count)
47413 +{
47414 + unsigned long i;
47415 + reiser4_tree *tree;
47416 + txn_atom *atom;
47417 + int nr;
47418 +
47419 + atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
47420 + assert("vs-1468", atom);
47421 +
47422 + nr = 0;
47423 + tree = current_tree;
47424 + for (i = 0; i < count; ++i, ++index) {
47425 + jnode *node;
47426 +
47427 + node = jlookup(tree, oid, index);
47428 + if (!node)
47429 + break;
47430 +
47431 + if (jnode_check_flushprepped(node)) {
47432 + atomic_dec(&node->x_count);
47433 + break;
47434 + }
47435 +
47436 + if (node->atom != atom) {
47437 + /*
47438 + * this is possible on overwrite: extent_write may
47439 + * capture several unformatted nodes without capturing
47440 + * any formatted nodes.
47441 + */
47442 + atomic_dec(&node->x_count);
47443 + break;
47444 + }
47445 +
47446 + assert("vs-1476", atomic_read(&node->x_count) > 1);
47447 + atomic_dec(&node->x_count);
47448 + nr ++;
47449 + }
47450 +
47451 + spin_unlock_atom(atom);
47452 + return nr;
47453 +}
47454 +
47455 +/**
47456 + * reiser4_alloc_extent - allocate blocks for an extent being flushed
47457 + * @flush_pos: flush position within the extent unit
47458 + *
47459 + * This is called by handle_pos_on_twig to process the extent unit
47460 + * flush_pos->coord is set to. It prepares a sequence of not-flushprepped
47461 + * nodes (slum) for flushing. The slum is supposed to start at position
47462 + * flush_pos->pos_in_unit within the extent. The slum goes to the relocate
47463 + * set if flush_pos->leaf_relocate is set to 1, and to the overwrite set
47464 + * otherwise.
47465 + */
47466 +int reiser4_alloc_extent(flush_pos_t *flush_pos)
47467 +{
47468 + coord_t *coord;
47469 + reiser4_extent *ext;
47470 + reiser4_extent replace_ext;
47471 + oid_t oid;
47472 + reiser4_block_nr protected;
47473 + reiser4_block_nr start;
47474 + __u64 index;
47475 + __u64 width;
47476 + extent_state state;
47477 + int result;
47478 + reiser4_block_nr first_allocated;
47479 + __u64 allocated;
47480 + reiser4_key key;
47481 + block_stage_t block_stage;
47482 +
47483 + assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
47484 + assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
47485 + && item_is_extent(&flush_pos->coord));
47486 +
47487 + coord = &flush_pos->coord;
47488 +
47489 + ext = extent_by_coord(coord);
47490 + state = state_of_extent(ext);
47491 + if (state == HOLE_EXTENT) {
47492 + flush_pos->state = POS_INVALID;
47493 + return 0;
47494 + }
47495 +
47496 + item_key_by_coord(coord, &key);
47497 + oid = get_key_objectid(&key);
47498 + index = extent_unit_index(coord) + flush_pos->pos_in_unit;
47499 + start = extent_get_start(ext);
47500 + width = extent_get_width(ext);
47501 +
47502 + assert("vs-1457", width > flush_pos->pos_in_unit);
47503 +
47504 + if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
47505 + /* relocate */
47506 + if (flush_pos->pos_in_unit) {
47507 + /* split extent unit into two */
47508 + result =
47509 + split_allocated_extent(coord,
47510 + flush_pos->pos_in_unit);
47511 + flush_pos->pos_in_unit = 0;
47512 + return result;
47513 + }
47514 +
47515 + /* limit number of nodes to allocate */
47516 + if (flush_pos->nr_to_write < width)
47517 + width = flush_pos->nr_to_write;
47518 +
47519 + if (state == ALLOCATED_EXTENT) {
47520 + /*
47521 + * all protected nodes are not flushprepped, therefore
47522 + * they are counted as flush_reserved
47523 + */
47524 + block_stage = BLOCK_FLUSH_RESERVED;
47525 + protected = allocated_extent_slum_size(flush_pos, oid,
47526 + index, width);
47527 + if (protected == 0) {
47528 + flush_pos->state = POS_INVALID;
47529 + flush_pos->pos_in_unit = 0;
47530 + return 0;
47531 + }
47532 + } else {
47533 + block_stage = BLOCK_UNALLOCATED;
47534 + protected = width;
47535 + }
47536 +
47537 + /*
47538 + * look at previous unit if possible. If it is allocated, make
47539 + * preceder more precise
47540 + */
47541 + if (coord->unit_pos &&
47542 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47543 + reiser4_pos_hint(flush_pos)->blk =
47544 + extent_get_start(ext - 1) +
47545 + extent_get_width(ext - 1);
47546 +
47547 + /* allocate new block numbers for protected nodes */
47548 + extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47549 + protected,
47550 + &first_allocated, &allocated,
47551 + block_stage);
47552 +
47553 + if (state == ALLOCATED_EXTENT)
47554 + /*
47555 + * on relocating - free nodes which are going to be
47556 + * relocated
47557 + */
47558 + reiser4_dealloc_blocks(&start, &allocated,
47559 + BLOCK_ALLOCATED, BA_DEFER);
47560 +
47561 + /* assign new block numbers to protected nodes */
47562 + assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
47563 +
47564 + /* prepare extent which will replace current one */
47565 + reiser4_set_extent(&replace_ext, first_allocated, allocated);
47566 +
47567 + /* adjust extent item */
47568 + result = conv_extent(coord, &replace_ext);
47569 + if (result != 0 && result != -ENOMEM) {
47570 + warning("vs-1461",
47571 + "Failed to allocate extent. Should not happen\n");
47572 + return result;
47573 + }
47574 +
47575 + /*
47576 + * break flush: we prepared for flushing as many blocks as we
47577 + * were asked for
47578 + */
47579 + if (flush_pos->nr_to_write == allocated)
47580 + flush_pos->state = POS_INVALID;
47581 + } else {
47582 + /* overwrite */
47583 + mark_jnodes_overwrite(flush_pos, oid, index, width);
47584 + }
47585 + flush_pos->pos_in_unit = 0;
47586 + return 0;
47587 +}
47588 +
47589 +/* return 0 if @key is glueable to the item @coord is set to, 1 otherwise */
47590 +static int must_insert(const coord_t *coord, const reiser4_key *key)
47591 +{
47592 + reiser4_key last;
47593 +
47594 + if (item_id_by_coord(coord) == EXTENT_POINTER_ID
47595 + && keyeq(append_key_extent(coord, &last), key))
47596 + return 0;
47597 + return 1;
47598 +}
47599 +
47600 +/* copy extent @copy_ext to the end of @node. It may have to either insert a new item after the last one, append to
47601 + the last item, or widen the last unit of the last item */
47602 +static int put_unit_to_end(znode *node, const reiser4_key *key,
47603 + reiser4_extent *copy_ext)
47604 +{
47605 + int result;
47606 + coord_t coord;
47607 + cop_insert_flag flags;
47608 + reiser4_extent *last_ext;
47609 + reiser4_item_data data;
47610 +
47611 + /* set coord after last unit in an item */
47612 + coord_init_last_unit(&coord, node);
47613 + coord.between = AFTER_UNIT;
47614 +
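+	/* while inserting, neither shift anything to neighbors nor allocate new nodes */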
47615 + flags =
47616 + COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
47617 + if (must_insert(&coord, key)) {
47618 + result =
47619 + insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
47620 + key, NULL /*lh */ , flags);
47621 +
47622 + } else {
47623 + /* try to glue with last unit */
47624 + last_ext = extent_by_coord(&coord);
47625 + if (state_of_extent(last_ext) &&
47626 + extent_get_start(last_ext) + extent_get_width(last_ext) ==
47627 + extent_get_start(copy_ext)) {
47628 + /* widen last unit of node */
47629 + extent_set_width(last_ext,
47630 + extent_get_width(last_ext) +
47631 + extent_get_width(copy_ext));
47632 + znode_make_dirty(node);
47633 + return 0;
47634 + }
47635 +
47636 +		/* FIXME: put an assertion here that the last unit in @node and the new unit cannot be merged */
47637 + result =
47638 + insert_into_item(&coord, NULL /*lh */ , key,
47639 + init_new_extent(&data, copy_ext, 1),
47640 + flags);
47641 + }
47642 +
47643 + assert("vs-438", result == 0 || result == -E_NODE_FULL);
47644 + return result;
47645 +}
47646 +
47647 +/* @coord is set to extent unit */
47648 +squeeze_result squalloc_extent(znode *left, const coord_t *coord,
47649 + flush_pos_t *flush_pos,
47650 + reiser4_key *stop_key)
47651 +{
47652 + reiser4_extent *ext;
47653 + __u64 index;
47654 + __u64 width;
47655 + reiser4_block_nr start;
47656 + extent_state state;
47657 + oid_t oid;
47658 + reiser4_block_nr first_allocated;
47659 + __u64 allocated;
47660 + __u64 protected;
47661 + reiser4_extent copy_extent;
47662 + reiser4_key key;
47663 + int result;
47664 + block_stage_t block_stage;
47665 +
47666 + assert("vs-1457", flush_pos->pos_in_unit == 0);
47667 + assert("vs-1467", coord_is_leftmost_unit(coord));
47668 + assert("vs-1467", item_is_extent(coord));
47669 +
47670 + ext = extent_by_coord(coord);
47671 + index = extent_unit_index(coord);
47672 + start = extent_get_start(ext);
47673 + width = extent_get_width(ext);
47674 + state = state_of_extent(ext);
47675 + unit_key_by_coord(coord, &key);
47676 + oid = get_key_objectid(&key);
47677 +
47678 + if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
47679 + (state == UNALLOCATED_EXTENT)) {
47680 + /* relocate */
47681 + if (state == ALLOCATED_EXTENT) {
47682 + /* all protected nodes are not flushprepped, therefore
47683 + * they are counted as flush_reserved */
47684 + block_stage = BLOCK_FLUSH_RESERVED;
47685 + protected = allocated_extent_slum_size(flush_pos, oid,
47686 + index, width);
47687 + if (protected == 0) {
47688 + flush_pos->state = POS_INVALID;
47689 + flush_pos->pos_in_unit = 0;
47690 + return 0;
47691 + }
47692 + } else {
47693 + block_stage = BLOCK_UNALLOCATED;
47694 + protected = width;
47695 + }
47696 +
47697 + /*
47698 + * look at previous unit if possible. If it is allocated, make
47699 + * preceder more precise
47700 + */
47701 + if (coord->unit_pos &&
47702 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47703 + reiser4_pos_hint(flush_pos)->blk =
47704 + extent_get_start(ext - 1) +
47705 + extent_get_width(ext - 1);
47706 +
47707 + /* allocate new block numbers for protected nodes */
47708 + extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47709 + protected,
47710 + &first_allocated, &allocated,
47711 + block_stage);
47712 +
47713 + /* prepare extent which will be copied to left */
47714 + reiser4_set_extent(&copy_extent, first_allocated, allocated);
47715 +
47716 + result = put_unit_to_end(left, &key, &copy_extent);
47717 + if (result == -E_NODE_FULL) {
47718 + int target_block_stage;
47719 +
47720 + /* free blocks which were just allocated */
47721 + target_block_stage =
47722 + (state ==
47723 + ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
47724 + BLOCK_UNALLOCATED;
47725 + reiser4_dealloc_blocks(&first_allocated, &allocated,
47726 + target_block_stage,
47727 + BA_PERMANENT);
47728 +
47729 + /* rewind the preceder. */
47730 + flush_pos->preceder.blk = first_allocated;
47731 + check_preceder(flush_pos->preceder.blk);
47732 +
47733 + return SQUEEZE_TARGET_FULL;
47734 + }
47735 +
47736 + if (state == ALLOCATED_EXTENT) {
47737 + /* free nodes which were relocated */
47738 + reiser4_dealloc_blocks(&start, &allocated,
47739 + BLOCK_ALLOCATED, BA_DEFER);
47740 + }
47741 +
47742 + /* assign new block numbers to protected nodes */
47743 + assign_real_blocknrs(flush_pos, oid, index, allocated,
47744 + first_allocated);
47745 +
47746 + set_key_offset(&key,
47747 + get_key_offset(&key) +
47748 + (allocated << current_blocksize_bits));
47749 + } else {
47750 + /*
47751 + * overwrite: try to copy the unit as it is to the left neighbor
47752 + * and mark all leading not-flushprepped nodes as overwrite nodes
47753 + */
47754 + reiser4_set_extent(&copy_extent, start, width);
47755 + result = put_unit_to_end(left, &key, &copy_extent);
47756 + if (result == -E_NODE_FULL)
47757 + return SQUEEZE_TARGET_FULL;
47758 +
47759 + if (state != HOLE_EXTENT)
47760 + mark_jnodes_overwrite(flush_pos, oid, index, width);
47761 + set_key_offset(&key,
47762 + get_key_offset(&key) +
47763 + (width << current_blocksize_bits));
47764 + }
47765 + *stop_key = key;
47766 + return SQUEEZE_CONTINUE;
47767 +}
47768 +
47769 +int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
47770 +{
47771 + return key_by_inode_and_offset_common(inode, off, key);
47772 +}
47773 +
47774 +/*
47775 + * Local variables:
47776 + * c-indentation-style: "K&R"
47777 + * mode-name: "LC"
47778 + * c-basic-offset: 8
47779 + * tab-width: 8
47780 + * fill-column: 79
47781 + * scroll-step: 1
47782 + * End:
47783 + */
47784 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/extent.h linux-2.6.33/fs/reiser4/plugin/item/extent.h
47785 --- linux-2.6.33.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 01:00:00.000000000 +0100
47786 +++ linux-2.6.33/fs/reiser4/plugin/item/extent.h 2010-03-04 19:33:22.000000000 +0100
47787 @@ -0,0 +1,231 @@
47788 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47789 +
47790 +#ifndef __REISER4_EXTENT_H__
47791 +#define __REISER4_EXTENT_H__
47792 +
47793 +/* on disk extent */
47794 +typedef struct {
47795 + reiser4_dblock_nr start;
47796 + reiser4_dblock_nr width;
47797 +} reiser4_extent;
47798 +
47799 +struct extent_stat {
47800 + int unallocated_units;
47801 + int unallocated_blocks;
47802 + int allocated_units;
47803 + int allocated_blocks;
47804 + int hole_units;
47805 + int hole_blocks;
47806 +};
47807 +
47808 +/* extents in an extent item can be holes, unallocated extents or
47809 + allocated extents */
47810 +typedef enum {
47811 + HOLE_EXTENT,
47812 + UNALLOCATED_EXTENT,
47813 + ALLOCATED_EXTENT
47814 +} extent_state;
47815 +
47816 +#define HOLE_EXTENT_START 0
47817 +#define UNALLOCATED_EXTENT_START 1
47818 +#define UNALLOCATED_EXTENT_START2 2
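+/* the start field of an extent encodes its state: 0 means a hole, 1 or 2
+   means unallocated, larger values give the first block of an allocated extent */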
47819 +
47820 +struct extent_coord_extension {
47821 + reiser4_block_nr pos_in_unit;
47822 + reiser4_block_nr width; /* width of current unit */
47823 + pos_in_node_t nr_units; /* number of units */
47824 + int ext_offset; /* offset from the beginning of zdata() */
47825 + unsigned long expected_page;
47826 +#if REISER4_DEBUG
47827 + reiser4_extent extent;
47828 +#endif
47829 +};
47830 +
47831 +/* macros to set/get fields of on-disk extent */
47832 +static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47833 +{
47834 + return le64_to_cpu(ext->start);
47835 +}
47836 +
47837 +static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47838 +{
47839 + return le64_to_cpu(ext->width);
47840 +}
47841 +
47842 +extern __u64 reiser4_current_block_count(void);
47843 +
47844 +static inline void
47845 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47846 +{
47847 + cassert(sizeof(ext->start) == 8);
47848 + assert("nikita-2510",
47849 + ergo(start > 1, start < reiser4_current_block_count()));
47850 + put_unaligned(cpu_to_le64(start), &ext->start);
47851 +}
47852 +
47853 +static inline void
47854 +extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
47855 +{
47856 + cassert(sizeof(ext->width) == 8);
47857 + assert("", width > 0);
47858 + put_unaligned(cpu_to_le64(width), &ext->width);
47859 + assert("nikita-2511",
47860 + ergo(extent_get_start(ext) > 1,
47861 + extent_get_start(ext) + width <=
47862 + reiser4_current_block_count()));
47863 +}
47864 +
47865 +#define extent_item(coord) \
47866 +({ \
47867 + assert("nikita-3143", item_is_extent(coord)); \
47868 + ((reiser4_extent *)item_body_by_coord (coord)); \
47869 +})
47870 +
47871 +#define extent_by_coord(coord) \
47872 +({ \
47873 + assert("nikita-3144", item_is_extent(coord)); \
47874 + (extent_item (coord) + (coord)->unit_pos); \
47875 +})
47876 +
47877 +#define width_by_coord(coord) \
47878 +({ \
47879 + assert("nikita-3145", item_is_extent(coord)); \
47880 + extent_get_width (extent_by_coord(coord)); \
47881 +})
47882 +
47883 +struct carry_cut_data;
47884 +struct carry_kill_data;
47885 +
47886 +/* plugin->u.item.b.* */
47887 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
47888 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47889 + const reiser4_item_data *);
47890 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
47891 +pos_in_node_t nr_units_extent(const coord_t *);
47892 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
47893 +void init_coord_extent(coord_t *);
47894 +int init_extent(coord_t *, reiser4_item_data *);
47895 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
47896 +int can_shift_extent(unsigned free_space,
47897 + coord_t * source, znode * target, shift_direction,
47898 + unsigned *size, unsigned want);
47899 +void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
47900 + unsigned count, shift_direction where_is_free_space,
47901 + unsigned free_space);
47902 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
47903 + struct carry_kill_data *);
47904 +int create_hook_extent(const coord_t * coord, void *arg);
47905 +int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47906 + struct carry_cut_data *, reiser4_key * smallest_removed,
47907 + reiser4_key * new_first);
47908 +int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47909 + struct carry_kill_data *, reiser4_key * smallest_removed,
47910 + reiser4_key * new_first);
47911 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
47912 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
47913 +void print_extent(const char *, coord_t *);
47914 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
47915 +int utmost_child_real_block_extent(const coord_t * coord, sideof side,
47916 + reiser4_block_nr * block);
47917 +void item_stat_extent(const coord_t * coord, void *vp);
47918 +int reiser4_check_extent(const coord_t * coord, const char **error);
47919 +
47920 +/* plugin->u.item.s.file.* */
47921 +ssize_t reiser4_write_extent(struct file *, struct inode * inode,
47922 + const char __user *, size_t, loff_t *);
47923 +int reiser4_read_extent(struct file *, flow_t *, hint_t *);
47924 +int reiser4_readpage_extent(void *, struct page *);
47925 +int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
47926 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
47927 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
47928 +int get_block_address_extent(const coord_t *, sector_t block,
47929 + sector_t * result);
47930 +
47931 +/* these are used in flush.c
47932 + FIXME-VS: should they be somewhere in item_plugin? */
47933 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
47934 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
47935 + reiser4_key * stop_key);
47936 +
47937 +int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
47938 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
47939 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
47940 +
47941 +/* plugin->u.item.f. */
47942 +int reiser4_scan_extent(flush_scan * scan);
47943 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
47944 +
47945 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47946 + int nr_extents);
47947 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
47948 +extent_state state_of_extent(reiser4_extent * ext);
47949 +void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
47950 + reiser4_block_nr width);
47951 +int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
47952 + int *plugged_hole);
47953 +
47954 +#include "../../coord.h"
47955 +#include "../../lock.h"
47956 +#include "../../tap.h"
47957 +
47958 +struct replace_handle {
47959 + /* these are to be set before calling reiser4_replace_extent */
47960 + coord_t *coord;
47961 + lock_handle *lh;
47962 + reiser4_key key;
47963 + reiser4_key *pkey;
47964 + reiser4_extent overwrite;
47965 + reiser4_extent new_extents[2];
47966 + int nr_new_extents;
47967 + unsigned flags;
47968 +
47969 + /* these are used by reiser4_replace_extent */
47970 + reiser4_item_data item;
47971 + coord_t coord_after;
47972 + lock_handle lh_after;
47973 + tap_t watch;
47974 + reiser4_key paste_key;
47975 +#if REISER4_DEBUG
47976 + reiser4_extent orig_ext;
47977 + reiser4_key tmp;
47978 +#endif
47979 +};
47980 +
47981 +/* this structure is kmalloced before calling make_extent to avoid excessive
47982 + stack consumption on plug_hole->reiser4_replace_extent */
47983 +struct make_extent_handle {
47984 + uf_coord_t *uf_coord;
47985 + reiser4_block_nr blocknr;
47986 + int created;
47987 + struct inode *inode;
47988 + union {
47989 + struct {
47990 + } append;
47991 + struct replace_handle replace;
47992 + } u;
47993 +};
47994 +
47995 +int reiser4_replace_extent(struct replace_handle *,
47996 + int return_inserted_position);
47997 +lock_handle *znode_lh(znode *);
47998 +
47999 +/* the reiser4 repacker support */
48000 +struct repacker_cursor;
48001 +extern int process_extent_backward_for_repacking(tap_t *,
48002 + struct repacker_cursor *);
48003 +extern int mark_extent_for_repacking(tap_t *, int);
48004 +
48005 +#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
48006 +#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
48007 +
48008 +/* __REISER4_EXTENT_H__ */
48009 +#endif
48010 +/*
48011 + Local variables:
48012 + c-indentation-style: "K&R"
48013 + mode-name: "LC"
48014 + c-basic-offset: 8
48015 + tab-width: 8
48016 + fill-column: 120
48017 + End:
48018 +*/
48019 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.33/fs/reiser4/plugin/item/extent_item_ops.c
48020 --- linux-2.6.33.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 01:00:00.000000000 +0100
48021 +++ linux-2.6.33/fs/reiser4/plugin/item/extent_item_ops.c 2010-03-04 19:33:22.000000000 +0100
48022 @@ -0,0 +1,889 @@
48023 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48024 +
48025 +#include "item.h"
48026 +#include "../../inode.h"
48027 +#include "../../tree_walk.h" /* check_sibling_list() */
48028 +#include "../../page_cache.h"
48029 +#include "../../carry.h"
48030 +
48031 +#include <linux/quotaops.h>
48032 +
48033 +/* item_plugin->b.max_key_inside */
48034 +reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
48035 +{
48036 + item_key_by_coord(coord, key);
48037 + set_key_offset(key, get_key_offset(reiser4_max_key()));
48038 + return key;
48039 +}
48040 +
48041 +/* item_plugin->b.can_contain_key
48042 + this checks whether @key of @data matches the position set by @coord */
48043 +int
48044 +can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
48045 + const reiser4_item_data * data)
48046 +{
48047 + reiser4_key item_key;
48048 +
48049 + if (item_plugin_by_coord(coord) != data->iplug)
48050 + return 0;
48051 +
48052 + item_key_by_coord(coord, &item_key);
48053 + if (get_key_locality(key) != get_key_locality(&item_key) ||
48054 + get_key_objectid(key) != get_key_objectid(&item_key) ||
48055 + get_key_ordering(key) != get_key_ordering(&item_key))
48056 + return 0;
48057 +
48058 + return 1;
48059 +}
48060 +
48061 +/* item_plugin->b.mergeable
48062 + first item is of extent type */
48063 +/* Audited by: green(2002.06.13) */
48064 +int mergeable_extent(const coord_t * p1, const coord_t * p2)
48065 +{
48066 + reiser4_key key1, key2;
48067 +
48068 + assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
48069 + /* FIXME-VS: Which is it? Assert or return 0 */
48070 + if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
48071 + return 0;
48072 + }
48073 +
48074 + item_key_by_coord(p1, &key1);
48075 + item_key_by_coord(p2, &key2);
48076 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
48077 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
48078 + get_key_ordering(&key1) != get_key_ordering(&key2) ||
48079 + get_key_type(&key1) != get_key_type(&key2))
48080 + return 0;
48081 + if (get_key_offset(&key1) +
48082 + reiser4_extent_size(p1, nr_units_extent(p1)) !=
48083 + get_key_offset(&key2))
48084 + return 0;
48085 + return 1;
48086 +}
48087 +
48088 +/* item_plugin->b.nr_units */
48089 +pos_in_node_t nr_units_extent(const coord_t * coord)
48090 +{
48091 +	/* the length of an extent item has to be a multiple of the extent size */
48092 + assert("vs-1424",
48093 + (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
48094 + return item_length_by_coord(coord) / sizeof(reiser4_extent);
48095 +}
48096 +
48097 +/* item_plugin->b.lookup */
48098 +lookup_result
48099 +lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
48100 + coord_t * coord)
48101 +{ /* znode and item_pos are
48102 + set to an extent item to
48103 + look through */
48104 + reiser4_key item_key;
48105 + reiser4_block_nr lookuped, offset;
48106 + unsigned i, nr_units;
48107 + reiser4_extent *ext;
48108 + unsigned blocksize;
48109 + unsigned char blocksize_bits;
48110 +
48111 + item_key_by_coord(coord, &item_key);
48112 + offset = get_key_offset(&item_key);
48113 +
48114 + /* key we are looking for must be greater than key of item @coord */
48115 + assert("vs-414", keygt(key, &item_key));
48116 +
48117 + assert("umka-99945",
48118 + !keygt(key, max_key_inside_extent(coord, &item_key)));
48119 +
48120 + ext = extent_item(coord);
48121 + assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
48122 +
48123 + blocksize = current_blocksize;
48124 + blocksize_bits = current_blocksize_bits;
48125 +
48126 + /* offset we are looking for */
48127 + lookuped = get_key_offset(key);
48128 +
48129 + nr_units = nr_units_extent(coord);
48130 +	/* go through all extents until the one which addresses the given offset */
48131 + for (i = 0; i < nr_units; i++, ext++) {
48132 + offset += (extent_get_width(ext) << blocksize_bits);
48133 + if (offset > lookuped) {
48134 + /* desired byte is somewhere in this extent */
48135 + coord->unit_pos = i;
48136 + coord->between = AT_UNIT;
48137 + return CBK_COORD_FOUND;
48138 + }
48139 + }
48140 +
48141 + /* set coord after last unit */
48142 + coord->unit_pos = nr_units - 1;
48143 + coord->between = AFTER_UNIT;
48144 + return CBK_COORD_FOUND;
48145 +}
48146 +
48147 +/* item_plugin->b.paste
48148 + the item @coord is set to has been appended with @data->length bytes of
48149 + free space. data->data contains the data to be pasted into the item at
48150 + position @coord->in_item.unit_pos. It must fit into that free space.
48151 + @coord must be set between units.
48152 +*/
48153 +int
48154 +paste_extent(coord_t * coord, reiser4_item_data * data,
48155 + carry_plugin_info * info UNUSED_ARG)
48156 +{
48157 + unsigned old_nr_units;
48158 + reiser4_extent *ext;
48159 + int item_length;
48160 +
48161 + ext = extent_item(coord);
48162 + item_length = item_length_by_coord(coord);
48163 + old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
48164 +
48165 + /* this is also used to copy extent into newly created item, so
48166 + old_nr_units could be 0 */
48167 + assert("vs-260", item_length >= data->length);
48168 +
48169 + /* make sure that coord is set properly */
48170 + assert("vs-35",
48171 + ((!coord_is_existing_unit(coord))
48172 + || (!old_nr_units && !coord->unit_pos)));
48173 +
48174 + /* first unit to be moved */
48175 + switch (coord->between) {
48176 + case AFTER_UNIT:
48177 + coord->unit_pos++;
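+		/* fall through */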
48178 + case BEFORE_UNIT:
48179 + coord->between = AT_UNIT;
48180 + break;
48181 + case AT_UNIT:
48182 + assert("vs-331", !old_nr_units && !coord->unit_pos);
48183 + break;
48184 + default:
48185 + impossible("vs-330", "coord is set improperly");
48186 + }
48187 +
48188 + /* prepare space for new units */
48189 + memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
48190 + ext + coord->unit_pos,
48191 + (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
48192 +
48193 + /* copy new data from kernel space */
48194 + assert("vs-556", data->user == 0);
48195 + memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
48196 +
48197 + /* after paste @coord is set to first of pasted units */
48198 + assert("vs-332", coord_is_existing_unit(coord));
48199 + assert("vs-333",
48200 + !memcmp(data->data, extent_by_coord(coord),
48201 + (unsigned)data->length));
48202 + return 0;
48203 +}
48204 +
48205 +/* item_plugin->b.can_shift */
48206 +int
48207 +can_shift_extent(unsigned free_space, coord_t * source,
48208 + znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
48209 + unsigned *size, unsigned want)
48210 +{
48211 + *size = item_length_by_coord(source);
48212 + if (*size > free_space)
48213 + /* never split a unit of extent item */
48214 + *size = free_space - free_space % sizeof(reiser4_extent);
48215 +
48216 +	/* we can shift *size bytes; calculate how many we actually want to shift */
48217 + if (*size > want * sizeof(reiser4_extent))
48218 + *size = want * sizeof(reiser4_extent);
48219 +
48220 + if (*size % sizeof(reiser4_extent) != 0)
48221 + impossible("vs-119", "Wrong extent size: %i %zd", *size,
48222 + sizeof(reiser4_extent));
48223 + return *size / sizeof(reiser4_extent);
48224 +
48225 +}
48226 +
48227 +/* item_plugin->b.copy_units */
48228 +void
48229 +copy_units_extent(coord_t * target, coord_t * source,
48230 + unsigned from, unsigned count,
48231 + shift_direction where_is_free_space, unsigned free_space)
48232 +{
48233 + char *from_ext, *to_ext;
48234 +
48235 + assert("vs-217", free_space == count * sizeof(reiser4_extent));
48236 +
48237 + from_ext = item_body_by_coord(source);
48238 + to_ext = item_body_by_coord(target);
48239 +
48240 + if (where_is_free_space == SHIFT_LEFT) {
48241 + assert("vs-215", from == 0);
48242 +
48243 + /* At this moment, item length was already updated in the item
48244 + header by shifting code, hence nr_units_extent() will
48245 + return "new" number of units---one we obtain after copying
48246 + units.
48247 + */
48248 + to_ext +=
48249 + (nr_units_extent(target) - count) * sizeof(reiser4_extent);
48250 + } else {
48251 + reiser4_key key;
48252 + coord_t coord;
48253 +
48254 + assert("vs-216",
48255 + from + count == coord_last_unit_pos(source) + 1);
48256 +
48257 + from_ext += item_length_by_coord(source) - free_space;
48258 +
48259 + /* new units are inserted before first unit in an item,
48260 + therefore, we have to update item key */
48261 + coord = *source;
48262 + coord.unit_pos = from;
48263 + unit_key_extent(&coord, &key);
48264 +
48265 + node_plugin_by_node(target->node)->update_item_key(target, &key,
48266 + NULL /*info */);
48267 + }
48268 +
48269 + memcpy(to_ext, from_ext, free_space);
48270 +}
48271 +
48272 +/* item_plugin->b.create_hook
48273 + @arg is znode of leaf node for which we need to update right delimiting key */
48274 +int create_hook_extent(const coord_t * coord, void *arg)
48275 +{
48276 + coord_t *child_coord;
48277 + znode *node;
48278 + reiser4_key key;
48279 + reiser4_tree *tree;
48280 +
48281 + if (!arg)
48282 + return 0;
48283 +
48284 + child_coord = arg;
48285 + tree = znode_get_tree(coord->node);
48286 +
48287 + assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
48288 +
48289 + write_lock_tree(tree);
48290 + write_lock_dk(tree);
48291 +	/* find a node on the leaf level whose right delimiting key has to
48292 +	   be updated */
48293 + if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
48294 + assert("vs-411", znode_is_left_connected(child_coord->node));
48295 + node = child_coord->node->left;
48296 + } else {
48297 + assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
48298 + node = child_coord->node;
48299 + assert("nikita-3314", node != NULL);
48300 + }
48301 +
48302 + if (node != NULL) {
48303 + znode_set_rd_key(node, item_key_by_coord(coord, &key));
48304 +
48305 + assert("nikita-3282", check_sibling_list(node));
48306 + /* break sibling links */
48307 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
48308 + ON_DEBUG(node->right->left_version =
48309 + atomic_inc_return(&delim_key_version);
48310 + node->right_version =
48311 + atomic_inc_return(&delim_key_version););
48312 +
48313 + node->right->left = NULL;
48314 + node->right = NULL;
48315 + }
48316 + }
48317 + write_unlock_dk(tree);
48318 + write_unlock_tree(tree);
48319 + return 0;
48320 +}
48321 +
48322 +#define ITEM_TAIL_KILLED 0
48323 +#define ITEM_HEAD_KILLED 1
48324 +#define ITEM_KILLED 2
48325 +
48326 +/* item_plugin->b.kill_hook
48327 + this is called when @count units starting from @from-th one are going to be removed
48328 + */
48329 +int
48330 +kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
48331 + struct carry_kill_data *kdata)
48332 +{
48333 + reiser4_extent *ext;
48334 + reiser4_block_nr start, length;
48335 + const reiser4_key *pfrom_key, *pto_key;
48336 + struct inode *inode;
48337 + reiser4_tree *tree;
48338 + pgoff_t from_off, to_off, offset, skip;
48339 + int retval;
48340 +
48341 + /* these are located in memory kmalloc-ed by kill_node_content */
48342 + reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
48343 + coord_t *dup, *next;
48344 +
48345 + assert("zam-811", znode_is_write_locked(coord->node));
48346 + assert("nikita-3315", kdata != NULL);
48347 + assert("vs-34", kdata->buf != NULL);
48348 +
48349 + /* map structures to kdata->buf */
48350 + min_item_key = (reiser4_key *) (kdata->buf);
48351 + max_item_key = min_item_key + 1;
48352 + from_key = max_item_key + 1;
48353 + to_key = from_key + 1;
48354 + key = to_key + 1;
48355 + dup = (coord_t *) (key + 1);
48356 + next = dup + 1;
48357 +
48358 + item_key_by_coord(coord, min_item_key);
48359 + max_item_key_by_coord(coord, max_item_key);
48360 +
48361 + if (kdata->params.from_key) {
48362 + pfrom_key = kdata->params.from_key;
48363 + pto_key = kdata->params.to_key;
48364 + } else {
48365 + assert("vs-1549", from == coord->unit_pos);
48366 + unit_key_by_coord(coord, from_key);
48367 + pfrom_key = from_key;
48368 +
48369 + coord_dup(dup, coord);
48370 + dup->unit_pos = from + count - 1;
48371 + max_unit_key_by_coord(dup, to_key);
48372 + pto_key = to_key;
48373 + }
48374 +
48375 + if (!keylt(pto_key, max_item_key)) {
48376 + if (!keygt(pfrom_key, min_item_key)) {
48377 + znode *left, *right;
48378 +
48379 + /* item is to be removed completely */
48380 + assert("nikita-3316", kdata->left != NULL
48381 + && kdata->right != NULL);
48382 +
48383 + left = kdata->left->node;
48384 + right = kdata->right->node;
48385 +
48386 + tree = current_tree;
48387 + /* we have to do two things:
48388 + *
48389 + * 1. link left and right formatted neighbors of
48390 + * extent being removed, and
48391 + *
48392 + * 2. update their delimiting keys.
48393 + *
48394 + * atomicity of these operations is protected by
48395 + * taking dk-lock and tree-lock.
48396 + */
48397 + /* if neighbors of item being removed are znodes -
48398 + * link them */
48399 + write_lock_tree(tree);
48400 + write_lock_dk(tree);
48401 + link_left_and_right(left, right);
48402 + if (left) {
48403 + /* update right delimiting key of left
48404 + * neighbor of extent item */
48405 + /*coord_t next;
48406 + reiser4_key key; */
48407 +
48408 + coord_dup(next, coord);
48409 +
48410 + if (coord_next_item(next))
48411 + *key = *znode_get_rd_key(coord->node);
48412 + else
48413 + item_key_by_coord(next, key);
48414 + znode_set_rd_key(left, key);
48415 + }
48416 + write_unlock_dk(tree);
48417 + write_unlock_tree(tree);
48418 +
48419 + from_off =
48420 + get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
48421 + to_off =
48422 + (get_key_offset(max_item_key) +
48423 + 1) >> PAGE_CACHE_SHIFT;
48424 + retval = ITEM_KILLED;
48425 + } else {
48426 + /* tail of item is to be removed */
48427 + from_off =
48428 + (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
48429 + 1) >> PAGE_CACHE_SHIFT;
48430 + to_off =
48431 + (get_key_offset(max_item_key) +
48432 + 1) >> PAGE_CACHE_SHIFT;
48433 + retval = ITEM_TAIL_KILLED;
48434 + }
48435 + } else {
48436 + /* head of item is to be removed */
48437 + assert("vs-1571", keyeq(pfrom_key, min_item_key));
48438 + assert("vs-1572",
48439 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
48440 + 0);
48441 + assert("vs-1573",
48442 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48443 + 1)) == 0);
48444 +
48445 + if (kdata->left->node) {
48446 + /* update right delimiting key of left neighbor of extent item */
48447 + /*reiser4_key key; */
48448 +
48449 + *key = *pto_key;
48450 + set_key_offset(key, get_key_offset(pto_key) + 1);
48451 +
48452 + write_lock_dk(current_tree);
48453 + znode_set_rd_key(kdata->left->node, key);
48454 + write_unlock_dk(current_tree);
48455 + }
48456 +
48457 + from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
48458 + to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
48459 + retval = ITEM_HEAD_KILLED;
48460 + }
48461 +
48462 + inode = kdata->inode;
48463 + assert("vs-1545", inode != NULL);
48464 + if (inode != NULL)
48465 + /* take care of pages and jnodes corresponding to part of item being killed */
48466 + reiser4_invalidate_pages(inode->i_mapping, from_off,
48467 + to_off - from_off,
48468 + kdata->params.truncate);
48469 +
48470 + ext = extent_item(coord) + from;
48471 + offset =
48472 + (get_key_offset(min_item_key) +
48473 + reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
48474 +
48475 + assert("vs-1551", from_off >= offset);
48476 + assert("vs-1552", from_off - offset <= extent_get_width(ext));
48477 + skip = from_off - offset;
48478 + offset = from_off;
48479 +
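+	/* walk the units being killed: release quota and disk space, skipping holes */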
48480 + while (offset < to_off) {
48481 + length = extent_get_width(ext) - skip;
48482 + if (state_of_extent(ext) == HOLE_EXTENT) {
48483 + skip = 0;
48484 + offset += length;
48485 + ext++;
48486 + continue;
48487 + }
48488 +
48489 + if (offset + length > to_off) {
48490 + length = to_off - offset;
48491 + }
48492 +
48493 + vfs_dq_free_block_nodirty(inode, length);
48494 +
48495 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48496 +			/* blocks of this unallocated extent were fake-allocated only; return them to free space */
48497 + fake_allocated2free(length, 0 /* unformatted */ );
48498 +
48499 + skip = 0;
48500 + offset += length;
48501 + ext++;
48502 + continue;
48503 + }
48504 +
48505 + assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
48506 +
48507 + if (length != 0) {
48508 + start = extent_get_start(ext) + skip;
48509 +
48510 + /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
48511 + immediately */
48512 + reiser4_dealloc_blocks(&start, &length,
48513 + 0 /* not used */ ,
48514 + BA_DEFER
48515 + /* unformatted with defer */ );
48516 + }
48517 + skip = 0;
48518 + offset += length;
48519 + ext++;
48520 + }
48521 + return retval;
48522 +}
48523 +
48524 +/* item_plugin->b.kill_units */
48525 +int
48526 +kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48527 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
48528 + reiser4_key * new_first)
48529 +{
48530 + reiser4_extent *ext;
48531 + reiser4_key item_key;
48532 + pos_in_node_t count;
48533 + reiser4_key from_key, to_key;
48534 + const reiser4_key *pfrom_key, *pto_key;
48535 + loff_t off;
48536 + int result;
48537 +
48538 + assert("vs-1541",
48539 + ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
48540 + || (kdata->params.from_key != NULL
48541 + && kdata->params.to_key != NULL)));
48542 +
48543 + if (kdata->params.from_key) {
48544 + pfrom_key = kdata->params.from_key;
48545 + pto_key = kdata->params.to_key;
48546 + } else {
48547 + coord_t dup;
48548 +
48549 + /* calculate key range of kill */
48550 + assert("vs-1549", from == coord->unit_pos);
48551 + unit_key_by_coord(coord, &from_key);
48552 + pfrom_key = &from_key;
48553 +
48554 + coord_dup(&dup, coord);
48555 + dup.unit_pos = to;
48556 + max_unit_key_by_coord(&dup, &to_key);
48557 + pto_key = &to_key;
48558 + }
48559 +
48560 + item_key_by_coord(coord, &item_key);
48561 +
48562 +#if REISER4_DEBUG
48563 + {
48564 + reiser4_key max_item_key;
48565 +
48566 + max_item_key_by_coord(coord, &max_item_key);
48567 +
48568 + if (new_first) {
48569 + /* head of item is to be cut */
48570 + assert("vs-1542", keyeq(pfrom_key, &item_key));
48571 + assert("vs-1538", keylt(pto_key, &max_item_key));
48572 + } else {
48573 + /* tail of item is to be cut */
48574 + assert("vs-1540", keygt(pfrom_key, &item_key));
48575 + assert("vs-1543", !keylt(pto_key, &max_item_key));
48576 + }
48577 + }
48578 +#endif
48579 +
48580 + if (smallest_removed)
48581 + *smallest_removed = *pfrom_key;
48582 +
48583 + if (new_first) {
48584 + /* item head is cut. Item key will change. This new key is calculated here */
48585 + assert("vs-1556",
48586 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48587 + (PAGE_CACHE_SIZE - 1));
48588 + *new_first = *pto_key;
48589 + set_key_offset(new_first, get_key_offset(new_first) + 1);
48590 + }
48591 +
48592 + count = to - from + 1;
48593 + result = kill_hook_extent(coord, from, count, kdata);
48594 + if (result == ITEM_TAIL_KILLED) {
48595 + assert("vs-1553",
48596 + get_key_offset(pfrom_key) >=
48597 + get_key_offset(&item_key) +
48598 + reiser4_extent_size(coord, from));
48599 + off =
48600 + get_key_offset(pfrom_key) -
48601 + (get_key_offset(&item_key) +
48602 + reiser4_extent_size(coord, from));
48603 + if (off) {
48604 + /* unit @from is to be cut partially. Its width decreases */
48605 + ext = extent_item(coord) + from;
48606 + extent_set_width(ext,
48607 + (off + PAGE_CACHE_SIZE -
48608 + 1) >> PAGE_CACHE_SHIFT);
48609 + count--;
48610 + }
48611 + } else {
48612 + __u64 max_to_offset;
48613 + __u64 rest;
48614 +
48615 + assert("vs-1575", result == ITEM_HEAD_KILLED);
48616 + assert("", from == 0);
48617 + assert("",
48618 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48619 + 1)) == 0);
48620 + assert("",
48621 + get_key_offset(pto_key) + 1 >
48622 + get_key_offset(&item_key) +
48623 + reiser4_extent_size(coord, to));
48624 + max_to_offset =
48625 + get_key_offset(&item_key) +
48626 + reiser4_extent_size(coord, to + 1) - 1;
48627 + assert("", get_key_offset(pto_key) <= max_to_offset);
48628 +
48629 + rest =
48630 + (max_to_offset -
48631 + get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
48632 + if (rest) {
48633 + /* unit @to is to be cut partially */
48634 + ext = extent_item(coord) + to;
48635 +
48636 + assert("", extent_get_width(ext) > rest);
48637 +
48638 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
48639 + extent_set_start(ext,
48640 + extent_get_start(ext) +
48641 + (extent_get_width(ext) -
48642 + rest));
48643 +
48644 + extent_set_width(ext, rest);
48645 + count--;
48646 + }
48647 + }
48648 + return count * sizeof(reiser4_extent);
48649 +}
48650 +
48651 +/* item_plugin->b.cut_units
48652 + this is too similar to kill_units_extent */
48653 +int
48654 +cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48655 + struct carry_cut_data *cdata, reiser4_key * smallest_removed,
48656 + reiser4_key * new_first)
48657 +{
48658 + reiser4_extent *ext;
48659 + reiser4_key item_key;
48660 + pos_in_node_t count;
48661 + reiser4_key from_key, to_key;
48662 + const reiser4_key *pfrom_key, *pto_key;
48663 + loff_t off;
48664 +
48665 + assert("vs-1541",
48666 + ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
48667 + || (cdata->params.from_key != NULL
48668 + && cdata->params.to_key != NULL)));
48669 +
48670 + if (cdata->params.from_key) {
48671 + pfrom_key = cdata->params.from_key;
48672 + pto_key = cdata->params.to_key;
48673 + } else {
48674 + coord_t dup;
48675 +
48676 + /* calculate key range of kill */
48677 + coord_dup(&dup, coord);
48678 + dup.unit_pos = from;
48679 + unit_key_by_coord(&dup, &from_key);
48680 +
48681 + dup.unit_pos = to;
48682 + max_unit_key_by_coord(&dup, &to_key);
48683 +
48684 + pfrom_key = &from_key;
48685 + pto_key = &to_key;
48686 + }
48687 +
48688 + assert("vs-1555",
48689 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
48690 + assert("vs-1556",
48691 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48692 + (PAGE_CACHE_SIZE - 1));
48693 +
48694 + item_key_by_coord(coord, &item_key);
48695 +
48696 +#if REISER4_DEBUG
48697 + {
48698 + reiser4_key max_item_key;
48699 +
48700 + assert("vs-1584",
48701 + get_key_locality(pfrom_key) ==
48702 + get_key_locality(&item_key));
48703 + assert("vs-1585",
48704 + get_key_type(pfrom_key) == get_key_type(&item_key));
48705 + assert("vs-1586",
48706 + get_key_objectid(pfrom_key) ==
48707 + get_key_objectid(&item_key));
48708 + assert("vs-1587",
48709 + get_key_ordering(pfrom_key) ==
48710 + get_key_ordering(&item_key));
48711 +
48712 + max_item_key_by_coord(coord, &max_item_key);
48713 +
48714 + if (new_first != NULL) {
48715 + /* head of item is to be cut */
48716 + assert("vs-1542", keyeq(pfrom_key, &item_key));
48717 + assert("vs-1538", keylt(pto_key, &max_item_key));
48718 + } else {
48719 + /* tail of item is to be cut */
48720 + assert("vs-1540", keygt(pfrom_key, &item_key));
48721 + assert("vs-1543", keyeq(pto_key, &max_item_key));
48722 + }
48723 + }
48724 +#endif
48725 +
48726 + if (smallest_removed)
48727 + *smallest_removed = *pfrom_key;
48728 +
48729 + if (new_first) {
48730 + /* item head is cut. Item key will change. This new key is calculated here */
48731 + *new_first = *pto_key;
48732 + set_key_offset(new_first, get_key_offset(new_first) + 1);
48733 + }
48734 +
48735 + count = to - from + 1;
48736 +
48737 + assert("vs-1553",
48738 + get_key_offset(pfrom_key) >=
48739 + get_key_offset(&item_key) + reiser4_extent_size(coord, from));
48740 + off =
48741 + get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
48742 + reiser4_extent_size(coord, from));
48743 + if (off) {
48744 + /* tail of unit @from is to be cut partially. Its width decreases */
48745 + assert("vs-1582", new_first == NULL);
48746 + ext = extent_item(coord) + from;
48747 + extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
48748 + count--;
48749 + }
48750 +
48751 + assert("vs-1554",
48752 + get_key_offset(pto_key) <=
48753 + get_key_offset(&item_key) +
48754 + reiser4_extent_size(coord, to + 1) - 1);
48755 + off =
48756 + (get_key_offset(&item_key) +
48757 + reiser4_extent_size(coord, to + 1) - 1) -
48758 + get_key_offset(pto_key);
48759 + if (off) {
48760 + /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
48761 + and width decreased. */
48762 + assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
48763 + ext = extent_item(coord) + to;
48764 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
48765 + extent_set_start(ext,
48766 + extent_get_start(ext) +
48767 + (extent_get_width(ext) -
48768 + (off >> PAGE_CACHE_SHIFT)));
48769 +
48770 + extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
48771 + count--;
48772 + }
48773 + return count * sizeof(reiser4_extent);
48774 +}
48775 +
48776 +/* item_plugin->b.unit_key */
48777 +reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
48778 +{
48779 + assert("vs-300", coord_is_existing_unit(coord));
48780 +
48781 + item_key_by_coord(coord, key);
48782 + set_key_offset(key,
48783 + (get_key_offset(key) +
48784 + reiser4_extent_size(coord, coord->unit_pos)));
48785 +
48786 + return key;
48787 +}
48788 +
48789 +/* item_plugin->b.max_unit_key */
48790 +reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
48791 +{
48792 + assert("vs-300", coord_is_existing_unit(coord));
48793 +
48794 + item_key_by_coord(coord, key);
48795 + set_key_offset(key,
48796 + (get_key_offset(key) +
48797 + reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
48798 + return key;
48799 +}
48800 +
48801 +/* item_plugin->b.estimate
48802 + item_plugin->b.item_data_by_flow */
48803 +
48804 +#if REISER4_DEBUG
48805 +
48806 +/* item_plugin->b.check
48807 + used for debugging, every item should have here the most complete
48808 + possible check of the consistency of the item that the inventor can
48809 + construct
48810 +*/
48811 +int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
48812 + const char **error /* where to store error message */)
48813 +{
48814 + reiser4_extent *ext, *first;
48815 + unsigned i, j;
48816 + reiser4_block_nr start, width, blk_cnt;
48817 + unsigned num_units;
48818 + reiser4_tree *tree;
48819 + oid_t oid;
48820 + reiser4_key key;
48821 + coord_t scan;
48822 +
48823 + assert("vs-933", REISER4_DEBUG);
48824 +
48825 + if (znode_get_level(coord->node) != TWIG_LEVEL) {
48826 + *error = "Extent on the wrong level";
48827 + return -1;
48828 + }
48829 + if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
48830 + *error = "Wrong item size";
48831 + return -1;
48832 + }
48833 + ext = first = extent_item(coord);
48834 + blk_cnt = reiser4_block_count(reiser4_get_current_sb());
48835 + num_units = coord_num_units(coord);
48836 + tree = znode_get_tree(coord->node);
48837 + item_key_by_coord(coord, &key);
48838 + oid = get_key_objectid(&key);
48839 + coord_dup(&scan, coord);
48840 +
48841 + for (i = 0; i < num_units; ++i, ++ext) {
48842 + __u64 index;
48843 +
48844 + scan.unit_pos = i;
48845 + index = extent_unit_index(&scan);
48846 +
48847 +#if 0
48848 + /* check that all jnodes are present for the unallocated
48849 + * extent */
48850 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48851 + for (j = 0; j < extent_get_width(ext); j++) {
48852 + jnode *node;
48853 +
48854 + node = jlookup(tree, oid, index + j);
48855 + if (node == NULL) {
48856 + print_coord("scan", &scan, 0);
48857 + *error = "Jnode missing";
48858 + return -1;
48859 + }
48860 + jput(node);
48861 + }
48862 + }
48863 +#endif
48864 +
48865 + start = extent_get_start(ext);
48866 + if (start < 2)
48867 + continue;
48868 +		/* this is an allocated extent */
48869 + width = extent_get_width(ext);
48870 + if (start >= blk_cnt) {
48871 + *error = "Start too large";
48872 + return -1;
48873 + }
48874 + if (start + width > blk_cnt) {
48875 + *error = "End too large";
48876 + return -1;
48877 + }
48878 +		/* make sure that this extent does not overlap with other
48879 +		   allocated extents */
48880 + for (j = 0; j < i; j++) {
48881 + if (state_of_extent(first + j) != ALLOCATED_EXTENT)
48882 + continue;
48883 + if (!
48884 + ((extent_get_start(ext) >=
48885 + extent_get_start(first + j) +
48886 + extent_get_width(first + j))
48887 + || (extent_get_start(ext) +
48888 + extent_get_width(ext) <=
48889 + extent_get_start(first + j)))) {
48890 + *error = "Extent overlaps with others";
48891 + return -1;
48892 + }
48893 + }
48894 +
48895 + }
48896 +
48897 + return 0;
48898 +}
48899 +
48900 +#endif /* REISER4_DEBUG */
48901 +
48902 +/*
48903 + Local variables:
48904 + c-indentation-style: "K&R"
48905 + mode-name: "LC"
48906 + c-basic-offset: 8
48907 + tab-width: 8
48908 + fill-column: 120
48909 + scroll-step: 1
48910 + End:
48911 +*/
48912 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/internal.c linux-2.6.33/fs/reiser4/plugin/item/internal.c
48913 --- linux-2.6.33.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 01:00:00.000000000 +0100
48914 +++ linux-2.6.33/fs/reiser4/plugin/item/internal.c 2010-03-04 19:33:22.000000000 +0100
48915 @@ -0,0 +1,404 @@
48916 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48917 +
48918 +/* Implementation of internal-item plugin methods. */
48919 +
48920 +#include "../../forward.h"
48921 +#include "../../debug.h"
48922 +#include "../../dformat.h"
48923 +#include "../../key.h"
48924 +#include "../../coord.h"
48925 +#include "internal.h"
48926 +#include "item.h"
48927 +#include "../node/node.h"
48928 +#include "../plugin.h"
48929 +#include "../../jnode.h"
48930 +#include "../../znode.h"
48931 +#include "../../tree_walk.h"
48932 +#include "../../tree_mod.h"
48933 +#include "../../tree.h"
48934 +#include "../../super.h"
48935 +#include "../../block_alloc.h"
48936 +
48937 +/* see internal.h for explanation */
48938 +
48939 +/* plugin->u.item.b.mergeable */
48940 +int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
48941 + const coord_t * p2 UNUSED_ARG /* second item */ )
48942 +{
48943 + /* internal items are not mergeable */
48944 + return 0;
48945 +}
48946 +
48947 +/* ->lookup() method for internal items */
48948 +lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
48949 + lookup_bias bias UNUSED_ARG /* lookup bias */ ,
48950 + coord_t * coord /* coord of item */ )
48951 +{
48952 + reiser4_key ukey;
48953 +
48954 + switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
48955 + default:
48956 + impossible("", "keycmp()?!");
48957 + case LESS_THAN:
48958 +		/* FIXME-VS: AFTER_ITEM used to be here. But with the new coord
48959 +		   the item plugin can not be obtained from a coord set this way */
48960 + assert("vs-681", coord->unit_pos == 0);
48961 + coord->between = AFTER_UNIT;
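+		/* fall through */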
48962 + case EQUAL_TO:
48963 + return CBK_COORD_FOUND;
48964 + case GREATER_THAN:
48965 + return CBK_COORD_NOTFOUND;
48966 + }
48967 +}
48968 +
48969 +/* return body of internal item at @coord */
48970 +static internal_item_layout *internal_at(const coord_t * coord /* coord of
48971 + * item */ )
48972 +{
48973 + assert("nikita-607", coord != NULL);
48974 + assert("nikita-1650",
48975 + item_plugin_by_coord(coord) ==
48976 + item_plugin_by_id(NODE_POINTER_ID));
48977 + return (internal_item_layout *) item_body_by_coord(coord);
48978 +}
48979 +
48980 +void reiser4_update_internal(const coord_t * coord,
48981 + const reiser4_block_nr * blocknr)
48982 +{
48983 + internal_item_layout *item = internal_at(coord);
48984 + assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
48985 +
48986 + put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
48987 +}
48988 +
48989 +/* return child block number stored in the internal item at @coord */
48990 +static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
48991 +{
48992 + assert("nikita-608", coord != NULL);
48993 + return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
48994 +}
48995 +
48996 +/* get znode pointed to by internal @item */
48997 +static znode *znode_at(const coord_t * item /* coord of item */ ,
48998 + znode * parent /* parent node */ )
48999 +{
49000 + return child_znode(item, parent, 1, 0);
49001 +}
49002 +
49003 +/* store pointer from internal item into "block". Implementation of
49004 + ->down_link() method */
49005 +void down_link_internal(const coord_t * coord /* coord of item */ ,
49006 + const reiser4_key * key UNUSED_ARG /* key to get
49007 + * pointer for */ ,
49008 + reiser4_block_nr * block /* resulting block number */ )
49009 +{
49010 + ON_DEBUG(reiser4_key item_key);
49011 +
49012 + assert("nikita-609", coord != NULL);
49013 + assert("nikita-611", block != NULL);
49014 + assert("nikita-612", (key == NULL) ||
49015 + /* twig horrors */
49016 + (znode_get_level(coord->node) == TWIG_LEVEL)
49017 + || keyle(item_key_by_coord(coord, &item_key), key));
49018 +
49019 + *block = pointer_at(coord);
49020 + assert("nikita-2960", reiser4_blocknr_is_sane(block));
49021 +}
49022 +
49023 +/* Get the child's block number, or 0 if the block is unallocated. */
49024 +int
49025 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
49026 + reiser4_block_nr * block)
49027 +{
49028 + assert("jmacd-2059", coord != NULL);
49029 +
49030 + *block = pointer_at(coord);
49031 + assert("nikita-2961", reiser4_blocknr_is_sane(block));
49032 +
49033 + if (reiser4_blocknr_is_fake(block)) {
49034 + *block = 0;
49035 + }
49036 +
49037 + return 0;
49038 +}
49039 +
49040 +/* Return the child. */
49041 +int
49042 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
49043 + jnode ** childp)
49044 +{
49045 + reiser4_block_nr block = pointer_at(coord);
49046 + znode *child;
49047 +
49048 + assert("jmacd-2059", childp != NULL);
49049 + assert("nikita-2962", reiser4_blocknr_is_sane(&block));
49050 +
49051 + child = zlook(znode_get_tree(coord->node), &block);
49052 +
49053 + if (IS_ERR(child)) {
49054 + return PTR_ERR(child);
49055 + }
49056 +
49057 + *childp = ZJNODE(child);
49058 +
49059 + return 0;
49060 +}
49061 +
49062 +#if REISER4_DEBUG
49063 +
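+/* verify the sibling pointers between @left and @right: each connected node's right neighbor must point back to it */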
49064 +static void check_link(znode * left, znode * right)
49065 +{
49066 + znode *scan;
49067 +
49068 + for (scan = left; scan != right; scan = scan->right) {
49069 + if (ZF_ISSET(scan, JNODE_RIP))
49070 + break;
49071 + if (znode_is_right_connected(scan) && scan->right != NULL) {
49072 + if (ZF_ISSET(scan->right, JNODE_RIP))
49073 + break;
49074 + assert("nikita-3285",
49075 + znode_is_left_connected(scan->right));
49076 + assert("nikita-3265",
49077 + ergo(scan != left,
49078 + ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
49079 + assert("nikita-3284", scan->right->left == scan);
49080 + } else
49081 + break;
49082 + }
49083 +}
49084 +
49085 +int check__internal(const coord_t * coord, const char **error)
49086 +{
49087 + reiser4_block_nr blk;
49088 + znode *child;
49089 + coord_t cpy;
49090 +
49091 + blk = pointer_at(coord);
49092 + if (!reiser4_blocknr_is_sane(&blk)) {
49093 + *error = "Invalid pointer";
49094 + return -1;
49095 + }
49096 + coord_dup(&cpy, coord);
49097 + child = znode_at(&cpy, cpy.node);
49098 + if (child != NULL) {
49099 + znode *left_child;
49100 + znode *right_child;
49101 +
49102 + left_child = right_child = NULL;
49103 +
49104 + assert("nikita-3256", znode_invariant(child));
49105 + if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
49106 + left_child = znode_at(&cpy, cpy.node);
49107 + if (left_child != NULL) {
49108 + read_lock_tree(znode_get_tree(child));
49109 + check_link(left_child, child);
49110 + read_unlock_tree(znode_get_tree(child));
49111 + zput(left_child);
49112 + }
49113 + }
49114 + coord_dup(&cpy, coord);
49115 + if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
49116 + right_child = znode_at(&cpy, cpy.node);
49117 + if (right_child != NULL) {
49118 + read_lock_tree(znode_get_tree(child));
49119 + check_link(child, right_child);
49120 + read_unlock_tree(znode_get_tree(child));
49121 + zput(right_child);
49122 + }
49123 + }
49124 + zput(child);
49125 + }
49126 + return 0;
49127 +}
49128 +
49129 +#endif /* REISER4_DEBUG */
49130 +
49131 +/* return true only if this item really points to "block" */
49132 +/* Audited by: green(2002.06.14) */
49133 +int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
49134 + const reiser4_block_nr * block /* block number to
49135 + * check */ )
49136 +{
49137 + assert("nikita-613", coord != NULL);
49138 + assert("nikita-614", block != NULL);
49139 +
49140 + return pointer_at(coord) == *block;
49141 +}
49142 +
49143 +/* hook called by ->create_item() method of node plugin after new internal
49144 + item was just created.
49145 +
49146 + This is the point where the pointer to the new node is inserted into the tree. Initialize
49147 + the parent pointer in the child znode, insert the child into the sibling list and the slum.
49148 +
49149 +*/
49150 +int create_hook_internal(const coord_t * item /* coord of item */ ,
49151 + void *arg /* child's left neighbor, if any */ )
49152 +{
49153 + znode *child;
49154 + __u64 child_ptr;
49155 +
49156 + assert("nikita-1252", item != NULL);
49157 + assert("nikita-1253", item->node != NULL);
49158 + assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
49159 + assert("nikita-1450", item->unit_pos == 0);
49160 +
49161 + /*
49162 + * preparing to item insertion build_child_ptr_data sets pointer to
49163 + * data to be inserted to jnode's blocknr which is in cpu byte
49164 + * order. Node's create_item simply copied those data. As result we
49165 + * have child pointer in cpu's byte order. Convert content of internal
49166 + * item to little endian byte order.
49167 + */
49168 + child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
49169 + reiser4_update_internal(item, &child_ptr);
49170 +
49171 + child = znode_at(item, item->node);
49172 + if (child != NULL && !IS_ERR(child)) {
49173 + znode *left;
49174 + int result = 0;
49175 + reiser4_tree *tree;
49176 +
49177 + left = arg;
49178 + tree = znode_get_tree(item->node);
49179 + write_lock_tree(tree);
49180 + write_lock_dk(tree);
49181 + assert("nikita-1400", (child->in_parent.node == NULL)
49182 + || (znode_above_root(child->in_parent.node)));
49183 + ++item->node->c_count;
49184 + coord_to_parent_coord(item, &child->in_parent);
49185 + sibling_list_insert_nolock(child, left);
49186 +
49187 + assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
49188 + ZF_CLR(child, JNODE_ORPHAN);
49189 +
49190 + if ((left != NULL) && !keyeq(znode_get_rd_key(left),
49191 + znode_get_rd_key(child))) {
49192 + znode_set_rd_key(child, znode_get_rd_key(left));
49193 + }
49194 + write_unlock_dk(tree);
49195 + write_unlock_tree(tree);
49196 + zput(child);
49197 + return result;
49198 + } else {
49199 + if (child == NULL)
49200 + child = ERR_PTR(-EIO);
49201 + return PTR_ERR(child);
49202 + }
49203 +}
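+
+/* A sketch of the byte-order fixup above (helper spellings illustrative,
+   not part of this patch): the item body initially holds the child block
+   number in CPU byte order, and reiser4_update_internal() re-stores it in
+   the on-disk little-endian format, conceptually:
+
+	__u64 cpu_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
+	put_unaligned(cpu_to_le64(cpu_ptr),
+		      (__le64 *)item_body_by_coord(item));
+*/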
49204 +
49205 +/* hook called by the ->cut_and_kill() method of the node plugin just before
49206 +   an internal item is removed.
49207 +
49208 +   This is the point where an empty node is removed from the tree. Clear the
49209 +   parent pointer in the child, and mark the node for pending deletion.
49210 +
49211 +   The node will actually be deleted later, in several stages:
49212 +
49213 +   . when the last lock on this node is released, the node is removed from
49214 +   the sibling list and its lock is invalidated
49215 +
49216 +   . when the last reference to this node is dropped, the bitmap is updated
49217 +   and the node is actually removed from memory.
49218 +
49219 +*/
49220 +int kill_hook_internal(const coord_t * item /* coord of item */ ,
49221 + pos_in_node_t from UNUSED_ARG /* start unit */ ,
49222 +		       pos_in_node_t count UNUSED_ARG /* number of units */ ,
49223 + struct carry_kill_data *p UNUSED_ARG)
49224 +{
49225 + znode *child;
49226 + int result = 0;
49227 +
49228 + assert("nikita-1222", item != NULL);
49229 + assert("nikita-1224", from == 0);
49230 + assert("nikita-1225", count == 1);
49231 +
49232 + child = znode_at(item, item->node);
49233 + if (child == NULL)
49234 + return 0;
49235 + if (IS_ERR(child))
49236 + return PTR_ERR(child);
49237 + result = zload(child);
49238 + if (result) {
49239 + zput(child);
49240 + return result;
49241 + }
49242 + if (node_is_empty(child)) {
49243 + reiser4_tree *tree;
49244 +
49245 + assert("nikita-1397", znode_is_write_locked(child));
49246 + assert("nikita-1398", child->c_count == 0);
49247 + assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
49248 +
49249 + tree = znode_get_tree(item->node);
49250 + write_lock_tree(tree);
49251 + init_parent_coord(&child->in_parent, NULL);
49252 + --item->node->c_count;
49253 + write_unlock_tree(tree);
49254 + } else {
49255 + warning("nikita-1223",
49256 + "Cowardly refuse to remove link to non-empty node");
49257 + result = RETERR(-EIO);
49258 + }
49259 + zrelse(child);
49260 + zput(child);
49261 + return result;
49262 +}
49263 +
49264 +/* hook called by the ->shift() node plugin method when an internal item has
49265 +   just been moved from one node to another.
49266 +
49267 +   Update the parent pointer in the child and the c_counts in the old and new
49268 +   parents.
49269 +*/
49270 +int shift_hook_internal(const coord_t * item /* coord of item */ ,
49271 + unsigned from UNUSED_ARG /* start unit */ ,
49272 +			unsigned count UNUSED_ARG /* number of units */ ,
49273 + znode * old_node /* old parent */ )
49274 +{
49275 + znode *child;
49276 + znode *new_node;
49277 + reiser4_tree *tree;
49278 +
49279 + assert("nikita-1276", item != NULL);
49280 + assert("nikita-1277", from == 0);
49281 + assert("nikita-1278", count == 1);
49282 + assert("nikita-1451", item->unit_pos == 0);
49283 +
49284 + new_node = item->node;
49285 + assert("nikita-2132", new_node != old_node);
49286 + tree = znode_get_tree(item->node);
49287 + child = child_znode(item, old_node, 1, 0);
49288 + if (child == NULL)
49289 + return 0;
49290 + if (!IS_ERR(child)) {
49291 + write_lock_tree(tree);
49292 + ++new_node->c_count;
49293 + assert("nikita-1395", znode_parent(child) == old_node);
49294 + assert("nikita-1396", old_node->c_count > 0);
49295 + coord_to_parent_coord(item, &child->in_parent);
49296 + assert("nikita-1781", znode_parent(child) == new_node);
49297 + assert("nikita-1782",
49298 + check_tree_pointer(item, child) == NS_FOUND);
49299 + --old_node->c_count;
49300 + write_unlock_tree(tree);
49301 + zput(child);
49302 + return 0;
49303 + } else
49304 + return PTR_ERR(child);
49305 +}
49306 +
49307 +/* plugin->u.item.b.max_key_inside - not defined */
49308 +
49309 +/* plugin->u.item.b.nr_units - item.c:single_unit */
49310 +
49311 +/* Make Linus happy.
49312 + Local variables:
49313 + c-indentation-style: "K&R"
49314 + mode-name: "LC"
49315 + c-basic-offset: 8
49316 + tab-width: 8
49317 + fill-column: 120
49318 + End:
49319 +*/
49320 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/internal.h linux-2.6.33/fs/reiser4/plugin/item/internal.h
49321 --- linux-2.6.33.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 01:00:00.000000000 +0100
49322 +++ linux-2.6.33/fs/reiser4/plugin/item/internal.h 2010-03-04 19:33:22.000000000 +0100
49323 @@ -0,0 +1,57 @@
49324 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49325 +/* Internal item contains down-link to the child of the internal/twig
49326 + node in a tree. It is internal items that are actually used during
49327 + tree traversal. */
49328 +
49329 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
49330 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
49331 +
49332 +#include "../../forward.h"
49333 +#include "../../dformat.h"
49334 +
49335 +/* on-disk layout of internal item */
49336 +typedef struct internal_item_layout {
49337 + /* 0 */ reiser4_dblock_nr pointer;
49338 + /* 4 */
49339 +} internal_item_layout;
49340 +
49341 +struct cut_list;
49342 +
49343 +int mergeable_internal(const coord_t * p1, const coord_t * p2);
49344 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
49345 + coord_t * coord);
49346 +/* store pointer from internal item into "block". Implementation of
49347 + ->down_link() method */
49348 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
49349 + reiser4_block_nr * block);
49350 +extern int has_pointer_to_internal(const coord_t * coord,
49351 + const reiser4_block_nr * block);
49352 +extern int create_hook_internal(const coord_t * item, void *arg);
49353 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
49354 + pos_in_node_t count, struct carry_kill_data *);
49355 +extern int shift_hook_internal(const coord_t * item, unsigned from,
49356 + unsigned count, znode * old_node);
49357 +extern void reiser4_print_internal(const char *prefix, coord_t * coord);
49358 +
49359 +extern int utmost_child_internal(const coord_t * coord, sideof side,
49360 + jnode ** child);
49361 +int utmost_child_real_block_internal(const coord_t * coord, sideof side,
49362 + reiser4_block_nr * block);
49363 +
49364 +extern void reiser4_update_internal(const coord_t * coord,
49365 + const reiser4_block_nr * blocknr);
49366 +/* FIXME: reiserfs has check_internal */
49367 +extern int check__internal(const coord_t * coord, const char **error);
49368 +
49369 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
49370 +#endif
49371 +
49372 +/* Make Linus happy.
49373 + Local variables:
49374 + c-indentation-style: "K&R"
49375 + mode-name: "LC"
49376 + c-basic-offset: 8
49377 + tab-width: 8
49378 + fill-column: 120
49379 + End:
49380 +*/
49381 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/item.c linux-2.6.33/fs/reiser4/plugin/item/item.c
49382 --- linux-2.6.33.orig/fs/reiser4/plugin/item/item.c 1970-01-01 01:00:00.000000000 +0100
49383 +++ linux-2.6.33/fs/reiser4/plugin/item/item.c 2010-03-04 19:33:22.000000000 +0100
49384 @@ -0,0 +1,719 @@
49385 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49386 +
49387 +/* definition of item plugins. */
49388 +
49389 +#include "../../forward.h"
49390 +#include "../../debug.h"
49391 +#include "../../key.h"
49392 +#include "../../coord.h"
49393 +#include "../plugin_header.h"
49394 +#include "sde.h"
49395 +#include "internal.h"
49396 +#include "item.h"
49397 +#include "static_stat.h"
49398 +#include "../plugin.h"
49399 +#include "../../znode.h"
49400 +#include "../../tree.h"
49401 +#include "../../context.h"
49402 +#include "ctail.h"
49403 +
49404 +/* return pointer to item body */
49405 +void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
49406 +{
49407 + assert("nikita-324", coord != NULL);
49408 + assert("nikita-325", coord->node != NULL);
49409 + assert("nikita-326", znode_is_loaded(coord->node));
49410 + assert("nikita-3200", coord->offset == INVALID_OFFSET);
49411 +
49412 + coord->offset =
49413 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
49414 + zdata(coord->node);
49415 + ON_DEBUG(coord->body_v = coord->node->times_locked);
49416 +}
49417 +
49418 +void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
49419 +{
49420 + return zdata(coord->node) + coord->offset;
49421 +}
49422 +
49423 +#if REISER4_DEBUG
49424 +
49425 +int item_body_is_valid(const coord_t * coord)
49426 +{
49427 + return
49428 + coord->offset ==
49429 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
49430 + zdata(coord->node);
49431 +}
49432 +
49433 +#endif
49434 +
49435 +/* return length of item at @coord */
49436 +pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
49437 +{
49438 + int len;
49439 +
49440 + assert("nikita-327", coord != NULL);
49441 + assert("nikita-328", coord->node != NULL);
49442 + assert("nikita-329", znode_is_loaded(coord->node));
49443 +
49444 + len = node_plugin_by_node(coord->node)->length_by_coord(coord);
49445 + return len;
49446 +}
49447 +
49448 +void obtain_item_plugin(const coord_t * coord)
49449 +{
49450 + assert("nikita-330", coord != NULL);
49451 + assert("nikita-331", coord->node != NULL);
49452 + assert("nikita-332", znode_is_loaded(coord->node));
49453 +
49454 + coord_set_iplug((coord_t *) coord,
49455 + node_plugin_by_node(coord->node)->
49456 + plugin_by_coord(coord));
49457 + assert("nikita-2479",
49458 + coord_iplug(coord) ==
49459 + node_plugin_by_node(coord->node)->plugin_by_coord(coord));
49460 +}
49461 +
49462 +/* return id of item */
49463 +/* Audited by: green(2002.06.15) */
49464 +item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
49465 +{
49466 + assert("vs-539", coord != NULL);
49467 + assert("vs-538", coord->node != NULL);
49468 + assert("vs-537", znode_is_loaded(coord->node));
49469 + assert("vs-536", item_plugin_by_coord(coord) != NULL);
49470 + assert("vs-540",
49471 + item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
49472 +
49473 + return item_id_by_plugin(item_plugin_by_coord(coord));
49474 +}
49475 +
49476 +/* return key of item at @coord */
49477 +/* Audited by: green(2002.06.15) */
49478 +reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
49479 + reiser4_key * key /* result */ )
49480 +{
49481 + assert("nikita-338", coord != NULL);
49482 + assert("nikita-339", coord->node != NULL);
49483 + assert("nikita-340", znode_is_loaded(coord->node));
49484 +
49485 + return node_plugin_by_node(coord->node)->key_at(coord, key);
49486 +}
49487 +
49488 +/* this returns max key in the item */
49489 +reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
49490 + reiser4_key * key /* result */ )
49491 +{
49492 + coord_t last;
49493 +
49494 + assert("nikita-338", coord != NULL);
49495 + assert("nikita-339", coord->node != NULL);
49496 + assert("nikita-340", znode_is_loaded(coord->node));
49497 +
49498 +	/* make coord point to the item's last unit */
49499 + coord_dup(&last, coord);
49500 + last.unit_pos = coord_num_units(&last) - 1;
49501 + assert("vs-1560", coord_is_existing_unit(&last));
49502 +
49503 + max_unit_key_by_coord(&last, key);
49504 + return key;
49505 +}
49506 +
49507 +/* return key of unit at @coord */
49508 +reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49509 + reiser4_key * key /* result */ )
49510 +{
49511 + assert("nikita-772", coord != NULL);
49512 + assert("nikita-774", coord->node != NULL);
49513 + assert("nikita-775", znode_is_loaded(coord->node));
49514 +
49515 + if (item_plugin_by_coord(coord)->b.unit_key != NULL)
49516 + return item_plugin_by_coord(coord)->b.unit_key(coord, key);
49517 + else
49518 + return item_key_by_coord(coord, key);
49519 +}
49520 +
49521 +/* return the biggest key contained in the unit @coord */
49522 +reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49523 + reiser4_key * key /* result */ )
49524 +{
49525 + assert("nikita-772", coord != NULL);
49526 + assert("nikita-774", coord->node != NULL);
49527 + assert("nikita-775", znode_is_loaded(coord->node));
49528 +
49529 + if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
49530 + return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
49531 + else
49532 + return unit_key_by_coord(coord, key);
49533 +}
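+
+/* Fallback chain sketch for the key queries above: unit_key_by_coord()
+   falls back to item_key_by_coord() when the plugin defines no
+   ->unit_key(), and max_unit_key_by_coord() falls back to
+   unit_key_by_coord() in the same way, so items that occupy a single key
+   need to implement neither method. */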
49534 +
49535 +/* ->max_key_inside() method for items consisting of exactly one key (like
49536 + stat-data) */
49537 +static reiser4_key *max_key_inside_single_key(const coord_t *
49538 + coord /* coord of item */ ,
49539 + reiser4_key *
49540 + result /* resulting key */ )
49541 +{
49542 + assert("nikita-604", coord != NULL);
49543 +
49544 +	/* coord -> key is the starting key of this item and it has to
49545 +	   be already filled in */
49546 + return unit_key_by_coord(coord, result);
49547 +}
49548 +
49549 +/* ->nr_units() method for items that always consist of exactly one unit */
49550 +pos_in_node_t
49551 +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
49552 +{
49553 + return 1;
49554 +}
49555 +
49556 +static int
49557 +paste_no_paste(coord_t * coord UNUSED_ARG,
49558 + reiser4_item_data * data UNUSED_ARG,
49559 + carry_plugin_info * info UNUSED_ARG)
49560 +{
49561 + return 0;
49562 +}
49563 +
49564 +/* default ->fast_paste() method */
49565 +static int
49566 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
49567 +{
49568 + return 1;
49569 +}
49570 +
49571 +int item_can_contain_key(const coord_t * item /* coord of item */ ,
49572 + const reiser4_key * key /* key to check */ ,
49573 + const reiser4_item_data * data /* parameters of item
49574 + * being created */ )
49575 +{
49576 + item_plugin *iplug;
49577 + reiser4_key min_key_in_item;
49578 + reiser4_key max_key_in_item;
49579 +
49580 + assert("nikita-1658", item != NULL);
49581 + assert("nikita-1659", key != NULL);
49582 +
49583 + iplug = item_plugin_by_coord(item);
49584 + if (iplug->b.can_contain_key != NULL)
49585 + return iplug->b.can_contain_key(item, key, data);
49586 + else {
49587 + assert("nikita-1681", iplug->b.max_key_inside != NULL);
49588 + item_key_by_coord(item, &min_key_in_item);
49589 + iplug->b.max_key_inside(item, &max_key_in_item);
49590 +
49591 + /* can contain key if
49592 + min_key_in_item <= key &&
49593 + key <= max_key_in_item
49594 + */
49595 + return keyle(&min_key_in_item, key)
49596 + && keyle(key, &max_key_in_item);
49597 + }
49598 +}
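+
+/* Worked example of the generic check above: for a stat-data item,
+   ->max_key_inside() returns the item key itself, so both bounds collapse
+   and only the exact key passes; for an extent item, whose
+   ->max_key_inside() is (LOCALITY,4,OBJID,0xffffffffffffffff), any key of
+   the same object at a larger offset passes as well. */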
49599 +
49600 +/* ->mergeable() method for non-mergeable items */
49601 +static int
49602 +not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
49603 +{
49604 + return 0;
49605 +}
49606 +
49607 +/* return 0 if @i1 and @i2 are not mergeable, non-0 otherwise */
49608 +int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
49609 + const coord_t * i2 /* coord of second item */ )
49610 +{
49611 + item_plugin *iplug;
49612 + reiser4_key k1;
49613 + reiser4_key k2;
49614 +
49615 + assert("nikita-1336", i1 != NULL);
49616 + assert("nikita-1337", i2 != NULL);
49617 +
49618 + iplug = item_plugin_by_coord(i1);
49619 + assert("nikita-1338", iplug != NULL);
49620 +
49621 + /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
49622 + shifting code when nodes are in "suspended" state. */
49623 + assert("nikita-1663",
49624 + keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
49625 +
49626 + if (iplug->b.mergeable != NULL) {
49627 + return iplug->b.mergeable(i1, i2);
49628 + } else if (iplug->b.max_key_inside != NULL) {
49629 + iplug->b.max_key_inside(i1, &k1);
49630 + item_key_by_coord(i2, &k2);
49631 +
49632 + /* mergeable if ->max_key_inside() >= key of i2; */
49633 +		return keyge(&k1, &k2);
49635 + } else {
49636 + item_key_by_coord(i1, &k1);
49637 + item_key_by_coord(i2, &k2);
49638 +
49639 + return
49640 + (get_key_locality(&k1) == get_key_locality(&k2)) &&
49641 + (get_key_objectid(&k1) == get_key_objectid(&k2))
49642 + && (iplug == item_plugin_by_coord(i2));
49643 + }
49644 +}
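+
+/* Sketch of the fallback criterion above, with hypothetical keys: items
+   keyed (LOC,OBJ,0) and (LOC,OBJ,4096) that are handled by the same
+   plugin share locality and objectid and are therefore mergeable;
+   changing either key component, or the plugin, makes them
+   non-mergeable. */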
49645 +
49646 +int item_is_extent(const coord_t * item)
49647 +{
49648 + assert("vs-482", coord_is_existing_item(item));
49649 + return item_id_by_coord(item) == EXTENT_POINTER_ID;
49650 +}
49651 +
49652 +int item_is_tail(const coord_t * item)
49653 +{
49654 + assert("vs-482", coord_is_existing_item(item));
49655 + return item_id_by_coord(item) == FORMATTING_ID;
49656 +}
49657 +
49658 +#if REISER4_DEBUG
49659 +
49660 +int item_is_statdata(const coord_t * item)
49661 +{
49662 + assert("vs-516", coord_is_existing_item(item));
49663 + return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
49664 +}
49665 +
49666 +int item_is_ctail(const coord_t * item)
49667 +{
49668 + assert("edward-xx", coord_is_existing_item(item));
49669 + return item_id_by_coord(item) == CTAIL_ID;
49670 +}
49671 +
49672 +#endif /* REISER4_DEBUG */
49673 +
49674 +static int change_item(struct inode *inode,
49675 + reiser4_plugin * plugin,
49676 + pset_member memb)
49677 +{
49678 + /* cannot change constituent item (sd, or dir_item) */
49679 + return RETERR(-EINVAL);
49680 +}
49681 +
49682 +static reiser4_plugin_ops item_plugin_ops = {
49683 + .init = NULL,
49684 + .load = NULL,
49685 + .save_len = NULL,
49686 + .save = NULL,
49687 + .change = change_item
49688 +};
49689 +
49690 +item_plugin item_plugins[LAST_ITEM_ID] = {
49691 + [STATIC_STAT_DATA_ID] = {
49692 + .h = {
49693 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49694 + .id = STATIC_STAT_DATA_ID,
49695 + .groups = (1 << STAT_DATA_ITEM_TYPE),
49696 + .pops = &item_plugin_ops,
49697 + .label = "sd",
49698 + .desc = "stat-data",
49699 + .linkage = {NULL, NULL}
49700 + },
49701 + .b = {
49702 + .max_key_inside = max_key_inside_single_key,
49703 + .can_contain_key = NULL,
49704 + .mergeable = not_mergeable,
49705 + .nr_units = nr_units_single_unit,
49706 + .lookup = NULL,
49707 + .init = NULL,
49708 + .paste = paste_no_paste,
49709 + .fast_paste = NULL,
49710 + .can_shift = NULL,
49711 + .copy_units = NULL,
49712 + .create_hook = NULL,
49713 + .kill_hook = NULL,
49714 + .shift_hook = NULL,
49715 + .cut_units = NULL,
49716 + .kill_units = NULL,
49717 + .unit_key = NULL,
49718 + .max_unit_key = NULL,
49719 + .estimate = NULL,
49720 + .item_data_by_flow = NULL,
49721 +#if REISER4_DEBUG
49722 + .check = NULL
49723 +#endif
49724 + },
49725 + .f = {
49726 + .utmost_child = NULL,
49727 + .utmost_child_real_block = NULL,
49728 + .update = NULL,
49729 + .scan = NULL,
49730 + .convert = NULL
49731 + },
49732 + .s = {
49733 + .sd = {
49734 + .init_inode = init_inode_static_sd,
49735 + .save_len = save_len_static_sd,
49736 + .save = save_static_sd
49737 + }
49738 + }
49739 + },
49740 + [SIMPLE_DIR_ENTRY_ID] = {
49741 + .h = {
49742 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49743 + .id = SIMPLE_DIR_ENTRY_ID,
49744 + .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49745 + .pops = &item_plugin_ops,
49746 + .label = "de",
49747 + .desc = "directory entry",
49748 + .linkage = {NULL, NULL}
49749 + },
49750 + .b = {
49751 + .max_key_inside = max_key_inside_single_key,
49752 + .can_contain_key = NULL,
49753 + .mergeable = NULL,
49754 + .nr_units = nr_units_single_unit,
49755 + .lookup = NULL,
49756 + .init = NULL,
49757 + .paste = NULL,
49758 + .fast_paste = NULL,
49759 + .can_shift = NULL,
49760 + .copy_units = NULL,
49761 + .create_hook = NULL,
49762 + .kill_hook = NULL,
49763 + .shift_hook = NULL,
49764 + .cut_units = NULL,
49765 + .kill_units = NULL,
49766 + .unit_key = NULL,
49767 + .max_unit_key = NULL,
49768 + .estimate = NULL,
49769 + .item_data_by_flow = NULL,
49770 +#if REISER4_DEBUG
49771 + .check = NULL
49772 +#endif
49773 + },
49774 + .f = {
49775 + .utmost_child = NULL,
49776 + .utmost_child_real_block = NULL,
49777 + .update = NULL,
49778 + .scan = NULL,
49779 + .convert = NULL
49780 + },
49781 + .s = {
49782 + .dir = {
49783 + .extract_key = extract_key_de,
49784 + .update_key = update_key_de,
49785 + .extract_name = extract_name_de,
49786 + .extract_file_type = extract_file_type_de,
49787 + .add_entry = add_entry_de,
49788 + .rem_entry = rem_entry_de,
49789 + .max_name_len = max_name_len_de
49790 + }
49791 + }
49792 + },
49793 + [COMPOUND_DIR_ID] = {
49794 + .h = {
49795 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49796 + .id = COMPOUND_DIR_ID,
49797 + .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49798 + .pops = &item_plugin_ops,
49799 + .label = "cde",
49800 + .desc = "compressed directory entry",
49801 + .linkage = {NULL, NULL}
49802 + },
49803 + .b = {
49804 + .max_key_inside = max_key_inside_cde,
49805 + .can_contain_key = can_contain_key_cde,
49806 + .mergeable = mergeable_cde,
49807 + .nr_units = nr_units_cde,
49808 + .lookup = lookup_cde,
49809 + .init = init_cde,
49810 + .paste = paste_cde,
49811 + .fast_paste = agree_to_fast_op,
49812 + .can_shift = can_shift_cde,
49813 + .copy_units = copy_units_cde,
49814 + .create_hook = NULL,
49815 + .kill_hook = NULL,
49816 + .shift_hook = NULL,
49817 + .cut_units = cut_units_cde,
49818 + .kill_units = kill_units_cde,
49819 + .unit_key = unit_key_cde,
49820 + .max_unit_key = unit_key_cde,
49821 + .estimate = estimate_cde,
49822 + .item_data_by_flow = NULL,
49823 +#if REISER4_DEBUG
49824 + .check = reiser4_check_cde
49825 +#endif
49826 + },
49827 + .f = {
49828 + .utmost_child = NULL,
49829 + .utmost_child_real_block = NULL,
49830 + .update = NULL,
49831 + .scan = NULL,
49832 + .convert = NULL
49833 + },
49834 + .s = {
49835 + .dir = {
49836 + .extract_key = extract_key_cde,
49837 + .update_key = update_key_cde,
49838 + .extract_name = extract_name_cde,
49839 + .extract_file_type = extract_file_type_de,
49840 + .add_entry = add_entry_cde,
49841 + .rem_entry = rem_entry_cde,
49842 + .max_name_len = max_name_len_cde
49843 + }
49844 + }
49845 + },
49846 + [NODE_POINTER_ID] = {
49847 + .h = {
49848 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49849 + .id = NODE_POINTER_ID,
49850 + .groups = (1 << INTERNAL_ITEM_TYPE),
49851 + .pops = NULL,
49852 + .label = "internal",
49853 + .desc = "internal item",
49854 + .linkage = {NULL, NULL}
49855 + },
49856 + .b = {
49857 + .max_key_inside = NULL,
49858 + .can_contain_key = NULL,
49859 + .mergeable = mergeable_internal,
49860 + .nr_units = nr_units_single_unit,
49861 + .lookup = lookup_internal,
49862 + .init = NULL,
49863 + .paste = NULL,
49864 + .fast_paste = NULL,
49865 + .can_shift = NULL,
49866 + .copy_units = NULL,
49867 + .create_hook = create_hook_internal,
49868 + .kill_hook = kill_hook_internal,
49869 + .shift_hook = shift_hook_internal,
49870 + .cut_units = NULL,
49871 + .kill_units = NULL,
49872 + .unit_key = NULL,
49873 + .max_unit_key = NULL,
49874 + .estimate = NULL,
49875 + .item_data_by_flow = NULL,
49876 +#if REISER4_DEBUG
49877 + .check = check__internal
49878 +#endif
49879 + },
49880 + .f = {
49881 + .utmost_child = utmost_child_internal,
49882 + .utmost_child_real_block =
49883 + utmost_child_real_block_internal,
49884 + .update = reiser4_update_internal,
49885 + .scan = NULL,
49886 + .convert = NULL
49887 + },
49888 + .s = {
49889 + .internal = {
49890 + .down_link = down_link_internal,
49891 + .has_pointer_to = has_pointer_to_internal
49892 + }
49893 + }
49894 + },
49895 + [EXTENT_POINTER_ID] = {
49896 + .h = {
49897 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49898 + .id = EXTENT_POINTER_ID,
49899 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49900 + .pops = NULL,
49901 + .label = "extent",
49902 + .desc = "extent item",
49903 + .linkage = {NULL, NULL}
49904 + },
49905 + .b = {
49906 + .max_key_inside = max_key_inside_extent,
49907 + .can_contain_key = can_contain_key_extent,
49908 + .mergeable = mergeable_extent,
49909 + .nr_units = nr_units_extent,
49910 + .lookup = lookup_extent,
49911 + .init = NULL,
49912 + .paste = paste_extent,
49913 + .fast_paste = agree_to_fast_op,
49914 + .can_shift = can_shift_extent,
49915 + .create_hook = create_hook_extent,
49916 + .copy_units = copy_units_extent,
49917 + .kill_hook = kill_hook_extent,
49918 + .shift_hook = NULL,
49919 + .cut_units = cut_units_extent,
49920 + .kill_units = kill_units_extent,
49921 + .unit_key = unit_key_extent,
49922 + .max_unit_key = max_unit_key_extent,
49923 + .estimate = NULL,
49924 + .item_data_by_flow = NULL,
49925 +#if REISER4_DEBUG
49926 + .check = reiser4_check_extent
49927 +#endif
49928 + },
49929 + .f = {
49930 + .utmost_child = utmost_child_extent,
49931 + .utmost_child_real_block =
49932 + utmost_child_real_block_extent,
49933 + .update = NULL,
49934 + .scan = reiser4_scan_extent,
49935 + .convert = NULL,
49936 + .key_by_offset = key_by_offset_extent
49937 + },
49938 + .s = {
49939 + .file = {
49940 + .write = reiser4_write_extent,
49941 + .read = reiser4_read_extent,
49942 + .readpage = reiser4_readpage_extent,
49943 + .get_block = get_block_address_extent,
49944 + .append_key = append_key_extent,
49945 + .init_coord_extension =
49946 + init_coord_extension_extent
49947 + }
49948 + }
49949 + },
49950 + [FORMATTING_ID] = {
49951 + .h = {
49952 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49953 + .id = FORMATTING_ID,
49954 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49955 + .pops = NULL,
49956 + .label = "body",
49957 + .desc = "body (or tail?) item",
49958 + .linkage = {NULL, NULL}
49959 + },
49960 + .b = {
49961 + .max_key_inside = max_key_inside_tail,
49962 + .can_contain_key = can_contain_key_tail,
49963 + .mergeable = mergeable_tail,
49964 + .nr_units = nr_units_tail,
49965 + .lookup = lookup_tail,
49966 + .init = NULL,
49967 + .paste = paste_tail,
49968 + .fast_paste = agree_to_fast_op,
49969 + .can_shift = can_shift_tail,
49970 + .create_hook = NULL,
49971 + .copy_units = copy_units_tail,
49972 + .kill_hook = kill_hook_tail,
49973 + .shift_hook = NULL,
49974 + .cut_units = cut_units_tail,
49975 + .kill_units = kill_units_tail,
49976 + .unit_key = unit_key_tail,
49977 + .max_unit_key = unit_key_tail,
49978 + .estimate = NULL,
49979 + .item_data_by_flow = NULL,
49980 +#if REISER4_DEBUG
49981 + .check = NULL
49982 +#endif
49983 + },
49984 + .f = {
49985 + .utmost_child = NULL,
49986 + .utmost_child_real_block = NULL,
49987 + .update = NULL,
49988 + .scan = NULL,
49989 + .convert = NULL
49990 + },
49991 + .s = {
49992 + .file = {
49993 + .write = reiser4_write_tail,
49994 + .read = reiser4_read_tail,
49995 + .readpage = readpage_tail,
49996 + .get_block = get_block_address_tail,
49997 + .append_key = append_key_tail,
49998 + .init_coord_extension =
49999 + init_coord_extension_tail
50000 + }
50001 + }
50002 + },
50003 + [CTAIL_ID] = {
50004 + .h = {
50005 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
50006 + .id = CTAIL_ID,
50007 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
50008 + .pops = NULL,
50009 + .label = "ctail",
50010 + .desc = "cryptcompress tail item",
50011 + .linkage = {NULL, NULL}
50012 + },
50013 + .b = {
50014 + .max_key_inside = max_key_inside_tail,
50015 + .can_contain_key = can_contain_key_ctail,
50016 + .mergeable = mergeable_ctail,
50017 + .nr_units = nr_units_ctail,
50018 + .lookup = NULL,
50019 + .init = init_ctail,
50020 + .paste = paste_ctail,
50021 + .fast_paste = agree_to_fast_op,
50022 + .can_shift = can_shift_ctail,
50023 + .create_hook = create_hook_ctail,
50024 + .copy_units = copy_units_ctail,
50025 + .kill_hook = kill_hook_ctail,
50026 + .shift_hook = shift_hook_ctail,
50027 + .cut_units = cut_units_ctail,
50028 + .kill_units = kill_units_ctail,
50029 + .unit_key = unit_key_tail,
50030 + .max_unit_key = unit_key_tail,
50031 + .estimate = estimate_ctail,
50032 + .item_data_by_flow = NULL,
50033 +#if REISER4_DEBUG
50034 + .check = check_ctail
50035 +#endif
50036 + },
50037 + .f = {
50038 + .utmost_child = utmost_child_ctail,
50039 + /* FIXME-EDWARD: write this */
50040 + .utmost_child_real_block = NULL,
50041 + .update = NULL,
50042 + .scan = scan_ctail,
50043 + .convert = convert_ctail
50044 + },
50045 + .s = {
50046 + .file = {
50047 + .write = NULL,
50048 + .read = read_ctail,
50049 + .readpage = readpage_ctail,
50050 + .get_block = get_block_address_tail,
50051 + .append_key = append_key_ctail,
50052 + .init_coord_extension =
50053 + init_coord_extension_tail
50054 + }
50055 + }
50056 + },
50057 + [BLACK_BOX_ID] = {
50058 + .h = {
50059 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
50060 + .id = BLACK_BOX_ID,
50061 + .groups = (1 << OTHER_ITEM_TYPE),
50062 + .pops = NULL,
50063 + .label = "blackbox",
50064 + .desc = "black box item",
50065 + .linkage = {NULL, NULL}
50066 + },
50067 + .b = {
50068 + .max_key_inside = NULL,
50069 + .can_contain_key = NULL,
50070 + .mergeable = not_mergeable,
50071 + .nr_units = nr_units_single_unit,
50072 +		/* no need for a ->lookup method */
50073 + .lookup = NULL,
50074 + .init = NULL,
50075 + .paste = NULL,
50076 + .fast_paste = NULL,
50077 + .can_shift = NULL,
50078 + .copy_units = NULL,
50079 + .create_hook = NULL,
50080 + .kill_hook = NULL,
50081 + .shift_hook = NULL,
50082 + .cut_units = NULL,
50083 + .kill_units = NULL,
50084 + .unit_key = NULL,
50085 + .max_unit_key = NULL,
50086 + .estimate = NULL,
50087 + .item_data_by_flow = NULL,
50088 +#if REISER4_DEBUG
50089 + .check = NULL
50090 +#endif
50091 + }
50092 + }
50093 +};
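+
+/* The table above is indexed by item_id, so a lookup such as
+   item_plugin_by_id(SIMPLE_DIR_ENTRY_ID) (used, e.g., by add_entry_de())
+   amounts to a constant-time array access, conceptually:
+
+	return &item_plugins[id];
+*/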
50094 +
50095 +/* Make Linus happy.
50096 + Local variables:
50097 + c-indentation-style: "K&R"
50098 + mode-name: "LC"
50099 + c-basic-offset: 8
50100 + tab-width: 8
50101 + fill-column: 120
50102 + End:
50103 +*/
50104 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/item.h linux-2.6.33/fs/reiser4/plugin/item/item.h
50105 --- linux-2.6.33.orig/fs/reiser4/plugin/item/item.h 1970-01-01 01:00:00.000000000 +0100
50106 +++ linux-2.6.33/fs/reiser4/plugin/item/item.h 2010-03-04 19:33:22.000000000 +0100
50107 @@ -0,0 +1,398 @@
50108 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50109 +
50110 +/* first read balance.c comments before reading this */
50111 +
50112 +/* An item_plugin implements all of the operations required for
50113 + balancing that are item specific. */
50114 +
50115 +/* an item plugin also implements other operations that are specific to that
50116 + item. These go into the item specific operations portion of the item
50117 + handler, and all of the item specific portions of the item handler are put
50118 + into a union. */
50119 +
50120 +#if !defined( __REISER4_ITEM_H__ )
50121 +#define __REISER4_ITEM_H__
50122 +
50123 +#include "../../forward.h"
50124 +#include "../plugin_header.h"
50125 +#include "../../dformat.h"
50126 +#include "../../seal.h"
50127 +#include "../../plugin/file/file.h"
50128 +
50129 +#include <linux/fs.h> /* for struct file, struct inode */
50130 +#include <linux/mm.h> /* for struct page */
50131 +#include <linux/dcache.h> /* for struct dentry */
50132 +
50133 +typedef enum {
50134 + STAT_DATA_ITEM_TYPE,
50135 + DIR_ENTRY_ITEM_TYPE,
50136 + INTERNAL_ITEM_TYPE,
50137 + UNIX_FILE_METADATA_ITEM_TYPE,
50138 + OTHER_ITEM_TYPE
50139 +} item_type_id;
50140 +
50141 +/* this is the part of each item plugin that all items are expected to
50142 + support or at least explicitly fail to support by setting the
50143 + pointer to null. */
50144 +struct balance_ops {
50145 + /* operations called by balancing
50146 +
50147 + It is interesting to consider that some of these item
50148 + operations could be given sources or targets that are not
50149 + really items in nodes. This could be ok/useful.
50150 +
50151 + */
50152 +	/* maximal key that can _possibly_ be occupied by this item
50153 +
50154 +	   When inserting, the node ->lookup() method (called by
50155 +	   coord_by_key()) reaches an item after binary search; the
50156 +	   ->max_key_inside() item plugin method is then used to determine
50157 +	   whether the new item should be pasted into the existing item
50158 +	   (new_key <= max_key_inside()) or a new item has to be created
50159 +	   (new_key > max_key_inside()).
50160 +
50161 +	   For items that occupy exactly one key (like stat-data)
50162 +	   this method should return this key. For items that can
50163 +	   grow indefinitely (extent, directory item) this should
50164 +	   return reiser4_max_key().
50165 +
50166 +	   For example, for an extent with the key
50167 +
50168 +	   (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
50169 +
50170 +	   ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
50171 +	   */
50172 + reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
50173 +
50174 + /* true if item @coord can merge data at @key. */
50175 + int (*can_contain_key) (const coord_t *, const reiser4_key *,
50176 + const reiser4_item_data *);
50177 + /* mergeable() - check items for mergeability
50178 +
50179 + Optional method. Returns true if two items can be merged.
50180 +
50181 + */
50182 + int (*mergeable) (const coord_t *, const coord_t *);
50183 +
50184 + /* number of atomic things in an item.
50185 + NOTE FOR CONTRIBUTORS: use a generic method
50186 + nr_units_single_unit() for solid (atomic) items, as
50187 + tree operations use it as a criterion of solidness
50188 + (see is_solid_item macro) */
50189 + pos_in_node_t(*nr_units) (const coord_t *);
50190 +
50191 +	/* search within the item for a unit, and return a pointer to
50192 +	   it. This can be used to calculate how many bytes to shrink
50193 +	   an item if you use pointer arithmetic and compare to the
50194 +	   start of the item body, provided the item's data are
50195 +	   contiguous in the node; if the item's data are not contiguous
50196 +	   in the node, all sorts of other things are probably going to
50197 +	   break as well. */
50198 + lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
50199 +	/* method called by node_plugin->create_item() to initialize a new
50200 +	   item */
50201 + int (*init) (coord_t * target, coord_t * from,
50202 + reiser4_item_data * data);
50203 + /* method called (e.g., by reiser4_resize_item()) to place new data
50204 + into item when it grows */
50205 + int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
50206 +	/* return true if a paste into @coord is allowed to skip
50207 +	   carry; that is, if such a paste would not require any
50208 +	   changes at the parent level
50209 +	   */
50210 + int (*fast_paste) (const coord_t *);
50211 +	/* how many units of @source, but not more than @want, can be
50212 +	   shifted into the @target node. If pend == append, we try to
50213 +	   append the last item of @target with the first units of @source;
50214 +	   if pend == prepend, we try to "prepend" the first item in @target
50215 +	   with the last units of @source. The @target node has @free_space
50216 +	   bytes of free space. The total size of those units is returned
50217 +	   via @size.
50218 +
50219 +	   @target is not NULL if shifting into a mergeable item, and NULL
50220 +	   if a new item will be created during shifting.
50221 +	   */
50222 + int (*can_shift) (unsigned free_space, coord_t *,
50223 + znode *, shift_direction, unsigned *size,
50224 + unsigned want);
50225 +
50226 +	/* starting from the @from-th unit of item @source, append or
50227 +	   prepend @count units to @target. @target has already been
50228 +	   expanded by @free_space bytes; that must be exactly what is
50229 +	   needed for those units in @target. If @where_is_free_space
50230 +	   == SHIFT_LEFT, free space is at the end of the @target item;
50231 +	   otherwise it is at the beginning of it. */
50232 + void (*copy_units) (coord_t *, coord_t *,
50233 + unsigned from, unsigned count,
50234 + shift_direction where_is_free_space,
50235 + unsigned free_space);
50236 +
50237 + int (*create_hook) (const coord_t *, void *);
50238 +	/* do whatever is necessary when @count units, starting from the
50239 +	   @from-th one, are removed from the tree */
50240 +	/* FIXME-VS: this used to be here for, in particular, extents
50241 +	   and items of internal type, to free the blocks they point to
50242 +	   at the same time as removing the items from the tree. Problems
50243 +	   start, however, when dealloc_block fails for some reason: the
50244 +	   item gets removed, but the blocks it pointed to are not freed.
50245 +	   It is not clear how to fix this for items of internal type,
50246 +	   because the need to remove an internal item may appear in the
50247 +	   middle of balancing, and there is no way to undo the changes
50248 +	   made. OTOH, if the space allocator involves balancing to
50249 +	   perform dealloc_block, this will probably break balancing due
50250 +	   to deadlock issues
50251 +	   */
50252 + int (*kill_hook) (const coord_t *, pos_in_node_t from,
50253 + pos_in_node_t count, struct carry_kill_data *);
50254 + int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
50255 + znode * _node);
50256 +
50257 + /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
50258 + including boundaries. When units are cut from item beginning - move space which gets freed to head of
50259 + item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
50260 + item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
50261 + @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
50262 + */
50263 + int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
50264 + struct carry_cut_data *,
50265 + reiser4_key * smallest_removed,
50266 + reiser4_key * new_first_key);
50267 +
50268 + /* like cut_units, except that these units are removed from the
50269 + tree, not only from a node */
50270 + int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
50271 + struct carry_kill_data *,
50272 + reiser4_key * smallest_removed,
50273 + reiser4_key * new_first);
50274 +
50275 +	/* if @key_of_coord == 1, the key of the coord is returned;
50276 +	   otherwise the key of the unit is returned. If @coord is not set
50277 +	   to a certain unit, ERR_PTR(-ENOENT) is returned */
50278 + reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
50279 + reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
50280 +	/* estimate how much space is needed to paste @data into the
50281 +	   item at @coord. If @coord == 0, estimate insertion; otherwise
50282 +	   estimate pasting
50283 +	   */
50284 + int (*estimate) (const coord_t *, const reiser4_item_data *);
50285 +
50286 + /* converts flow @f to item data. @coord == 0 on insert */
50287 + int (*item_data_by_flow) (const coord_t *, const flow_t *,
50288 + reiser4_item_data *);
50289 +
50290 + /*void (*show) (struct seq_file *, coord_t *); */
50291 +
50292 +#if REISER4_DEBUG
50293 + /* used for debugging, every item should have here the most
50294 + complete possible check of the consistency of the item that
50295 + the inventor can construct */
50296 + int (*check) (const coord_t *, const char **error);
50297 +#endif
50298 +
50299 +};
50300 +
50301 +struct flush_ops {
50302 + /* return the right or left child of @coord, only if it is in memory */
50303 + int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
50304 +
50305 + /* return whether the right or left child of @coord has a non-fake
50306 + block number. */
50307 + int (*utmost_child_real_block) (const coord_t *, sideof side,
50308 + reiser4_block_nr *);
50309 + /* relocate child at @coord to the @block */
50310 + void (*update) (const coord_t *, const reiser4_block_nr *);
50311 +	/* count unformatted nodes per item for leaf relocation policy, etc. */
50312 + int (*scan) (flush_scan * scan);
50313 + /* convert item by flush */
50314 + int (*convert) (flush_pos_t * pos);
50315 + /* backward mapping from jnode offset to a key. */
50316 + int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
50317 +};
50318 +
50319 +/* operations specific to the directory item */
50320 +struct dir_entry_iops {
50321 + /* extract stat-data key from directory entry at @coord and place it
50322 + into @key. */
50323 + int (*extract_key) (const coord_t *, reiser4_key * key);
50324 + /* update object key in item. */
50325 + int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
50326 + /* extract name from directory entry at @coord and return it */
50327 + char *(*extract_name) (const coord_t *, char *buf);
50328 + /* extract file type (DT_* stuff) from directory entry at @coord and
50329 + return it */
50330 + unsigned (*extract_file_type) (const coord_t *);
50331 + int (*add_entry) (struct inode * dir,
50332 + coord_t *, lock_handle *,
50333 + const struct dentry * name,
50334 + reiser4_dir_entry_desc * entry);
50335 + int (*rem_entry) (struct inode * dir, const struct qstr * name,
50336 + coord_t *, lock_handle *,
50337 + reiser4_dir_entry_desc * entry);
50338 + int (*max_name_len) (const struct inode * dir);
50339 +};
50340 +
50341 +/* operations specific to the items that regular (unix) file metadata are built of */
50342 +struct file_iops {
50343 + ssize_t (*write) (struct file *, struct inode *,
50344 + const char __user *, size_t, loff_t *pos);
50345 + int (*read) (struct file *, flow_t *, hint_t *);
50346 + int (*readpage) (void *, struct page *);
50347 + int (*get_block) (const coord_t *, sector_t, sector_t *);
50348 + /*
50349 + * key of first byte which is not addressed by the item @coord is set
50350 + * to.
50351 + * For example, for extent item with the key
50352 + *
50353 + * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
50354 + *
50355 + * ->append_key is
50356 + *
50357 + * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
50358 + */
50359 + reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
50360 +
50361 + void (*init_coord_extension) (uf_coord_t *, loff_t);
50362 +};
50363 +
50364 +/* operations specific to items of stat data type */
50365 +struct sd_iops {
50366 + int (*init_inode) (struct inode * inode, char *sd, int len);
50367 + int (*save_len) (struct inode * inode);
50368 + int (*save) (struct inode * inode, char **area);
50369 +};
50370 +
50371 +/* operations specific to internal item */
50372 +struct internal_iops {
50373 +	/* all that tree traversal wants to know from an internal item
50374 +	   is where to go next. */
50375 + void (*down_link) (const coord_t * coord,
50376 + const reiser4_key * key, reiser4_block_nr * block);
50377 + /* check that given internal item contains given pointer. */
50378 + int (*has_pointer_to) (const coord_t * coord,
50379 + const reiser4_block_nr * block);
50380 +};
50381 +
50382 +struct item_plugin {
50383 + /* generic fields */
50384 + plugin_header h;
50385 + /* methods common for all item types */
50386 + struct balance_ops b; /* balance operations */
50387 +	struct flush_ops f;	/* flush operates on items via these methods */
50388 +
50389 + /* methods specific to particular type of item */
50390 + union {
50391 + struct dir_entry_iops dir;
50392 + struct file_iops file;
50393 + struct sd_iops sd;
50394 + struct internal_iops internal;
50395 + } s;
50396 +};
50397 +
50398 +#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
50399 +
50400 +static inline item_id item_id_by_plugin(item_plugin * plugin)
50401 +{
50402 + return plugin->h.id;
50403 +}
50404 +
50405 +static inline char get_iplugid(item_plugin * iplug)
50406 +{
50407 + assert("nikita-2838", iplug != NULL);
50408 + assert("nikita-2839", iplug->h.id < 0xff);
50409 + return (char)item_id_by_plugin(iplug);
50410 +}
50411 +
50412 +extern unsigned long znode_times_locked(const znode * z);
50413 +
50414 +static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
50415 +{
50416 + assert("nikita-2837", coord != NULL);
50417 + assert("nikita-2838", iplug != NULL);
50418 + coord->iplugid = get_iplugid(iplug);
50419 + ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
50420 +}
50421 +
50422 +static inline item_plugin *coord_iplug(const coord_t * coord)
50423 +{
50424 + assert("nikita-2833", coord != NULL);
50425 + assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
50426 + assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
50427 + return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
50428 + coord->iplugid);
50429 +}
50430 +
50431 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
50432 + const reiser4_item_data *);
50433 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
50434 +extern int item_is_extent(const coord_t *);
50435 +extern int item_is_tail(const coord_t *);
50436 +extern int item_is_statdata(const coord_t * item);
50437 +extern int item_is_ctail(const coord_t *);
50438 +
50439 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
50440 +extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
50441 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
50442 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
50443 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
50444 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
50445 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
50446 + reiser4_key * key);
50447 +extern void obtain_item_plugin(const coord_t * coord);
50448 +
50449 +#if defined(REISER4_DEBUG)
50450 +extern int znode_is_loaded(const znode * node);
50451 +#endif
50452 +
50453 +/* return plugin of item at @coord */
50454 +static inline item_plugin *item_plugin_by_coord(const coord_t *
50455 + coord /* coord to query */ )
50456 +{
50457 + assert("nikita-330", coord != NULL);
50458 + assert("nikita-331", coord->node != NULL);
50459 + assert("nikita-332", znode_is_loaded(coord->node));
50460 +
50461 + if (unlikely(!coord_is_iplug_set(coord)))
50462 + obtain_item_plugin(coord);
50463 + return coord_iplug(coord);
50464 +}
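+
+/* Typical dispatch through the plugin table (a sketch; error handling
+   elided):
+
+	item_plugin *iplug = item_plugin_by_coord(coord);
+	pos_in_node_t units = iplug->b.nr_units(coord);
+
+   For solid items ->nr_units is nr_units_single_unit(), and the
+   is_solid_item() macro above relies on exactly that pointer identity. */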
50465 +
50466 +/* this returns true if item is of internal type */
50467 +static inline int item_is_internal(const coord_t * item)
50468 +{
50469 + assert("vs-483", coord_is_existing_item(item));
50470 + return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
50471 +}
50472 +
50473 +extern void item_body_by_coord_hard(coord_t * coord);
50474 +extern void *item_body_by_coord_easy(const coord_t * coord);
50475 +#if REISER4_DEBUG
50476 +extern int item_body_is_valid(const coord_t * coord);
50477 +#endif
50478 +
50479 +/* return pointer to item body */
50480 +static inline void *item_body_by_coord(const coord_t *
50481 + coord /* coord to query */ )
50482 +{
50483 + assert("nikita-324", coord != NULL);
50484 + assert("nikita-325", coord->node != NULL);
50485 + assert("nikita-326", znode_is_loaded(coord->node));
50486 +
50487 + if (coord->offset == INVALID_OFFSET)
50488 + item_body_by_coord_hard((coord_t *) coord);
50489 + assert("nikita-3201", item_body_is_valid(coord));
50490 + assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
50491 + return item_body_by_coord_easy(coord);
50492 +}
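+
+/* The hard/easy split above caches the body offset in the coord: the
+   first call pays for the node plugin lookup in item_body_by_coord_hard(),
+   after which the fast path is plain pointer arithmetic, essentially:
+
+	return zdata(coord->node) + coord->offset;
+*/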
50493 +
50494 +/* __REISER4_ITEM_H__ */
50495 +#endif
50496 +/* Make Linus happy.
50497 + Local variables:
50498 + c-indentation-style: "K&R"
50499 + mode-name: "LC"
50500 + c-basic-offset: 8
50501 + tab-width: 8
50502 + fill-column: 120
50503 + scroll-step: 1
50504 + End:
50505 +*/
50506 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/Makefile linux-2.6.33/fs/reiser4/plugin/item/Makefile
50507 --- linux-2.6.33.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 01:00:00.000000000 +0100
50508 +++ linux-2.6.33/fs/reiser4/plugin/item/Makefile 2010-03-04 19:33:22.000000000 +0100
50509 @@ -0,0 +1,18 @@
50510 +obj-$(CONFIG_REISER4_FS) += item_plugins.o
50511 +
50512 +item_plugins-objs := \
50513 + item.o \
50514 + static_stat.o \
50515 + sde.o \
50516 + cde.o \
50517 + blackbox.o \
50518 + internal.o \
50519 + tail.o \
50520 + ctail.o \
50521 + extent.o \
50522 + extent_item_ops.o \
50523 + extent_file_ops.o \
50524 + extent_flush_ops.o
50525 +
50526 +
50527 +
50528 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/sde.c linux-2.6.33/fs/reiser4/plugin/item/sde.c
50529 --- linux-2.6.33.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 01:00:00.000000000 +0100
50530 +++ linux-2.6.33/fs/reiser4/plugin/item/sde.c 2010-03-04 19:33:22.000000000 +0100
50531 @@ -0,0 +1,190 @@
50532 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50533 +
50534 +/* Directory entry implementation */
50535 +#include "../../forward.h"
50536 +#include "../../debug.h"
50537 +#include "../../dformat.h"
50538 +#include "../../kassign.h"
50539 +#include "../../coord.h"
50540 +#include "sde.h"
50541 +#include "item.h"
50542 +#include "../plugin.h"
50543 +#include "../../znode.h"
50544 +#include "../../carry.h"
50545 +#include "../../tree.h"
50546 +#include "../../inode.h"
50547 +
50548 +#include <linux/fs.h> /* for struct inode */
50549 +#include <linux/dcache.h> /* for struct dentry */
50550 +#include <linux/quotaops.h>
50551 +
50552 +/* ->extract_key() method of simple directory item plugin. */
50553 +int extract_key_de(const coord_t * coord /* coord of item */ ,
50554 + reiser4_key * key /* resulting key */ )
50555 +{
50556 + directory_entry_format *dent;
50557 +
50558 + assert("nikita-1458", coord != NULL);
50559 + assert("nikita-1459", key != NULL);
50560 +
50561 + dent = (directory_entry_format *) item_body_by_coord(coord);
50562 + assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
50563 + return extract_key_from_id(&dent->id, key);
50564 +}
50565 +
50566 +int
50567 +update_key_de(const coord_t * coord, const reiser4_key * key,
50568 + lock_handle * lh UNUSED_ARG)
50569 +{
50570 + directory_entry_format *dent;
50571 + obj_key_id obj_id;
50572 + int result;
50573 +
50574 + assert("nikita-2342", coord != NULL);
50575 + assert("nikita-2343", key != NULL);
50576 +
50577 + dent = (directory_entry_format *) item_body_by_coord(coord);
50578 + result = build_obj_key_id(key, &obj_id);
50579 + if (result == 0) {
50580 + dent->id = obj_id;
50581 + znode_make_dirty(coord->node);
50582 + }
50583 + return 0;
50584 +	return result;
50585 +
50586 +char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
50587 + char *buf)
50588 +{
50589 + reiser4_key key;
50590 +
50591 + unit_key_by_coord(coord, &key);
50592 + if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
50593 + reiser4_print_address("oops", znode_get_block(coord->node));
50594 + if (!is_longname_key(&key)) {
50595 + if (is_dot_key(&key))
50596 + return (char *)".";
50597 + else
50598 + return extract_name_from_key(&key, buf);
50599 + } else
50600 + return (char *)dent->name;
50601 +}
50602 +
50603 +/* ->extract_name() method of simple directory item plugin. */
50604 +char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
50605 +{
50606 + directory_entry_format *dent;
50607 +
50608 + assert("nikita-1460", coord != NULL);
50609 +
50610 + dent = (directory_entry_format *) item_body_by_coord(coord);
50611 + return extract_dent_name(coord, dent, buf);
50612 +}
50613 +
50614 +/* ->extract_file_type() method of simple directory item plugin. */
50615 +unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
50616 + * item */ )
50617 +{
50618 + assert("nikita-1764", coord != NULL);
50619 + /* we don't store file type in the directory entry yet.
50620 +
50621 + But see comments at kassign.h:obj_key_id
50622 + */
50623 + return DT_UNKNOWN;
50624 +}
50625 +
50626 +int add_entry_de(struct inode *dir /* directory of item */ ,
50627 + coord_t * coord /* coord of item */ ,
50628 + lock_handle * lh /* insertion lock handle */ ,
50629 + const struct dentry *de /* name to add */ ,
50630 + reiser4_dir_entry_desc * entry /* parameters of new directory
50631 + * entry */ )
50632 +{
50633 + reiser4_item_data data;
50634 + directory_entry_format *dent;
50635 + int result;
50636 + const char *name;
50637 + int len;
50638 + int longname;
50639 +
50640 + name = de->d_name.name;
50641 + len = de->d_name.len;
50642 + assert("nikita-1163", strlen(name) == len);
50643 +
50644 + longname = is_longname(name, len);
50645 +
50646 + data.length = sizeof *dent;
50647 + if (longname)
50648 + data.length += len + 1;
50649 + data.data = NULL;
50650 + data.user = 0;
50651 + data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
50652 +
50653 + /* NOTE-NIKITA quota plugin */
50654 + if (vfs_dq_alloc_space_nodirty(dir, data.length))
50655 + return -EDQUOT;
50656 +
50657 + result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
50658 + if (result != 0)
50659 + return result;
50660 +
50661 + dent = (directory_entry_format *) item_body_by_coord(coord);
50662 + build_inode_key_id(entry->obj, &dent->id);
50663 + if (longname) {
50664 + memcpy(dent->name, name, len);
50665 + put_unaligned(0, &dent->name[len]);
50666 + }
50667 + return 0;
50668 +}
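+
+/* Size accounting sketch for the insertion above: a short name is encoded
+   in the entry key itself, so the item costs just sizeof *dent (the
+   stat-data key id); a long name additionally stores len + 1 bytes of
+   NUL-terminated name after it, matching the data.length computed above. */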
50669 +
50670 +int rem_entry_de(struct inode *dir /* directory of item */ ,
50671 + const struct qstr *name UNUSED_ARG,
50672 + coord_t * coord /* coord of item */ ,
50673 + lock_handle * lh UNUSED_ARG /* lock handle for
50674 + * removal */ ,
50675 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
50676 + * directory entry
50677 + * being removed */ )
50678 +{
50679 + coord_t shadow;
50680 + int result;
50681 + int length;
50682 +
50683 + length = item_length_by_coord(coord);
50684 + if (inode_get_bytes(dir) < length) {
50685 +		warning("nikita-2627", "Dir is broken: %llu: %llu",
50686 + (unsigned long long)get_inode_oid(dir),
50687 + inode_get_bytes(dir));
50688 +
50689 + return RETERR(-EIO);
50690 + }
50691 +
50692 +	/* cut_node() is supposed to take pointers to _different_
50693 +	   coords, because it will modify them without respect to
50694 +	   possible aliasing. To work around this, create a temporary
50695 +	   copy of @coord.
50696 +	   */
50697 + coord_dup(&shadow, coord);
50698 + result =
50699 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
50700 + if (result == 0) {
50701 + /* NOTE-NIKITA quota plugin */
50702 + vfs_dq_free_space_nodirty(dir, length);
50703 + }
50704 + return result;
50705 +}
50706 +
50707 +int max_name_len_de(const struct inode *dir)
50708 +{
50709 + return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
50710 + sizeof(directory_entry_format) - 2;
50711 +}
50712 +
50713 +/* Make Linus happy.
50714 + Local variables:
50715 + c-indentation-style: "K&R"
50716 + mode-name: "LC"
50717 + c-basic-offset: 8
50718 + tab-width: 8
50719 + fill-column: 120
50720 + End:
50721 +*/
50722 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/sde.h linux-2.6.33/fs/reiser4/plugin/item/sde.h
50723 --- linux-2.6.33.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 01:00:00.000000000 +0100
50724 +++ linux-2.6.33/fs/reiser4/plugin/item/sde.h 2010-03-04 19:33:22.000000000 +0100
50725 @@ -0,0 +1,66 @@
50726 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50727 +
50728 +/* Directory entry. */
50729 +
50730 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
50731 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
50732 +
50733 +#include "../../forward.h"
50734 +#include "../../dformat.h"
50735 +#include "../../kassign.h"
50736 +#include "../../key.h"
50737 +
50738 +#include <linux/fs.h>
50739 +#include <linux/dcache.h> /* for struct dentry */
50740 +
50741 +typedef struct directory_entry_format {
50742 + /* key of object stat-data. It's not necessary to store whole
50743 + key here, because it's always key of stat-data, so minor
50744 + packing locality and offset can be omitted here. But this
50745 + relies on particular key allocation scheme for stat-data, so,
50746 + for extensibility sake, whole key can be stored here.
50747 +
50748 + We store key as array of bytes, because we don't want 8-byte
50749 + alignment of dir entries.
50750 + */
50751 + obj_key_id id;
50752 + /* file name. Null terminated string. */
50753 + d8 name[0];
50754 +} directory_entry_format;
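+
+/* On-disk layout sketch (see add_entry_de() and extract_dent_name() in
+   sde.c):
+
+	short name:  [ obj_key_id ]                name recovered from the key
+	long name:   [ obj_key_id ][ name ... \0 ] name stored in name[]
+*/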
50755 +
50756 +void print_de(const char *prefix, coord_t * coord);
50757 +int extract_key_de(const coord_t * coord, reiser4_key * key);
50758 +int update_key_de(const coord_t * coord, const reiser4_key * key,
50759 + lock_handle * lh);
50760 +char *extract_name_de(const coord_t * coord, char *buf);
50761 +unsigned extract_file_type_de(const coord_t * coord);
50762 +int add_entry_de(struct inode *dir, coord_t * coord,
50763 + lock_handle * lh, const struct dentry *name,
50764 + reiser4_dir_entry_desc * entry);
50765 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
50766 + lock_handle * lh, reiser4_dir_entry_desc * entry);
50767 +int max_name_len_de(const struct inode *dir);
50768 +
50769 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
50770 +
50771 +char *extract_dent_name(const coord_t * coord,
50772 + directory_entry_format * dent, char *buf);
50773 +
50774 +#if REISER4_LARGE_KEY
50775 +#define DE_NAME_BUF_LEN (24)
50776 +#else
50777 +#define DE_NAME_BUF_LEN (16)
50778 +#endif
50779 +
50780 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
50781 +#endif
50782 +
50783 +/* Make Linus happy.
50784 + Local variables:
50785 + c-indentation-style: "K&R"
50786 + mode-name: "LC"
50787 + c-basic-offset: 8
50788 + tab-width: 8
50789 + fill-column: 120
50790 + End:
50791 +*/
50792 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.33/fs/reiser4/plugin/item/static_stat.c
50793 --- linux-2.6.33.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 01:00:00.000000000 +0100
50794 +++ linux-2.6.33/fs/reiser4/plugin/item/static_stat.c 2010-03-04 19:33:22.000000000 +0100
50795 @@ -0,0 +1,1107 @@
50796 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50797 +
50798 +/* stat data manipulation. */
50799 +
50800 +#include "../../forward.h"
50801 +#include "../../super.h"
50802 +#include "../../vfs_ops.h"
50803 +#include "../../inode.h"
50804 +#include "../../debug.h"
50805 +#include "../../dformat.h"
50806 +#include "../object.h"
50807 +#include "../plugin.h"
50808 +#include "../plugin_header.h"
50809 +#include "static_stat.h"
50810 +#include "item.h"
50811 +
50812 +#include <linux/types.h>
50813 +#include <linux/fs.h>
50814 +
50815 +/* see static_stat.h for explanation */
50816 +
50817 +/* helper function used while we are dumping/loading inode/plugin state
50818 + to/from the stat-data. */
50819 +
50820 +static void move_on(int *length /* space remaining in stat-data */ ,
50821 + char **area /* current coord in stat data */ ,
50822 + int size_of /* how many bytes to move forward */ )
50823 +{
50824 + assert("nikita-615", length != NULL);
50825 + assert("nikita-616", area != NULL);
50826 +
50827 + *length -= size_of;
50828 + *area += size_of;
50829 +
50830 + assert("nikita-617", *length >= 0);
50831 +}
50832 +
50833 +/* helper function used while loading inode/plugin state from stat-data.
50834 + Complain if there is less space in stat-data than was expected.
50835 + Can only happen on disk corruption. */
50836 +static int not_enough_space(struct inode *inode /* object being processed */ ,
50837 + const char *where /* error message */ )
50838 +{
50839 + assert("nikita-618", inode != NULL);
50840 +
50841 + warning("nikita-619", "Not enough space in %llu while loading %s",
50842 + (unsigned long long)get_inode_oid(inode), where);
50843 +
50844 + return RETERR(-EINVAL);
50845 +}
50846 +
50847 +/* helper function used while loading inode/plugin state from
50848 + stat-data. Call it if invalid plugin id was found. */
50849 +static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
50850 + struct inode *inode /* object being processed */ )
50851 +{
50852 + warning("nikita-620", "Unknown plugin %i in %llu",
50853 + id, (unsigned long long)get_inode_oid(inode));
50854 +
50855 + return RETERR(-EINVAL);
50856 +}
50857 +
50858 +/* this is installed as ->init_inode() method of
50859 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
50860 + Copies data from on-disk stat-data format into inode.
50861 + Handles stat-data extensions. */
50862 +/* was sd_load */
50863 +int init_inode_static_sd(struct inode *inode /* object being processed */ ,
50864 + char *sd /* stat-data body */ ,
50865 + int len /* length of stat-data */ )
50866 +{
50867 + int result;
50868 + int bit;
50869 + int chunk;
50870 + __u16 mask;
50871 + __u64 bigmask;
50872 + reiser4_stat_data_base *sd_base;
50873 + reiser4_inode *state;
50874 +
50875 + assert("nikita-625", inode != NULL);
50876 + assert("nikita-626", sd != NULL);
50877 +
50878 + result = 0;
50879 + sd_base = (reiser4_stat_data_base *) sd;
50880 + state = reiser4_inode_data(inode);
50881 + mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
50882 + bigmask = mask;
50883 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
50884 +
50885 + move_on(&len, &sd, sizeof *sd_base);
50886 + for (bit = 0, chunk = 0;
50887 + mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
50888 + ++bit, mask >>= 1) {
50889 + if (((bit + 1) % 16) != 0) {
50890 + /* handle extension */
50891 + sd_ext_plugin *sdplug;
50892 +
50893 + if (bit >= LAST_SD_EXTENSION) {
50894 + warning("vpf-1904",
50895 + "No such extension %i in inode %llu",
50896 + bit,
50897 + (unsigned long long)
50898 + get_inode_oid(inode));
50899 +
50900 + result = RETERR(-EINVAL);
50901 + break;
50902 + }
50903 +
50904 + sdplug = sd_ext_plugin_by_id(bit);
50905 + if (sdplug == NULL) {
50906 + warning("nikita-627",
50907 + "No such extension %i in inode %llu",
50908 + bit,
50909 + (unsigned long long)
50910 + get_inode_oid(inode));
50911 +
50912 + result = RETERR(-EINVAL);
50913 + break;
50914 + }
50915 + if (mask & 1) {
50916 + assert("nikita-628", sdplug->present);
50917 + /* alignment is not supported in node layout
50918 + plugin yet.
50919 + result = align( inode, &len, &sd,
50920 + sdplug -> alignment );
50921 + if( result != 0 )
50922 + return result; */
50923 + result = sdplug->present(inode, &sd, &len);
50924 + } else if (sdplug->absent != NULL)
50925 + result = sdplug->absent(inode);
50926 + if (result)
50927 + break;
50928 + /* else, we are looking at the last bit in 16-bit
50929 + portion of bitmask */
50930 + } else if (mask & 1) {
50931 + /* next portion of bitmask */
50932 + if (len < (int)sizeof(d16)) {
50933 + warning("nikita-629",
50934 + "No space for bitmap in inode %llu",
50935 + (unsigned long long)
50936 + get_inode_oid(inode));
50937 +
50938 + result = RETERR(-EINVAL);
50939 + break;
50940 + }
50941 + mask = le16_to_cpu(get_unaligned((d16 *)sd));
50942 + bigmask <<= 16;
50943 + bigmask |= mask;
50944 + move_on(&len, &sd, sizeof(d16));
50945 + ++chunk;
50946 + if (chunk == 3) {
50947 + if (!(mask & 0x8000)) {
50948 + /* clear last bit */
50949 + mask &= ~0x8000;
50950 + continue;
50951 + }
50952 + /* too much */
50953 + warning("nikita-630",
50954 + "Too many extensions in %llu",
50955 + (unsigned long long)
50956 + get_inode_oid(inode));
50957 +
50958 + result = RETERR(-EINVAL);
50959 + break;
50960 + }
50961 + } else
50962 + /* bitmask exhausted */
50963 + break;
50964 + }
50965 + state->extmask = bigmask;
50966 + /* common initialisations */
50967 + if (len - (bit / 16 * sizeof(d16)) > 0) {
50968 + /* alignment in save_len_static_sd() is taken into account
50969 + -edward */
50970 + warning("nikita-631", "unused space in inode %llu",
50971 + (unsigned long long)get_inode_oid(inode));
50972 + }
50973 +
50974 + return result;
50975 +}
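+
+/* For illustration of the chained-mask layout handled above (values are
+   hypothetical): a base extmask of 0x8003 means LIGHT_WEIGHT_STAT (bit 0)
+   and UNIX_STAT (bit 1) are present, and bit 15 is set, so another 16-bit
+   mask chunk follows in the stat-data body. That d16 is read, merged into
+   @bigmask, and scanning continues; a third continuation chunk that again
+   has its top bit set is rejected with "nikita-630". */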
50976 +
50977 +/* estimates size of stat-data required to store inode.
50978 + Installed as ->save_len() method of
50979 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50980 +/* was sd_len */
50981 +int save_len_static_sd(struct inode *inode /* object being processed */ )
50982 +{
50983 + unsigned int result;
50984 + __u64 mask;
50985 + int bit;
50986 +
50987 + assert("nikita-632", inode != NULL);
50988 +
50989 + result = sizeof(reiser4_stat_data_base);
50990 + mask = reiser4_inode_data(inode)->extmask;
50991 + for (bit = 0; mask != 0; ++bit, mask >>= 1) {
50992 + if (mask & 1) {
50993 + sd_ext_plugin *sdplug;
50994 +
50995 + sdplug = sd_ext_plugin_by_id(bit);
50996 + assert("nikita-633", sdplug != NULL);
50997 + /* no alignment support
50998 + result +=
50999 + round_up( result, sdplug -> alignment ) - result; */
51000 + result += sdplug->save_len(inode);
51001 + }
51002 + }
51003 + result += bit / 16 * sizeof(d16);
51004 + return result;
51005 +}
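+
+/* For illustration: if the highest extension present is PLUGIN_STAT (bit 4),
+   the loop above leaves @bit at 5, and 5 / 16 == 0 extra mask words are
+   needed; a hypothetical extension at bit 20 would leave @bit at 21, adding
+   21 / 16 == 1 extra d16 for the continuation chunk. */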
51006 +
51007 +/* saves inode into stat-data.
51008 + Installed as ->save() method of
51009 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
51010 +/* was sd_save */
51011 +int save_static_sd(struct inode *inode /* object being processed */ ,
51012 + char **area /* where to save stat-data */ )
51013 +{
51014 + int result;
51015 + __u64 emask;
51016 + int bit;
51017 + unsigned int len;
51018 + reiser4_stat_data_base *sd_base;
51019 +
51020 + assert("nikita-634", inode != NULL);
51021 + assert("nikita-635", area != NULL);
51022 +
51023 + result = 0;
51024 + emask = reiser4_inode_data(inode)->extmask;
51025 + sd_base = (reiser4_stat_data_base *) * area;
51026 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
51027 + /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
51028 +
51029 + *area += sizeof *sd_base;
51030 + len = 0xffffffffu;
51031 + for (bit = 0; emask != 0; ++bit, emask >>= 1) {
51032 + if (emask & 1) {
51033 + if ((bit + 1) % 16 != 0) {
51034 + sd_ext_plugin *sdplug;
51035 + sdplug = sd_ext_plugin_by_id(bit);
51036 + assert("nikita-636", sdplug != NULL);
51037 + /* no alignment support yet
51038 + align( inode, &len, area,
51039 + sdplug -> alignment ); */
51040 + result = sdplug->save(inode, area);
51041 + if (result)
51042 + break;
51043 + } else {
51044 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
51045 + (d16 *)(*area));
51046 + /*cputod16((unsigned)(emask & 0xffff),
51047 + (d16 *) * area);*/
51048 + *area += sizeof(d16);
51049 + }
51050 + }
51051 + }
51052 + return result;
51053 +}
51054 +
51055 +/* stat-data extension handling functions. */
51056 +
51057 +static int present_lw_sd(struct inode *inode /* object being processed */ ,
51058 + char **area /* position in stat-data */ ,
51059 + int *len /* remaining length */ )
51060 +{
51061 + if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
51062 + reiser4_light_weight_stat *sd_lw;
51063 +
51064 + sd_lw = (reiser4_light_weight_stat *) * area;
51065 +
51066 + inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
51067 + inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
51068 + inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
51069 + if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
51070 + inode->i_mode &= ~S_IFIFO;
51071 + warning("", "partially converted file is encountered");
51072 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
51073 + }
51074 + move_on(len, area, sizeof *sd_lw);
51075 + return 0;
51076 + } else
51077 + return not_enough_space(inode, "lw sd");
51078 +}
51079 +
51080 +static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
51081 + * processed */ )
51082 +{
51083 + return sizeof(reiser4_light_weight_stat);
51084 +}
51085 +
51086 +static int save_lw_sd(struct inode *inode /* object being processed */ ,
51087 + char **area /* position in stat-data */ )
51088 +{
51089 + reiser4_light_weight_stat *sd;
51090 + mode_t delta;
51091 +
51092 + assert("nikita-2705", inode != NULL);
51093 + assert("nikita-2706", area != NULL);
51094 + assert("nikita-2707", *area != NULL);
51095 +
51096 + sd = (reiser4_light_weight_stat *) * area;
51097 +
51098 + delta = (reiser4_inode_get_flag(inode,
51099 + REISER4_PART_MIXED) ? S_IFIFO : 0);
51100 + put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
51101 + put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
51102 + put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
51103 + *area += sizeof *sd;
51104 + return 0;
51105 +}
51106 +
51107 +static int present_unix_sd(struct inode *inode /* object being processed */ ,
51108 + char **area /* position in stat-data */ ,
51109 + int *len /* remaining length */ )
51110 +{
51111 + assert("nikita-637", inode != NULL);
51112 + assert("nikita-638", area != NULL);
51113 + assert("nikita-639", *area != NULL);
51114 + assert("nikita-640", len != NULL);
51115 + assert("nikita-641", *len > 0);
51116 +
51117 + if (*len >= (int)sizeof(reiser4_unix_stat)) {
51118 + reiser4_unix_stat *sd;
51119 +
51120 + sd = (reiser4_unix_stat *) * area;
51121 +
51122 + inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
51123 + inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
51124 + inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
51125 + inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
51126 + inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
51127 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
51128 + inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
51129 + else
51130 + inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
51131 + move_on(len, area, sizeof *sd);
51132 + return 0;
51133 + } else
51134 + return not_enough_space(inode, "unix sd");
51135 +}
51136 +
51137 +static int absent_unix_sd(struct inode *inode /* object being processed */ )
51138 +{
51139 + inode->i_uid = get_super_private(inode->i_sb)->default_uid;
51140 + inode->i_gid = get_super_private(inode->i_sb)->default_gid;
51141 + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
51142 + inode_set_bytes(inode, inode->i_size);
51143 + /* mark inode as lightweight, so that caller (lookup_common) will
51144 + complete initialisation by copying [ug]id from a parent. */
51145 + reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
51146 + return 0;
51147 +}
51148 +
51149 +/* Audited by: green(2002.06.14) */
51150 +static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
51151 + * processed */ )
51152 +{
51153 + return sizeof(reiser4_unix_stat);
51154 +}
51155 +
51156 +static int save_unix_sd(struct inode *inode /* object being processed */ ,
51157 + char **area /* position in stat-data */ )
51158 +{
51159 + reiser4_unix_stat *sd;
51160 +
51161 + assert("nikita-642", inode != NULL);
51162 + assert("nikita-643", area != NULL);
51163 + assert("nikita-644", *area != NULL);
51164 +
51165 + sd = (reiser4_unix_stat *) * area;
51166 + put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
51167 + put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
51168 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
51169 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
51170 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
51171 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
51172 + put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
51173 + else
51174 + put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
51175 + *area += sizeof *sd;
51176 + return 0;
51177 +}
51178 +
51179 +static int
51180 +present_large_times_sd(struct inode *inode /* object being processed */ ,
51181 + char **area /* position in stat-data */ ,
51182 + int *len /* remaining length */ )
51183 +{
51184 + if (*len >= (int)sizeof(reiser4_large_times_stat)) {
51185 + reiser4_large_times_stat *sd_lt;
51186 +
51187 + sd_lt = (reiser4_large_times_stat *) * area;
51188 +
51189 + inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
51190 + inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
51191 + inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
51192 +
51193 + move_on(len, area, sizeof *sd_lt);
51194 + return 0;
51195 + } else
51196 + return not_enough_space(inode, "large times sd");
51197 +}
51198 +
51199 +static int
51200 +save_len_large_times_sd(struct inode *inode UNUSED_ARG
51201 + /* object being processed */ )
51202 +{
51203 + return sizeof(reiser4_large_times_stat);
51204 +}
51205 +
51206 +static int
51207 +save_large_times_sd(struct inode *inode /* object being processed */ ,
51208 + char **area /* position in stat-data */ )
51209 +{
51210 + reiser4_large_times_stat *sd;
51211 +
51212 + assert("nikita-2817", inode != NULL);
51213 + assert("nikita-2818", area != NULL);
51214 + assert("nikita-2819", *area != NULL);
51215 +
51216 + sd = (reiser4_large_times_stat *) * area;
51217 +
51218 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
51219 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
51220 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
51221 +
51222 + *area += sizeof *sd;
51223 + return 0;
51224 +}
51225 +
51226 +/* symlink stat data extension */
51227 +
51228 +/* allocate memory for symlink target and attach it to inode->i_private */
51229 +static int
51230 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
51231 +{
51232 + assert("vs-845", inode->i_private == NULL);
51233 + assert("vs-846", !reiser4_inode_get_flag(inode,
51234 + REISER4_GENERIC_PTR_USED));
51235 + /* FIXME-VS: this is prone to deadlock. Not more than other similar
51236 + places, though */
51237 + inode->i_private = kmalloc((size_t) len + 1,
51238 + reiser4_ctx_gfp_mask_get());
51239 + if (!inode->i_private)
51240 + return RETERR(-ENOMEM);
51241 +
51242 + memcpy((char *)(inode->i_private), target, (size_t) len);
51243 + ((char *)(inode->i_private))[len] = 0;
51244 + reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
51245 + return 0;
51246 +}
51247 +
51248 +/* this is called on read_inode. There is nothing to do here apart from
51249 + some sanity checks */
51250 +static int present_symlink_sd(struct inode *inode, char **area, int *len)
51251 +{
51252 + int result;
51253 + int length;
51254 + reiser4_symlink_stat *sd;
51255 +
51256 + length = (int)inode->i_size;
51257 + /*
51258 + * *len is the number of bytes in the stat-data item from *area to the
51259 + * end of the item. It must be at least symlink size + 1 for the ending 0
51260 + */
51261 + if (length > *len)
51262 + return not_enough_space(inode, "symlink");
51263 +
51264 + if (*(*area + length) != 0) {
51265 + warning("vs-840", "Symlink is not zero terminated");
51266 + return RETERR(-EIO);
51267 + }
51268 +
51269 + sd = (reiser4_symlink_stat *) * area;
51270 + result = symlink_target_to_inode(inode, sd->body, length);
51271 +
51272 + move_on(len, area, length + 1);
51273 + return result;
51274 +}
51275 +
51276 +static int save_len_symlink_sd(struct inode *inode)
51277 +{
51278 + return inode->i_size + 1;
51279 +}
51280 +
51281 +/* this is called on create and on stat-data update. On update there is
51282 + nothing to do except advance @area */
51283 +static int save_symlink_sd(struct inode *inode, char **area)
51284 +{
51285 + int result;
51286 + int length;
51287 + reiser4_symlink_stat *sd;
51288 +
51289 + length = (int)inode->i_size;
51290 + /* inode->i_size must be set already */
51291 + assert("vs-841", length);
51292 +
51293 + result = 0;
51294 + sd = (reiser4_symlink_stat *) * area;
51295 + if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
51296 + const char *target;
51297 +
51298 + target = (const char *)(inode->i_private);
51299 + inode->i_private = NULL;
51300 +
51301 + result = symlink_target_to_inode(inode, target, length);
51302 +
51303 + /* copy symlink to stat data */
51304 + memcpy(sd->body, target, (size_t) length);
51305 + (*area)[length] = 0;
51306 + } else {
51307 + /* there is nothing to do in update but move area */
51308 + assert("vs-844",
51309 + !memcmp(inode->i_private, sd->body,
51310 + (size_t) length + 1));
51311 + }
51312 +
51313 + *area += (length + 1);
51314 + return result;
51315 +}
51316 +
51317 +static int present_flags_sd(struct inode *inode /* object being processed */ ,
51318 + char **area /* position in stat-data */ ,
51319 + int *len /* remaining length */ )
51320 +{
51321 + assert("nikita-645", inode != NULL);
51322 + assert("nikita-646", area != NULL);
51323 + assert("nikita-647", *area != NULL);
51324 + assert("nikita-648", len != NULL);
51325 + assert("nikita-649", *len > 0);
51326 +
51327 + if (*len >= (int)sizeof(reiser4_flags_stat)) {
51328 + reiser4_flags_stat *sd;
51329 +
51330 + sd = (reiser4_flags_stat *) * area;
51331 + inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
51332 + move_on(len, area, sizeof *sd);
51333 + return 0;
51334 + } else
51335 + return not_enough_space(inode, "generation and attrs");
51336 +}
51337 +
51338 +/* Audited by: green(2002.06.14) */
51339 +static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
51340 + * processed */ )
51341 +{
51342 + return sizeof(reiser4_flags_stat);
51343 +}
51344 +
51345 +static int save_flags_sd(struct inode *inode /* object being processed */ ,
51346 + char **area /* position in stat-data */ )
51347 +{
51348 + reiser4_flags_stat *sd;
51349 +
51350 + assert("nikita-650", inode != NULL);
51351 + assert("nikita-651", area != NULL);
51352 + assert("nikita-652", *area != NULL);
51353 +
51354 + sd = (reiser4_flags_stat *) * area;
51355 + put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
51356 + *area += sizeof *sd;
51357 + return 0;
51358 +}
51359 +
51360 +static int absent_plugin_sd(struct inode *inode);
51361 +static int present_plugin_sd(struct inode *inode /* object being processed */ ,
51362 + char **area /* position in stat-data */ ,
51363 + int *len /* remaining length */,
51364 + int is_pset /* 1 if plugin set, 0 if heir set. */)
51365 +{
51366 + reiser4_plugin_stat *sd;
51367 + reiser4_plugin *plugin;
51368 + reiser4_inode *info;
51369 + int i;
51370 + __u16 mask;
51371 + int result;
51372 + int num_of_plugins;
51373 +
51374 + assert("nikita-653", inode != NULL);
51375 + assert("nikita-654", area != NULL);
51376 + assert("nikita-655", *area != NULL);
51377 + assert("nikita-656", len != NULL);
51378 + assert("nikita-657", *len > 0);
51379 +
51380 + if (*len < (int)sizeof(reiser4_plugin_stat))
51381 + return not_enough_space(inode, "plugin");
51382 +
51383 + sd = (reiser4_plugin_stat *) * area;
51384 + info = reiser4_inode_data(inode);
51385 +
51386 + mask = 0;
51387 + num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
51388 + move_on(len, area, sizeof *sd);
51389 + result = 0;
51390 + for (i = 0; i < num_of_plugins; ++i) {
51391 + reiser4_plugin_slot *slot;
51392 + reiser4_plugin_type type;
51393 + pset_member memb;
51394 +
51395 + slot = (reiser4_plugin_slot *) * area;
51396 + if (*len < (int)sizeof *slot)
51397 + return not_enough_space(inode, "additional plugin");
51398 +
51399 + memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
51400 + type = aset_member_to_type_unsafe(memb);
51401 +
51402 + if (type == REISER4_PLUGIN_TYPES) {
51403 + warning("nikita-3502",
51404 + "wrong %s member (%i) for %llu", is_pset ?
51405 + "pset" : "hset", memb,
51406 + (unsigned long long)get_inode_oid(inode));
51407 + return RETERR(-EINVAL);
51408 + }
51409 + plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
51410 + type, &slot->id);
51411 + if (plugin == NULL)
51412 + return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
51413 +
51414 + /* plugin is loaded into inode, mark this into inode's
51415 + bitmask of loaded non-standard plugins */
51416 + if (!(mask & (1 << memb))) {
51417 + mask |= (1 << memb);
51418 + } else {
51419 + warning("nikita-658", "duplicate plugin for %llu",
51420 + (unsigned long long)get_inode_oid(inode));
51421 + return RETERR(-EINVAL);
51422 + }
51423 + move_on(len, area, sizeof *slot);
51424 + /* load plugin data, if any */
51425 + if (plugin->h.pops != NULL && plugin->h.pops->load)
51426 + result = plugin->h.pops->load(inode, plugin, area, len);
51427 + else
51428 + result = aset_set_unsafe(is_pset ? &info->pset :
51429 + &info->hset, memb, plugin);
51430 + if (result)
51431 + return result;
51432 + }
51433 + if (is_pset) {
51434 + /* if object plugin wasn't loaded from stat-data, guess it by
51435 + mode bits */
51436 + plugin = file_plugin_to_plugin(inode_file_plugin(inode));
51437 + if (plugin == NULL)
51438 + result = absent_plugin_sd(inode);
51439 + info->plugin_mask = mask;
51440 + } else
51441 + info->heir_mask = mask;
51442 +
51443 + return result;
51444 +}
51445 +
51446 +static int present_pset_sd(struct inode *inode, char **area, int *len) {
51447 + return present_plugin_sd(inode, area, len, 1 /* pset */);
51448 +}
51449 +
51450 +/* Determine object plugin for @inode based on i_mode.
51451 +
51452 + Many objects in the reiser4 file system are controlled by standard object
51453 + plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
51454 +
51455 + For such files we don't explicitly store the plugin id in the object's stat
51456 + data. Rather, the required plugin is guessed from the mode bits, where the file "type"
51457 + is encoded (see stat(2)).
51458 +*/
51459 +static int
51460 +guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
51461 +{
51462 + int fplug_id;
51463 + int dplug_id;
51464 + reiser4_inode *info;
51465 +
51466 + assert("nikita-736", inode != NULL);
51467 +
51468 + dplug_id = fplug_id = -1;
51469 +
51470 + switch (inode->i_mode & S_IFMT) {
51471 + case S_IFSOCK:
51472 + case S_IFBLK:
51473 + case S_IFCHR:
51474 + case S_IFIFO:
51475 + fplug_id = SPECIAL_FILE_PLUGIN_ID;
51476 + break;
51477 + case S_IFLNK:
51478 + fplug_id = SYMLINK_FILE_PLUGIN_ID;
51479 + break;
51480 + case S_IFDIR:
51481 + fplug_id = DIRECTORY_FILE_PLUGIN_ID;
51482 + dplug_id = HASHED_DIR_PLUGIN_ID;
51483 + break;
51484 + default:
51485 + warning("nikita-737", "wrong file mode: %o", inode->i_mode);
51486 + return RETERR(-EIO);
51487 + case S_IFREG:
51488 + fplug_id = UNIX_FILE_PLUGIN_ID;
51489 + break;
51490 + }
51491 + info = reiser4_inode_data(inode);
51492 + set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
51493 + plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
51494 + set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
51495 + plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
51496 + return 0;
51497 +}
51498 +
51499 +/* Audited by: green(2002.06.14) */
51500 +static int absent_plugin_sd(struct inode *inode /* object being processed */ )
51501 +{
51502 + int result;
51503 +
51504 + assert("nikita-659", inode != NULL);
51505 +
51506 + result = guess_plugin_by_mode(inode);
51507 + /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
51508 + but setup_inode_ops() will call make_bad_inode().
51509 + Another, more logical but a bit more complex, solution is to add a
51510 + "bad-file plugin". */
51511 + /* FIXME-VS: activate was called here */
51512 + return result;
51513 +}
51514 +
51515 +/* helper function for plugin_sd_save_len(): calculate how much space
51516 + required to save state of given plugin */
51517 +/* Audited by: green(2002.06.14) */
51518 +static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
51519 + struct inode *inode /* object being processed */ ,
51520 + pset_member memb,
51521 + int len, int is_pset)
51522 +{
51523 + reiser4_inode *info;
51524 + assert("nikita-661", inode != NULL);
51525 +
51526 + if (plugin == NULL)
51527 + return len;
51528 +
51529 + info = reiser4_inode_data(inode);
51530 + if (is_pset ?
51531 + info->plugin_mask & (1 << memb) :
51532 + info->heir_mask & (1 << memb)) {
51533 + len += sizeof(reiser4_plugin_slot);
51534 + if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
51535 + /* non-standard plugin, call method */
51536 + /* commented as it is incompatible with alignment
51537 + * policy in save_plug() -edward */
51538 + /* len = round_up(len, plugin->h.pops->alignment); */
51539 + len += plugin->h.pops->save_len(inode, plugin);
51540 + }
51541 + }
51542 + return len;
51543 +}
51544 +
51545 +/* calculate how much space is required to save state of all plugins,
51546 + associated with inode */
51547 +static int save_len_plugin_sd(struct inode *inode /* object being processed */,
51548 + int is_pset)
51549 +{
51550 + int len;
51551 + int last;
51552 + reiser4_inode *state;
51553 + pset_member memb;
51554 +
51555 + assert("nikita-663", inode != NULL);
51556 +
51557 + state = reiser4_inode_data(inode);
51558 +
51559 + /* common case: no non-standard plugins */
51560 + if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51561 + return 0;
51562 + len = sizeof(reiser4_plugin_stat);
51563 + last = PSET_LAST;
51564 +
51565 + for (memb = 0; memb < last; ++memb) {
51566 + len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
51567 + inode, memb, len, is_pset);
51568 + }
51569 + assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
51570 + return len;
51571 +}
51572 +
51573 +static int save_len_pset_sd(struct inode *inode) {
51574 + return save_len_plugin_sd(inode, 1 /* pset */);
51575 +}
51576 +
51577 +/* helper function for plugin_sd_save(): save plugin, associated with
51578 + inode. */
51579 +static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
51580 + struct inode *inode /* object being processed */ ,
51581 + int memb /* what element of pset is saved */ ,
51582 + char **area /* position in stat-data */ ,
51583 + int *count /* incremented if plugin were actually saved. */,
51584 + int is_pset /* 1 for plugin set, 0 for heir set */)
51585 +{
51586 + reiser4_plugin_slot *slot;
51587 + int fake_len;
51588 + int result;
51589 +
51590 + assert("nikita-665", inode != NULL);
51591 + assert("nikita-666", area != NULL);
51592 + assert("nikita-667", *area != NULL);
51593 +
51594 + if (plugin == NULL)
51595 + return 0;
51596 +
51597 + if (is_pset ?
51598 + !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
51599 + !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
51600 + return 0;
51601 + slot = (reiser4_plugin_slot *) * area;
51602 + put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
51603 + put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
51604 + fake_len = (int)0xffff;
51605 + move_on(&fake_len, area, sizeof *slot);
51606 + ++*count;
51607 + result = 0;
51608 + if (plugin->h.pops != NULL) {
51609 + if (plugin->h.pops->save != NULL)
51610 + result = plugin->h.pops->save(inode, plugin, area);
51611 + }
51612 + return result;
51613 +}
51614 +
51615 +/* save state of all non-standard plugins associated with inode */
51616 +static int save_plugin_sd(struct inode *inode /* object being processed */ ,
51617 + char **area /* position in stat-data */,
51618 + int is_pset /* 1 for pset, 0 for hset */)
51619 +{
51620 + int fake_len;
51621 + int result = 0;
51622 + int num_of_plugins;
51623 + reiser4_plugin_stat *sd;
51624 + reiser4_inode *state;
51625 + pset_member memb;
51626 +
51627 + assert("nikita-669", inode != NULL);
51628 + assert("nikita-670", area != NULL);
51629 + assert("nikita-671", *area != NULL);
51630 +
51631 + state = reiser4_inode_data(inode);
51632 + if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51633 + return 0;
51634 + sd = (reiser4_plugin_stat *) * area;
51635 + fake_len = (int)0xffff;
51636 + move_on(&fake_len, area, sizeof *sd);
51637 +
51638 + num_of_plugins = 0;
51639 + for (memb = 0; memb < PSET_LAST; ++memb) {
51640 + result = save_plug(aset_get(is_pset ? state->pset : state->hset,
51641 + memb),
51642 + inode, memb, area, &num_of_plugins, is_pset);
51643 + if (result != 0)
51644 + break;
51645 + }
51646 +
51647 + put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
51648 + return result;
51649 +}
51650 +
51651 +static int save_pset_sd(struct inode *inode, char **area) {
51652 + return save_plugin_sd(inode, area, 1 /* pset */);
51653 +}
51654 +
51655 +static int present_hset_sd(struct inode *inode, char **area, int *len) {
51656 + return present_plugin_sd(inode, area, len, 0 /* hset */);
51657 +}
51658 +
51659 +static int save_len_hset_sd(struct inode *inode) {
51660 + return save_len_plugin_sd(inode, 0 /* hset */);
51661 +}
51662 +
51663 +static int save_hset_sd(struct inode *inode, char **area) {
51664 + return save_plugin_sd(inode, area, 0 /* hset */);
51665 +}
51666 +
51667 +/* helper function for crypto_sd_present(), crypto_sd_save.
51668 + Extract crypto info from stat-data and attach it to inode */
51669 +static int extract_crypto_info (struct inode * inode,
51670 + reiser4_crypto_stat * sd)
51671 +{
51672 + struct reiser4_crypto_info * info;
51673 + assert("edward-11", !inode_crypto_info(inode));
51674 + assert("edward-1413",
51675 + !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
51676 + /* create and attach a crypto-stat without secret key loaded */
51677 + info = reiser4_alloc_crypto_info(inode);
51678 + if (IS_ERR(info))
51679 + return PTR_ERR(info);
51680 + info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
51681 + memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
51682 + reiser4_attach_crypto_info(inode, info);
51683 + reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51684 + return 0;
51685 +}
51686 +
51687 +/* crypto stat-data extension */
51688 +
51689 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
51690 +{
51691 + int result;
51692 + reiser4_crypto_stat *sd;
51693 + digest_plugin *dplug = inode_digest_plugin(inode);
51694 +
51695 + assert("edward-06", dplug != NULL);
51696 + assert("edward-684", dplug->fipsize);
51697 + assert("edward-07", area != NULL);
51698 + assert("edward-08", *area != NULL);
51699 + assert("edward-09", len != NULL);
51700 + assert("edward-10", *len > 0);
51701 +
51702 + if (*len < (int)sizeof(reiser4_crypto_stat)) {
51703 + return not_enough_space(inode, "crypto-sd");
51704 + }
51705 + /* *len is number of bytes in stat data item from *area to the end of
51706 + item. It must be not less than size of this extension */
51707 + assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
51708 +
51709 + sd = (reiser4_crypto_stat *) * area;
51710 + result = extract_crypto_info(inode, sd);
51711 + move_on(len, area, sizeof(*sd) + dplug->fipsize);
51712 +
51713 + return result;
51714 +}
51715 +
51716 +static int save_len_crypto_sd(struct inode *inode)
51717 +{
51718 + return sizeof(reiser4_crypto_stat) +
51719 + inode_digest_plugin(inode)->fipsize;
51720 +}
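+
+/* For illustration: with a digest plugin whose ->fipsize is 32 (a
+   sha256-sized digest, hypothetical here), the crypto extension occupies
+   sizeof(reiser4_crypto_stat) + 32 == 34 bytes: a 16-bit key size followed
+   by a 32-byte key id. */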
51721 +
51722 +static int save_crypto_sd(struct inode *inode, char **area)
51723 +{
51724 + int result = 0;
51725 + reiser4_crypto_stat *sd;
51726 + struct reiser4_crypto_info * info = inode_crypto_info(inode);
51727 + digest_plugin *dplug = inode_digest_plugin(inode);
51728 +
51729 + assert("edward-12", dplug != NULL);
51730 + assert("edward-13", area != NULL);
51731 + assert("edward-14", *area != NULL);
51732 + assert("edward-15", info != NULL);
51733 + assert("edward-1414", info->keyid != NULL);
51734 + assert("edward-1415", info->keysize != 0);
51735 + assert("edward-76", reiser4_inode_data(inode) != NULL);
51736 +
51737 + if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
51738 + /* file is just created */
51739 + sd = (reiser4_crypto_stat *) *area;
51740 + /* copy everything but private key to the disk stat-data */
51741 + put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
51742 + memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
51743 + reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51744 + }
51745 + *area += (sizeof(*sd) + dplug->fipsize);
51746 + return result;
51747 +}
51748 +
51749 +static int eio(struct inode *inode, char **area, int *len)
51750 +{
51751 + return RETERR(-EIO);
51752 +}
51753 +
51754 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
51755 + [LIGHT_WEIGHT_STAT] = {
51756 + .h = {
51757 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51758 + .id = LIGHT_WEIGHT_STAT,
51759 + .pops = NULL,
51760 + .label = "light-weight sd",
51761 + .desc = "sd for light-weight files",
51762 + .linkage = {NULL,NULL}
51763 + },
51764 + .present = present_lw_sd,
51765 + .absent = NULL,
51766 + .save_len = save_len_lw_sd,
51767 + .save = save_lw_sd,
51768 + .alignment = 8
51769 + },
51770 + [UNIX_STAT] = {
51771 + .h = {
51772 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51773 + .id = UNIX_STAT,
51774 + .pops = NULL,
51775 + .label = "unix-sd",
51776 + .desc = "unix stat-data fields",
51777 + .linkage = {NULL,NULL}
51778 + },
51779 + .present = present_unix_sd,
51780 + .absent = absent_unix_sd,
51781 + .save_len = save_len_unix_sd,
51782 + .save = save_unix_sd,
51783 + .alignment = 8
51784 + },
51785 + [LARGE_TIMES_STAT] = {
51786 + .h = {
51787 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51788 + .id = LARGE_TIMES_STAT,
51789 + .pops = NULL,
51790 + .label = "64time-sd",
51791 + .desc = "nanosecond resolution for times",
51792 + .linkage = {NULL,NULL}
51793 + },
51794 + .present = present_large_times_sd,
51795 + .absent = NULL,
51796 + .save_len = save_len_large_times_sd,
51797 + .save = save_large_times_sd,
51798 + .alignment = 8
51799 + },
51800 + [SYMLINK_STAT] = {
51801 + /* stat data of symlink has this extension */
51802 + .h = {
51803 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51804 + .id = SYMLINK_STAT,
51805 + .pops = NULL,
51806 + .label = "symlink-sd",
51807 + .desc =
51808 + "stat data is appended with symlink name",
51809 + .linkage = {NULL,NULL}
51810 + },
51811 + .present = present_symlink_sd,
51812 + .absent = NULL,
51813 + .save_len = save_len_symlink_sd,
51814 + .save = save_symlink_sd,
51815 + .alignment = 8
51816 + },
51817 + [PLUGIN_STAT] = {
51818 + .h = {
51819 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51820 + .id = PLUGIN_STAT,
51821 + .pops = NULL,
51822 + .label = "plugin-sd",
51823 + .desc = "plugin stat-data fields",
51824 + .linkage = {NULL,NULL}
51825 + },
51826 + .present = present_pset_sd,
51827 + .absent = absent_plugin_sd,
51828 + .save_len = save_len_pset_sd,
51829 + .save = save_pset_sd,
51830 + .alignment = 8
51831 + },
51832 + [HEIR_STAT] = {
51833 + .h = {
51834 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51835 + .id = HEIR_STAT,
51836 + .pops = NULL,
51837 + .label = "heir-plugin-sd",
51838 + .desc = "heir plugin stat-data fields",
51839 + .linkage = {NULL,NULL}
51840 + },
51841 + .present = present_hset_sd,
51842 + .absent = NULL,
51843 + .save_len = save_len_hset_sd,
51844 + .save = save_hset_sd,
51845 + .alignment = 8
51846 + },
51847 + [FLAGS_STAT] = {
51848 + .h = {
51849 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51850 + .id = FLAGS_STAT,
51851 + .pops = NULL,
51852 + .label = "flags-sd",
51853 + .desc = "inode bit flags",
51854 + .linkage = {NULL, NULL}
51855 + },
51856 + .present = present_flags_sd,
51857 + .absent = NULL,
51858 + .save_len = save_len_flags_sd,
51859 + .save = save_flags_sd,
51860 + .alignment = 8
51861 + },
51862 + [CAPABILITIES_STAT] = {
51863 + .h = {
51864 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51865 + .id = CAPABILITIES_STAT,
51866 + .pops = NULL,
51867 + .label = "capabilities-sd",
51868 + .desc = "capabilities",
51869 + .linkage = {NULL, NULL}
51870 + },
51871 + .present = eio,
51872 + .absent = NULL,
51873 + .save_len = save_len_flags_sd,
51874 + .save = save_flags_sd,
51875 + .alignment = 8
51876 + },
51877 + [CRYPTO_STAT] = {
51878 + .h = {
51879 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51880 + .id = CRYPTO_STAT,
51881 + .pops = NULL,
51882 + .label = "crypto-sd",
51883 + .desc = "secret key size and id",
51884 + .linkage = {NULL, NULL}
51885 + },
51886 + .present = present_crypto_sd,
51887 + .absent = NULL,
51888 + .save_len = save_len_crypto_sd,
51889 + .save = save_crypto_sd,
51890 + .alignment = 8
51891 + }
51892 +};
51893 +
51894 +/* Make Linus happy.
51895 + Local variables:
51896 + c-indentation-style: "K&R"
51897 + mode-name: "LC"
51898 + c-basic-offset: 8
51899 + tab-width: 8
51900 + fill-column: 120
51901 + End:
51902 +*/
51903 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.33/fs/reiser4/plugin/item/static_stat.h
51904 --- linux-2.6.33.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 01:00:00.000000000 +0100
51905 +++ linux-2.6.33/fs/reiser4/plugin/item/static_stat.h 2010-03-04 19:33:22.000000000 +0100
51906 @@ -0,0 +1,224 @@
51907 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51908 +
51909 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
51910 +
51911 +When a file has exactly the fields needed by the
51912 +stat() syscall, it is more compact to store those fields in this
51913 +statically sized struct.
51914 +
51915 +If this item does not exist, then all stats are dynamically resolved.
51916 +At the moment, we either resolve all stats dynamically or all of them
51917 +statically. If you think this is not fully optimal, and the rest of
51918 +reiser4 is working, then fix it...:-)
51919 +
51920 +*/
51921 +
51922 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
51923 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
51924 +
51925 +#include "../../forward.h"
51926 +#include "../../dformat.h"
51927 +
51928 +#include <linux/fs.h> /* for struct inode */
51929 +
51930 +/* Stat data layout: goals and implementation.
51931 +
51932 + We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
51933 + them, including not having semantic metadata attached to them.
51934 +
51935 + There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
51936 + want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
51937 + sized structure because the statically sized structure knows without recording it what the names and lengths of the
51938 + attributes are.
51939 +
51940 + This leads to a natural compromise, which is to special case those files which have simply the standard unix file
51941 + attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
51942 + file in their use of file attributes.
51943 +
51944 + Yet this compromise deserves to be compromised a little.
51945 +
51946 + We accommodate the case where you have no more than the standard unix file attributes by using an "extension
51947 + bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum).
51948 +
51949 + If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
51950 + from the parent directory (such as uid, gid) or initialised to some sane values.
51951 +
51952 + To capitalize on existing code infrastructure, extensions are
51953 + implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
51954 + Each stat-data extension plugin implements four methods:
51955 +
51956 + ->present() called by sd_load() when this extension is found in stat-data
51957 + ->absent() called by sd_load() when this extension is not found in stat-data
51958 + ->save_len() called by sd_len() to calculate total length of stat-data
51959 + ->save() called by sd_save() to store extension data into stat-data
51960 +
51961 + Implementation is in fs/reiser4/plugin/item/static_stat.c
51962 +*/
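+
+/* A minimal extension plugin, sketched for illustration (the "foo" names
+   are hypothetical, not part of reiser4):
+
+	static int present_foo_sd(struct inode *inode, char **area, int *len)
+	{
+		if (*len < (int)sizeof(reiser4_foo_stat))
+			return not_enough_space(inode, "foo sd");
+		... copy fields from (reiser4_foo_stat *)*area into inode ...
+		move_on(len, area, sizeof(reiser4_foo_stat));
+		return 0;
+	}
+
+   ->save_len() would simply return sizeof(reiser4_foo_stat), and ->save()
+   would copy the fields back and advance *area by the same amount. */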
51963 +
51964 +/* stat-data extension. Please order this by presumed frequency of use */
51965 +typedef enum {
51966 + /* support for light-weight files */
51967 + LIGHT_WEIGHT_STAT,
51968 + /* data required to implement unix stat(2) call. Layout is in
51969 + reiser4_unix_stat. If this is not present, file is light-weight */
51970 + UNIX_STAT,
51971 + /* this contains an additional set of 32bit [anc]time fields to implement
51972 + nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
51973 + of this extension is governed by the 32bittimes mount option. */
51974 + LARGE_TIMES_STAT,
51975 + /* stat data has link name included */
51976 + SYMLINK_STAT,
51977 + /* on-disk slots of non-standard plugins for main plugin table
51978 + (@reiser4_inode->pset), that is, plugins that cannot be deduced
51979 + from the file mode bits, for example, aggregation, interpolation, etc. */
51980 + PLUGIN_STAT,
51981 + /* this extension contains persistent inode flags. These flags are
51982 + single bits: immutable, append-only, etc. Layout is in
51983 + reiser4_flags_stat. */
51984 + FLAGS_STAT,
51985 + /* this extension contains capabilities sets, associated with this
51986 + file. Layout is in reiser4_capabilities_stat */
51987 + CAPABILITIES_STAT,
51988 + /* this extension contains size and public id of the secret key.
51989 + Layout is in reiser4_crypto_stat */
51990 + CRYPTO_STAT,
51991 + /* on-disk slots of non-default plugins for inheritance, which
51992 + are extracted to special plugin table (@reiser4_inode->hset).
51993 + By default, children of the object will inherit plugins from
51994 + its main plugin table (pset). */
51995 + HEIR_STAT,
51996 + LAST_SD_EXTENSION,
51997 + /*
51998 + * init_inode_static_sd() iterates over extension mask until all
51999 + * non-zero bits are processed. This means that neither the ->present()
52000 + * nor the ->absent() method will be called for stat-data extensions that
52001 + * come after the last present extension. But for some basic extensions
52002 + * we want either the ->absent() or the ->present() method to be called,
52003 + * because these extensions set up something in the inode even when not
52004 + * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
52005 + * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
52006 + * ->present(), or ->absent() method will be called, independently of
52007 + * what other extensions are present.
52008 + */
52009 + LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
52010 +} sd_ext_bits;
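+
+/* For illustration: a stat-data whose extmask is just 0x0001 (only
+   LIGHT_WEIGHT_STAT present) is still scanned up to PLUGIN_STAT, so that
+   absent_unix_sd() and absent_plugin_sd() get a chance to set default
+   uid/gid/times and to guess the file plugin from the mode bits. */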
52011 +
52012 +/* minimal stat-data. This allows support for light-weight files. */
52013 +typedef struct reiser4_stat_data_base {
52014 + /* 0 */ __le16 extmask;
52015 + /* 2 */
52016 +} PACKED reiser4_stat_data_base;
52017 +
52018 +typedef struct reiser4_light_weight_stat {
52019 + /* 0 */ __le16 mode;
52020 + /* 2 */ __le32 nlink;
52021 + /* 6 */ __le64 size;
52022 + /* size in bytes */
52023 + /* 14 */
52024 +} PACKED reiser4_light_weight_stat;
52025 +
52026 +typedef struct reiser4_unix_stat {
52027 + /* owner id */
52028 + /* 0 */ __le32 uid;
52029 + /* group id */
52030 + /* 4 */ __le32 gid;
52031 + /* access time */
52032 + /* 8 */ __le32 atime;
52033 + /* modification time */
52034 + /* 12 */ __le32 mtime;
52035 + /* change time */
52036 + /* 16 */ __le32 ctime;
52037 + union {
52038 + /* minor:major for device files */
52039 + /* 20 */ __le64 rdev;
52040 + /* bytes used by file */
52041 + /* 20 */ __le64 bytes;
52042 + } u;
52043 + /* 28 */
52044 +} PACKED reiser4_unix_stat;
52045 +
52046 +/* symlink stored as part of inode */
52047 +typedef struct reiser4_symlink_stat {
52048 + char body[0];
52049 +} PACKED reiser4_symlink_stat;
52050 +
52051 +typedef struct reiser4_plugin_slot {
52052 + /* 0 */ __le16 pset_memb;
52053 + /* 2 */ __le16 id;
52054 + /* 4 *//* here plugin stores its persistent state */
52055 +} PACKED reiser4_plugin_slot;
52056 +
52057 +/* stat-data extension for files with non-standard plugin. */
52058 +typedef struct reiser4_plugin_stat {
52059 + /* number of additional plugins, associated with this object */
52060 + /* 0 */ __le16 plugins_no;
52061 + /* 2 */ reiser4_plugin_slot slot[0];
52062 + /* 2 */
52063 +} PACKED reiser4_plugin_stat;
52064 +
52065 +/* stat-data extension for inode flags. Currently it is just fixed-width 32
52066 + * bit mask. If the need arises, this can be replaced with a variable width
52067 + * bitmask. */
52068 +typedef struct reiser4_flags_stat {
52069 + /* 0 */ __le32 flags;
52070 + /* 4 */
52071 +} PACKED reiser4_flags_stat;
52072 +
52073 +typedef struct reiser4_capabilities_stat {
52074 + /* 0 */ __le32 effective;
52075 + /* 8 */ __le32 permitted;
52076 + /* 16 */
52077 +} PACKED reiser4_capabilities_stat;
52078 +
52079 +typedef struct reiser4_cluster_stat {
52080 +/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */
52081 + /* 0 */ d8 cluster_shift;
52082 + /* 1 */
52083 +} PACKED reiser4_cluster_stat;
52084 +
52085 +typedef struct reiser4_crypto_stat {
52086 + /* secret key size, bits */
52087 + /* 0 */ d16 keysize;
52088 + /* secret key id */
52089 + /* 2 */ d8 keyid[0];
52090 + /* 2 */
52091 +} PACKED reiser4_crypto_stat;
52092 +
52093 +typedef struct reiser4_large_times_stat {
52094 + /* access time */
52095 + /* 0 */ d32 atime;
52096 + /* modification time */
52097 + /* 4 */ d32 mtime;
52098 + /* change time */
52099 + /* 8 */ d32 ctime;
52100 + /* 12 */
52101 +} PACKED reiser4_large_times_stat;
52102 +
52103 +/* this structure is filled by sd_item_stat */
52104 +typedef struct sd_stat {
52105 + int dirs;
52106 + int files;
52107 + int others;
52108 +} sd_stat;
52109 +
52110 +/* plugin->item.common.* */
52111 +extern void print_sd(const char *prefix, coord_t * coord);
52112 +extern void item_stat_static_sd(const coord_t * coord, void *vp);
52113 +
52114 +/* plugin->item.s.sd.* */
52115 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
52116 +extern int save_len_static_sd(struct inode *inode);
52117 +extern int save_static_sd(struct inode *inode, char **area);
52118 +
52119 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
52120 +#endif
52121 +
52122 +/* Make Linus happy.
52123 + Local variables:
52124 + c-indentation-style: "K&R"
52125 + mode-name: "LC"
52126 + c-basic-offset: 8
52127 + tab-width: 8
52128 + fill-column: 120
52129 + End:
52130 +*/
52131 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/tail.c linux-2.6.33/fs/reiser4/plugin/item/tail.c
52132 --- linux-2.6.33.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 01:00:00.000000000 +0100
52133 +++ linux-2.6.33/fs/reiser4/plugin/item/tail.c 2010-03-04 19:33:22.000000000 +0100
52134 @@ -0,0 +1,807 @@
52135 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52136 +
52137 +#include "item.h"
52138 +#include "../../inode.h"
52139 +#include "../../page_cache.h"
52140 +#include "../../carry.h"
52141 +#include "../../vfs_ops.h"
52142 +
52143 +#include <linux/quotaops.h>
52144 +#include <asm/uaccess.h>
52145 +#include <linux/swap.h>
52146 +#include <linux/writeback.h>
52147 +
52148 +/* plugin->u.item.b.max_key_inside */
52149 +reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
52150 +{
52151 + item_key_by_coord(coord, key);
52152 + set_key_offset(key, get_key_offset(reiser4_max_key()));
52153 + return key;
52154 +}
52155 +
52156 +/* plugin->u.item.b.can_contain_key */
52157 +int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
52158 + const reiser4_item_data *data)
52159 +{
52160 + reiser4_key item_key;
52161 +
52162 + if (item_plugin_by_coord(coord) != data->iplug)
52163 + return 0;
52164 +
52165 + item_key_by_coord(coord, &item_key);
52166 + if (get_key_locality(key) != get_key_locality(&item_key) ||
52167 + get_key_objectid(key) != get_key_objectid(&item_key))
52168 + return 0;
52169 +
52170 + return 1;
52171 +}
52172 +
52173 +/* plugin->u.item.b.mergeable
52174 + first item is of tail type */
52175 +/* Audited by: green(2002.06.14) */
52176 +int mergeable_tail(const coord_t *p1, const coord_t *p2)
52177 +{
52178 + reiser4_key key1, key2;
52179 +
52180 + assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
52181 + UNIX_FILE_METADATA_ITEM_TYPE));
52182 + assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
52183 +
52184 + if (item_id_by_coord(p2) != FORMATTING_ID) {
52185 + /* second item is of another type */
52186 + return 0;
52187 + }
52188 +
52189 + item_key_by_coord(p1, &key1);
52190 + item_key_by_coord(p2, &key2);
52191 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
52192 + get_key_objectid(&key1) != get_key_objectid(&key2)
52193 + || get_key_type(&key1) != get_key_type(&key2)) {
52194 + /* items of different objects */
52195 + return 0;
52196 + }
52197 + if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
52198 + /* not adjacent items */
52199 + return 0;
52200 + }
52201 + return 1;
52202 +}
52203 +
52204 +/* plugin->u.item.b.print
52205 + plugin->u.item.b.check */
52206 +
52207 +/* plugin->u.item.b.nr_units */
52208 +pos_in_node_t nr_units_tail(const coord_t * coord)
52209 +{
52210 + return item_length_by_coord(coord);
52211 +}
52212 +
52213 +/* plugin->u.item.b.lookup */
52214 +lookup_result
52215 +lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
52216 +{
52217 + reiser4_key item_key;
52218 + __u64 lookuped, offset;
52219 + unsigned nr_units;
52220 +
52221 + item_key_by_coord(coord, &item_key);
52222 + offset = get_key_offset(&item_key);
52223 + nr_units = nr_units_tail(coord);
52224 +
52225 + /* key we are looking for must be greater than key of item @coord */
52226 + assert("vs-416", keygt(key, &item_key));
52227 +
52228 + /* offset we are looking for */
52229 + lookuped = get_key_offset(key);
52230 +
52231 + if (lookuped >= offset && lookuped < offset + nr_units) {
52232 + /* byte we are looking for is in this item */
52233 + coord->unit_pos = lookuped - offset;
52234 + coord->between = AT_UNIT;
52235 + return CBK_COORD_FOUND;
52236 + }
52237 +
52238 + /* set coord after last unit */
52239 + coord->unit_pos = nr_units - 1;
52240 + coord->between = AFTER_UNIT;
52241 + return bias ==
52242 + FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
52243 +}
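+
+/* For illustration: a tail item keyed at file offset 4096 with 100 units
+   covers bytes 4096..4195. Looking up offset 4150 yields unit_pos 54 and
+   CBK_COORD_FOUND; offset 4200 lands AFTER_UNIT and is reported found only
+   under the FIND_MAX_NOT_MORE_THAN bias. */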
52244 +
52245 +/* plugin->u.item.b.paste */
52246 +int
52247 +paste_tail(coord_t *coord, reiser4_item_data *data,
52248 + carry_plugin_info *info UNUSED_ARG)
52249 +{
52250 + unsigned old_item_length;
52251 + char *item;
52252 +
52253 + /* length the item had before resizing has been performed */
52254 + old_item_length = item_length_by_coord(coord) - data->length;
52255 +
52256 + /* tail items never get pasted in the middle */
52257 + assert("vs-363",
52258 + (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
52259 + (coord->unit_pos == old_item_length - 1 &&
52260 + coord->between == AFTER_UNIT) ||
52261 + (coord->unit_pos == 0 && old_item_length == 0
52262 + && coord->between == AT_UNIT));
52263 +
52264 + item = item_body_by_coord(coord);
52265 + if (coord->unit_pos == 0)
52266 + /* make space for pasted data when pasting at the beginning of
52267 + the item */
52268 + memmove(item + data->length, item, old_item_length);
52269 +
52270 + if (coord->between == AFTER_UNIT)
52271 + coord->unit_pos++;
52272 +
52273 + if (data->data) {
52274 + assert("vs-554", data->user == 0 || data->user == 1);
52275 + if (data->user) {
52276 + assert("nikita-3035", reiser4_schedulable());
52277 + /* copy from user space */
52278 + if (__copy_from_user(item + coord->unit_pos,
52279 + (const char __user *)data->data,
52280 + (unsigned)data->length))
52281 + return RETERR(-EFAULT);
52282 + } else
52283 + /* copy from kernel space */
52284 + memcpy(item + coord->unit_pos, data->data,
52285 + (unsigned)data->length);
52286 + } else {
52287 + memset(item + coord->unit_pos, 0, (unsigned)data->length);
52288 + }
52289 + return 0;
52290 +}
52291 +
52292 +/* plugin->u.item.b.fast_paste */
52293 +
52294 +/* plugin->u.item.b.can_shift
52295 + number of units is returned via return value, number of bytes via @size. For
52296 + tail items they coincide */
52297 +int
52298 +can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
52299 + znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
52300 + unsigned *size, unsigned want)
52301 +{
52302 + /* make sure that we do not want to shift more than we have */
52303 + assert("vs-364", want > 0
52304 + && want <= (unsigned)item_length_by_coord(source));
52305 +
52306 + *size = min(want, free_space);
52307 + return *size;
52308 +}
52309 +
52310 +/* plugin->u.item.b.copy_units */
52311 +void
52312 +copy_units_tail(coord_t * target, coord_t * source,
52313 + unsigned from, unsigned count,
52314 + shift_direction where_is_free_space,
52315 + unsigned free_space UNUSED_ARG)
52316 +{
52317 + /* make sure that item @target is expanded already */
52318 + assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
52319 + assert("vs-370", free_space >= count);
52320 +
52321 + if (where_is_free_space == SHIFT_LEFT) {
52322 + /* append item @target with @count first bytes of @source */
52323 + assert("vs-365", from == 0);
52324 +
52325 + memcpy((char *)item_body_by_coord(target) +
52326 + item_length_by_coord(target) - count,
52327 + (char *)item_body_by_coord(source), count);
52328 + } else {
52329 + /* target item is moved to right already */
52330 + reiser4_key key;
52331 +
52332 + assert("vs-367",
52333 + (unsigned)item_length_by_coord(source) == from + count);
52334 +
52335 + memcpy((char *)item_body_by_coord(target),
52336 + (char *)item_body_by_coord(source) + from, count);
52337 +
52338 + /* new units are inserted before first unit in an item,
52339 + therefore, we have to update item key */
52340 + item_key_by_coord(source, &key);
52341 + set_key_offset(&key, get_key_offset(&key) + from);
52342 +
52343 + node_plugin_by_node(target->node)->update_item_key(target, &key,
52344 + NULL /*info */);
52345 + }
52346 +}
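+
+/* For illustration: shifting the last 10 units of a 30-unit item keyed at
+   offset 100 into the right neighbour copies source bytes 20..29 into the
+   target and sets the target item's key offset to 120 (100 + @from), so
+   byte offsets within the file stay contiguous. */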
52347 +
52348 +/* plugin->u.item.b.create_hook */
52349 +
52350 +/* item_plugin->b.kill_hook
52351 + this is called when @count units starting from @from-th one are going to be removed
52352 + */
52353 +int
52354 +kill_hook_tail(const coord_t * coord, pos_in_node_t from,
52355 + pos_in_node_t count, struct carry_kill_data *kdata)
52356 +{
52357 + reiser4_key key;
52358 + loff_t start, end;
52359 +
52360 + assert("vs-1577", kdata);
52361 + assert("vs-1579", kdata->inode);
52362 +
52363 + item_key_by_coord(coord, &key);
52364 + start = get_key_offset(&key) + from;
52365 + end = start + count;
52366 + fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
52367 + return 0;
52368 +}
52369 +
52370 +/* plugin->u.item.b.shift_hook */
52371 +
52372 +/* helper for kill_units_tail and cut_units_tail */
52373 +static int
52374 +do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52375 + reiser4_key * smallest_removed, reiser4_key * new_first)
52376 +{
52377 + pos_in_node_t count;
52378 +
52379 +	/* this method is only called to remove part of an item */
52380 + assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
52381 +	/* tail items are never cut from the middle of an item */
52382 + assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
52383 + assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
52384 +
52385 + count = to - from + 1;
52386 +
52387 + if (smallest_removed) {
52388 + /* store smallest key removed */
52389 + item_key_by_coord(coord, smallest_removed);
52390 + set_key_offset(smallest_removed,
52391 + get_key_offset(smallest_removed) + from);
52392 + }
52393 + if (new_first) {
52394 + /* head of item is cut */
52395 + assert("vs-1529", from == 0);
52396 +
52397 + item_key_by_coord(coord, new_first);
52398 + set_key_offset(new_first,
52399 + get_key_offset(new_first) + from + count);
52400 + }
52401 +
52402 + if (REISER4_DEBUG)
52403 + memset((char *)item_body_by_coord(coord) + from, 0, count);
52404 + return count;
52405 +}
52406 +
52407 +/* plugin->u.item.b.cut_units */
52408 +int
52409 +cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52410 + struct carry_cut_data *cdata UNUSED_ARG,
52411 + reiser4_key * smallest_removed, reiser4_key * new_first)
52412 +{
52413 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52414 +}
52415 +
52416 +/* plugin->u.item.b.kill_units */
52417 +int
52418 +kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52419 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
52420 + reiser4_key * new_first)
52421 +{
52422 + kill_hook_tail(coord, from, to - from + 1, kdata);
52423 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52424 +}
52425 +
52426 +/* plugin->u.item.b.unit_key */
52427 +reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
52428 +{
52429 + assert("vs-375", coord_is_existing_unit(coord));
52430 +
52431 + item_key_by_coord(coord, key);
52432 + set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
52433 +
52434 + return key;
52435 +}
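+
+/*
+ * Example (illustrative): if the item key of a tail item has offset 4096
+ * and coord->unit_pos == 10, the unit key computed above has offset 4106;
+ * each unit of a tail item is exactly one byte.
+ */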
52436 +
52437 +/* plugin->u.item.b.estimate
52438 + plugin->u.item.b.item_data_by_flow */
52439 +
52440 +/* tail readpage function. It is called from readpage_tail(). */
52441 +static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
52442 +{
52443 + tap_t tap;
52444 + int result;
52445 + coord_t coord;
52446 + lock_handle lh;
52447 + int count, mapped;
52448 + struct inode *inode;
52449 + char *pagedata;
52450 +
52451 +	/* save the passed coord so that the tap does not move it. */
52452 + init_lh(&lh);
52453 + copy_lh(&lh, uf_coord->lh);
52454 + inode = page->mapping->host;
52455 + coord_dup(&coord, &uf_coord->coord);
52456 +
52457 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
52458 +
52459 + if ((result = reiser4_tap_load(&tap)))
52460 + goto out_tap_done;
52461 +
52462 +	/* do lookups until the page is filled up. */
52463 + for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
52464 + /* number of bytes to be copied to page */
52465 + count = item_length_by_coord(&coord) - coord.unit_pos;
52466 + if (count > PAGE_CACHE_SIZE - mapped)
52467 + count = PAGE_CACHE_SIZE - mapped;
52468 +
52469 +		/* map @page into kernel address space and get its data address */
52470 + pagedata = kmap_atomic(page, KM_USER0);
52471 +
52472 + /* copy tail item to page */
52473 + memcpy(pagedata + mapped,
52474 + ((char *)item_body_by_coord(&coord) + coord.unit_pos),
52475 + count);
52476 + mapped += count;
52477 +
52478 + flush_dcache_page(page);
52479 +
52480 +		/* unmap @page from kernel address space */
52481 + kunmap_atomic(pagedata, KM_USER0);
52482 +
52483 + /* Getting next tail item. */
52484 + if (mapped < PAGE_CACHE_SIZE) {
52485 + /*
52486 +			 * unlock the page to avoid keeping it locked
52487 +			 * during tree lookup, which takes long-term locks
52488 + */
52489 + unlock_page(page);
52490 +
52491 + /* getting right neighbour. */
52492 + result = go_dir_el(&tap, RIGHT_SIDE, 0);
52493 +
52494 + /* lock page back */
52495 + lock_page(page);
52496 + if (PageUptodate(page)) {
52497 + /*
52498 + * another thread read the page, we have
52499 + * nothing to do
52500 + */
52501 + result = 0;
52502 + goto out_unlock_page;
52503 + }
52504 +
52505 + if (result) {
52506 + if (result == -E_NO_NEIGHBOR) {
52507 + /*
52508 +					 * right neighbor is not a formatted
52509 + * node
52510 + */
52511 + result = 0;
52512 + goto done;
52513 + } else {
52514 + goto out_tap_relse;
52515 + }
52516 + } else {
52517 + if (!inode_file_plugin(inode)->
52518 + owns_item(inode, &coord)) {
52519 + /* item of another file is found */
52520 + result = 0;
52521 + goto done;
52522 + }
52523 + }
52524 + }
52525 + }
52526 +
52527 + done:
52528 + if (mapped != PAGE_CACHE_SIZE)
52529 + zero_user_segment(page, mapped, PAGE_CACHE_SIZE);
52530 + SetPageUptodate(page);
52531 + out_unlock_page:
52532 + unlock_page(page);
52533 + out_tap_relse:
52534 + reiser4_tap_relse(&tap);
52535 + out_tap_done:
52536 + reiser4_tap_done(&tap);
52537 + return result;
52538 +}
52539 +
52540 +/*
52541 + plugin->s.file.readpage
52542 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
52543 + or
52544 +   filemap_fault->reiser4_readpage->readpage_unix_file->readpage_tail
52545 +
52546 +   At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to an existing unit inside a tail
52547 + item. */
52548 +int readpage_tail(void *vp, struct page *page)
52549 +{
52550 + uf_coord_t *uf_coord = vp;
52551 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
52552 + ON_DEBUG(reiser4_key key);
52553 +
52554 + assert("umka-2515", PageLocked(page));
52555 + assert("umka-2516", !PageUptodate(page));
52556 + assert("umka-2517", !jprivate(page) && !PagePrivate(page));
52557 + assert("umka-2518", page->mapping && page->mapping->host);
52558 +
52559 + assert("umka-2519", znode_is_loaded(coord->node));
52560 + assert("umka-2520", item_is_tail(coord));
52561 + assert("umka-2521", coord_is_existing_unit(coord));
52562 + assert("umka-2522", znode_is_rlocked(coord->node));
52563 + assert("umka-2523",
52564 + page->mapping->host->i_ino ==
52565 + get_key_objectid(item_key_by_coord(coord, &key)));
52566 +
52567 + return do_readpage_tail(uf_coord, page);
52568 +}
52569 +
52570 +/**
52571 + * overwrite_tail
52572 + * @flow:
52573 + * @coord:
52574 + *
52575 + * Overwrites tail item or its part by user data. Returns number of bytes
52576 + * written or error code.
52577 + */
52578 +static int overwrite_tail(flow_t *flow, coord_t *coord)
52579 +{
52580 + unsigned count;
52581 +
52582 + assert("vs-570", flow->user == 1);
52583 + assert("vs-946", flow->data);
52584 + assert("vs-947", coord_is_existing_unit(coord));
52585 + assert("vs-948", znode_is_write_locked(coord->node));
52586 + assert("nikita-3036", reiser4_schedulable());
52587 +
52588 + count = item_length_by_coord(coord) - coord->unit_pos;
52589 + if (count > flow->length)
52590 + count = flow->length;
52591 +
52592 + if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
52593 + (const char __user *)flow->data, count))
52594 + return RETERR(-EFAULT);
52595 +
52596 + znode_make_dirty(coord->node);
52597 + return count;
52598 +}
52599 +
52600 +/**
52601 + * insert_first_tail
52602 + * @inode:
52603 + * @flow:
52604 + * @coord:
52605 + * @lh:
52606 + *
52607 + * Returns number of bytes written or error code.
52608 + */
52609 +static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
52610 + coord_t *coord, lock_handle *lh)
52611 +{
52612 + int result;
52613 + loff_t to_write;
52614 + struct unix_file_info *uf_info;
52615 +
52616 + if (get_key_offset(&flow->key) != 0) {
52617 + /*
52618 +		 * the file is empty and the write does not start at the
52619 +		 * beginning of the file. Create a hole at the beginning of the
52620 +		 * file. On success insert_flow returns 0 as the number of
52621 +		 * written bytes, which is what we have to return when padding a
52621 +		 * file with holes
52622 + */
52623 + flow->data = NULL;
52624 + flow->length = get_key_offset(&flow->key);
52625 + set_key_offset(&flow->key, 0);
52626 + /*
52627 +		 * holes in files built of tails are stored as if they were real
52628 +		 * data consisting entirely of zeros. Therefore we have to
52629 + * allocate quota here as well
52630 + */
52631 + if (vfs_dq_alloc_space_nodirty(inode, flow->length))
52632 + return RETERR(-EDQUOT);
52633 + result = reiser4_insert_flow(coord, lh, flow);
52634 + if (flow->length)
52635 + vfs_dq_free_space_nodirty(inode, flow->length);
52636 +
52637 + uf_info = unix_file_inode_data(inode);
52638 +
52639 + /*
52640 + * first item insertion is only possible when writing to empty
52641 + * file or performing tail conversion
52642 + */
52643 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
52644 + (reiser4_inode_get_flag(inode,
52645 + REISER4_PART_MIXED) &&
52646 + reiser4_inode_get_flag(inode,
52647 + REISER4_PART_IN_CONV))));
52648 + /* if file was empty - update its state */
52649 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
52650 + uf_info->container = UF_CONTAINER_TAILS;
52651 + return result;
52652 + }
52653 +
52654 + /* check quota before appending data */
52655 + if (vfs_dq_alloc_space_nodirty(inode, flow->length))
52656 + return RETERR(-EDQUOT);
52657 +
52658 + to_write = flow->length;
52659 + result = reiser4_insert_flow(coord, lh, flow);
52660 + if (flow->length)
52661 + vfs_dq_free_space_nodirty(inode, flow->length);
52662 + return (to_write - flow->length) ? (to_write - flow->length) : result;
52663 +}
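+
+/*
+ * Sketch of the hole-padding path above (illustrative numbers): writing
+ * 100 bytes at offset 5000 into an empty file first inserts a flow with
+ * data == NULL and length == 5000 at key offset 0, i.e. a hole stored as
+ * explicit zeros. insert_first_tail then returns 0 bytes written, which
+ * the enclosing write code is expected to treat as padding and retry.
+ */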
52664 +
52665 +/**
52666 + * append_tail
52667 + * @inode:
52668 + * @flow:
52669 + * @coord:
52670 + * @lh:
52671 + *
52672 + * Returns number of bytes written or error code.
52673 + */
52674 +static ssize_t append_tail(struct inode *inode,
52675 + flow_t *flow, coord_t *coord, lock_handle *lh)
52676 +{
52677 + int result;
52678 + reiser4_key append_key;
52679 + loff_t to_write;
52680 +
52681 + if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
52682 + flow->data = NULL;
52683 + flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
52684 + set_key_offset(&flow->key, get_key_offset(&append_key));
52685 + /*
52686 +		 * holes in files built of tails are stored as if they were real
52687 +		 * data consisting entirely of zeros. Therefore we have to
52688 + * allocate quota here as well
52689 + */
52690 + if (vfs_dq_alloc_space_nodirty(inode, flow->length))
52691 + return RETERR(-EDQUOT);
52692 + result = reiser4_insert_flow(coord, lh, flow);
52693 + if (flow->length)
52694 + vfs_dq_free_space_nodirty(inode, flow->length);
52695 + return result;
52696 + }
52697 +
52698 + /* check quota before appending data */
52699 + if (vfs_dq_alloc_space_nodirty(inode, flow->length))
52700 + return RETERR(-EDQUOT);
52701 +
52702 + to_write = flow->length;
52703 + result = reiser4_insert_flow(coord, lh, flow);
52704 + if (flow->length)
52705 + vfs_dq_free_space_nodirty(inode, flow->length);
52706 + return (to_write - flow->length) ? (to_write - flow->length) : result;
52707 +}
52708 +
52709 +/**
52710 + * write_extent_reserve_space - reserve space for tail write operation
52711 + * @inode:
52712 + *
52713 + * Estimates and reserves space which may be required for writing one flow to a
52714 + * file
52715 + */
52716 +static int write_extent_reserve_space(struct inode *inode)
52717 +{
52718 + __u64 count;
52719 + reiser4_tree *tree;
52720 +
52721 + /*
52722 + * to write one flow to a file by tails we have to reserve disk space for:
52723 +	 *
52724 +	 * 1. find_file_item may have to insert an empty node into the tree
52725 +	 * (an empty leaf node between two extent items). This requires 1
52726 +	 * block plus the number of blocks necessary to insert an internal
52727 +	 * item at the twig level.
52728 + *
52729 + * 2. flow insertion
52730 + *
52731 + * 3. stat data update
52732 + */
52733 + tree = reiser4_tree_by_inode(inode);
52734 + count = estimate_one_insert_item(tree) +
52735 + estimate_insert_flow(tree->height) +
52736 + estimate_one_insert_item(tree);
52737 + grab_space_enable();
52738 + return reiser4_grab_space(count, 0 /* flags */);
52739 +}
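+
+/*
+ * In other words, the reservation above amounts to
+ *
+ *	count = 2 * estimate_one_insert_item(tree)
+ *		+ estimate_insert_flow(tree->height);
+ *
+ * one item insertion for the possible empty leaf, the flow insertion
+ * itself, and one more item insertion for the stat data update.
+ */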
52740 +
52741 +#define PAGE_PER_FLOW 4
52742 +
52743 +static loff_t faultin_user_pages(const char __user *buf, size_t count)
52744 +{
52745 + loff_t faulted;
52746 + int to_fault;
52747 +
52748 + if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
52749 + count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
52750 + faulted = 0;
52751 + while (count > 0) {
52752 + to_fault = PAGE_CACHE_SIZE;
52753 + if (count < to_fault)
52754 + to_fault = count;
52755 + fault_in_pages_readable(buf + faulted, to_fault);
52756 + count -= to_fault;
52757 + faulted += to_fault;
52758 + }
52759 + return faulted;
52760 +}
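+
+/*
+ * Worked example (assuming PAGE_CACHE_SIZE == 4096): for count == 20000
+ * the function first caps count to PAGE_PER_FLOW * 4096 == 16384, then
+ * faults in four full pages and returns 16384, so a single flow never
+ * covers more than PAGE_PER_FLOW pages of user data.
+ */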
52761 +
52762 +/**
52763 + * reiser4_write_tail - write method of tail item plugin
52764 + * @file: file to write to
52765 + * @buf: address of user-space buffer
52766 + * @count: number of bytes to write
52767 + * @pos: position in file to write to
52768 + *
52769 + * Returns number of written bytes or error code.
52770 + */
52771 +ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
52772 + const char __user *buf, size_t count, loff_t *pos)
52773 +{
52774 + struct hint hint;
52775 + int result;
52776 + flow_t flow;
52777 + coord_t *coord;
52778 + lock_handle *lh;
52779 + znode *loaded;
52780 +
52781 + assert("edward-1548", inode != NULL);
52782 +
52783 + if (write_extent_reserve_space(inode))
52784 + return RETERR(-ENOSPC);
52785 +
52786 + result = load_file_hint(file, &hint);
52787 + BUG_ON(result != 0);
52788 +
52789 + flow.length = faultin_user_pages(buf, count);
52790 + flow.user = 1;
52791 + memcpy(&flow.data, &buf, sizeof(buf));
52792 + flow.op = WRITE_OP;
52793 + key_by_inode_and_offset_common(inode, *pos, &flow.key);
52794 +
52795 + result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
52796 + if (IS_CBKERR(result))
52797 + return result;
52798 +
52799 + coord = &hint.ext_coord.coord;
52800 + lh = hint.ext_coord.lh;
52801 +
52802 + result = zload(coord->node);
52803 + BUG_ON(result != 0);
52804 + loaded = coord->node;
52805 +
52806 + if (coord->between == AFTER_UNIT) {
52807 + /* append with data or hole */
52808 + result = append_tail(inode, &flow, coord, lh);
52809 + } else if (coord->between == AT_UNIT) {
52810 + /* overwrite */
52811 + result = overwrite_tail(&flow, coord);
52812 + } else {
52813 + /* no items of this file yet. insert data or hole */
52814 + result = insert_first_tail(inode, &flow, coord, lh);
52815 + }
52816 + zrelse(loaded);
52817 + if (result < 0) {
52818 + done_lh(lh);
52819 + return result;
52820 + }
52821 +
52822 + /* seal and unlock znode */
52823 + hint.ext_coord.valid = 0;
52824 + if (hint.ext_coord.valid)
52825 + reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
52826 + else
52827 + reiser4_unset_hint(&hint);
52828 +
52829 + save_file_hint(file, &hint);
52830 + return result;
52831 +}
52832 +
52833 +#if REISER4_DEBUG
52834 +
52835 +static int
52836 +coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
52837 +{
52838 + reiser4_key item_key;
52839 +
52840 + assert("vs-1356", coord_is_existing_unit(coord));
52841 + assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
52842 + assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
52843 + return get_key_offset(key) ==
52844 + get_key_offset(&item_key) + coord->unit_pos;
52845 +
52846 +}
52847 +
52848 +#endif
52849 +
52850 +/* plugin->u.item.s.file.read */
52851 +int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
52852 +{
52853 + unsigned count;
52854 + int item_length;
52855 + coord_t *coord;
52856 + uf_coord_t *uf_coord;
52857 +
52858 + uf_coord = &hint->ext_coord;
52859 + coord = &uf_coord->coord;
52860 +
52861 + assert("vs-571", f->user == 1);
52862 + assert("vs-571", f->data);
52863 + assert("vs-967", coord && coord->node);
52864 + assert("vs-1117", znode_is_rlocked(coord->node));
52865 + assert("vs-1118", znode_is_loaded(coord->node));
52866 +
52867 + assert("nikita-3037", reiser4_schedulable());
52868 + assert("vs-1357", coord_matches_key_tail(coord, &f->key));
52869 +
52870 + /* calculate number of bytes to read off the item */
52871 + item_length = item_length_by_coord(coord);
52872 + count = item_length_by_coord(coord) - coord->unit_pos;
52873 + if (count > f->length)
52874 + count = f->length;
52875 +
52876 + /* user page has to be brought in so that major page fault does not
52877 +	 * occur here when a long-term lock is held */
52878 + if (__copy_to_user((char __user *)f->data,
52879 + ((char *)item_body_by_coord(coord) + coord->unit_pos),
52880 + count))
52881 + return RETERR(-EFAULT);
52882 +
52883 + /* probably mark_page_accessed() should only be called if
52884 + * coord->unit_pos is zero. */
52885 + mark_page_accessed(znode_page(coord->node));
52886 + move_flow_forward(f, count);
52887 +
52888 + coord->unit_pos += count;
52889 + if (item_length == coord->unit_pos) {
52890 + coord->unit_pos--;
52891 + coord->between = AFTER_UNIT;
52892 + }
52893 + reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
52894 + return 0;
52895 +}
52896 +
52897 +/*
52898 + plugin->u.item.s.file.append_key
52899 +   key of the first byte following the last byte addressed by this item
52900 +*/
52901 +reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
52902 +{
52903 + item_key_by_coord(coord, key);
52904 + set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
52905 + return key;
52906 +}
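+
+/*
+ * Example (illustrative): a tail item with key offset 8192 and length 512
+ * yields an append key with offset 8704 -- the offset of the first byte
+ * past the end of the item.
+ */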
52907 +
52908 +/* plugin->u.item.s.file.init_coord_extension */
52909 +void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
52910 +{
52911 + uf_coord->valid = 1;
52912 +}
52913 +
52914 +/*
52915 + plugin->u.item.s.file.get_block
52916 +*/
52917 +int
52918 +get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
52919 +{
52920 + assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
52921 +
52922 + if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
52923 +		/* if the node hasn't obtained its block number yet, return 0.
52924 +		 * Let's avoid upsetting users with cosmic numbers beyond
52925 +		 * the device capacity. */
52926 + *block = 0;
52927 + else
52928 + *block = *znode_get_block(coord->node);
52929 + return 0;
52930 +}
52931 +
52932 +/*
52933 + * Local variables:
52934 + * c-indentation-style: "K&R"
52935 + * mode-name: "LC"
52936 + * c-basic-offset: 8
52937 + * tab-width: 8
52938 + * fill-column: 79
52939 + * scroll-step: 1
52940 + * End:
52941 + */
52942 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/tail.h linux-2.6.33/fs/reiser4/plugin/item/tail.h
52943 --- linux-2.6.33.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 01:00:00.000000000 +0100
52944 +++ linux-2.6.33/fs/reiser4/plugin/item/tail.h 2010-03-04 19:33:22.000000000 +0100
52945 @@ -0,0 +1,56 @@
52946 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52947 +
52948 +#if !defined( __REISER4_TAIL_H__ )
52949 +#define __REISER4_TAIL_H__
52950 +
52951 +struct tail_coord_extension {
52952 + int not_used;
52953 +};
52954 +
52955 +struct cut_list;
52956 +
52957 +/* plugin->u.item.b.* */
52958 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
52959 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
52960 + const reiser4_item_data *);
52961 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
52962 +pos_in_node_t nr_units_tail(const coord_t *);
52963 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
52964 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
52965 +int can_shift_tail(unsigned free_space, coord_t * source,
52966 + znode * target, shift_direction, unsigned *size,
52967 + unsigned want);
52968 +void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
52969 + unsigned count, shift_direction, unsigned free_space);
52970 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
52971 + struct carry_kill_data *);
52972 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52973 + struct carry_cut_data *, reiser4_key * smallest_removed,
52974 + reiser4_key * new_first);
52975 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52976 + struct carry_kill_data *, reiser4_key * smallest_removed,
52977 + reiser4_key * new_first);
52978 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
52979 +
52980 +/* plugin->u.item.s.* */
52981 +ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
52982 + const char __user *buf, size_t count, loff_t *pos);
52983 +int reiser4_read_tail(struct file *, flow_t *, hint_t *);
52984 +int readpage_tail(void *vp, struct page *page);
52985 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
52986 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
52987 +int get_block_address_tail(const coord_t *, sector_t, sector_t *);
52988 +
52989 +/* __REISER4_TAIL_H__ */
52990 +#endif
52991 +
52992 +/* Make Linus happy.
52993 + Local variables:
52994 + c-indentation-style: "K&R"
52995 + mode-name: "LC"
52996 + c-basic-offset: 8
52997 + tab-width: 8
52998 + fill-column: 120
52999 + scroll-step: 1
53000 + End:
53001 +*/
53002 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/Makefile linux-2.6.33/fs/reiser4/plugin/Makefile
53003 --- linux-2.6.33.orig/fs/reiser4/plugin/Makefile 1970-01-01 01:00:00.000000000 +0100
53004 +++ linux-2.6.33/fs/reiser4/plugin/Makefile 2010-03-04 19:33:22.000000000 +0100
53005 @@ -0,0 +1,26 @@
53006 +obj-$(CONFIG_REISER4_FS) += plugins.o
53007 +
53008 +plugins-objs := \
53009 + plugin.o \
53010 + plugin_set.o \
53011 + object.o \
53012 + inode_ops.o \
53013 + inode_ops_rename.o \
53014 + file_ops.o \
53015 + file_ops_readdir.o \
53016 + file_plugin_common.o \
53017 + dir_plugin_common.o \
53018 + digest.o \
53019 + hash.o \
53020 + fibration.o \
53021 + tail_policy.o \
53022 + regular.o
53023 +
53024 +obj-$(CONFIG_REISER4_FS) += item/
53025 +obj-$(CONFIG_REISER4_FS) += file/
53026 +obj-$(CONFIG_REISER4_FS) += dir/
53027 +obj-$(CONFIG_REISER4_FS) += node/
53028 +obj-$(CONFIG_REISER4_FS) += compress/
53029 +obj-$(CONFIG_REISER4_FS) += space/
53030 +obj-$(CONFIG_REISER4_FS) += disk_format/
53031 +obj-$(CONFIG_REISER4_FS) += security/
53032 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/node/Makefile linux-2.6.33/fs/reiser4/plugin/node/Makefile
53033 --- linux-2.6.33.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 01:00:00.000000000 +0100
53034 +++ linux-2.6.33/fs/reiser4/plugin/node/Makefile 2010-03-04 19:33:22.000000000 +0100
53035 @@ -0,0 +1,5 @@
53036 +obj-$(CONFIG_REISER4_FS) += node_plugins.o
53037 +
53038 +node_plugins-objs := \
53039 + node.o \
53040 + node40.o
53041 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/node/node40.c linux-2.6.33/fs/reiser4/plugin/node/node40.c
53042 --- linux-2.6.33.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 01:00:00.000000000 +0100
53043 +++ linux-2.6.33/fs/reiser4/plugin/node/node40.c 2010-03-04 19:33:22.000000000 +0100
53044 @@ -0,0 +1,2924 @@
53045 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53046 +
53047 +#include "../../debug.h"
53048 +#include "../../key.h"
53049 +#include "../../coord.h"
53050 +#include "../plugin_header.h"
53051 +#include "../item/item.h"
53052 +#include "node.h"
53053 +#include "node40.h"
53054 +#include "../plugin.h"
53055 +#include "../../jnode.h"
53056 +#include "../../znode.h"
53057 +#include "../../pool.h"
53058 +#include "../../carry.h"
53059 +#include "../../tap.h"
53060 +#include "../../tree.h"
53061 +#include "../../super.h"
53062 +#include "../../reiser4.h"
53063 +
53064 +#include <asm/uaccess.h>
53065 +#include <linux/types.h>
53066 +#include <linux/prefetch.h>
53067 +
53068 +/* leaf 40 format:
53069 +
53070 + [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ]
53071 + plugin_id (16) key
53072 + free_space (16) pluginid (16)
53073 + free_space_start (16) offset (16)
53074 + level (8)
53075 + num_items (16)
53076 + magic (32)
53077 + flush_time (32)
53078 +*/
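+/*
+ * Illustrative layout (numbers are examples only): in a 4096-byte node
+ * holding two items, item bodies grow upward from sizeof(node40_header),
+ * while item_header40 entries grow downward from offset 4096; item_head 1
+ * sits at 4096 - 2 * sizeof(item_header40) with item_head 0 right above
+ * it, and free space is the region between the end of item 1's body and
+ * item_head 1.
+ */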
53079 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
53080 +/* magic number that is stored in ->magic field of node header */
53081 +static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
53082 +
53083 +static int prepare_for_update(znode * left, znode * right,
53084 + carry_plugin_info * info);
53085 +
53086 +/* header of node of reiser40 format is at the beginning of node */
53087 +static inline node40_header *node40_node_header(const znode * node /* node to
53088 + * query */ )
53089 +{
53090 + assert("nikita-567", node != NULL);
53091 + assert("nikita-568", znode_page(node) != NULL);
53092 + assert("nikita-569", zdata(node) != NULL);
53093 + return (node40_header *) zdata(node);
53094 +}
53095 +
53096 +/* functions to get/set fields of node40_header */
53097 +#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
53098 +#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
53099 +#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
53100 +#define nh40_get_level(nh) get_unaligned(&(nh)->level)
53101 +#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
53102 +#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
53103 +
53104 +#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
53105 +#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
53106 +#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
53107 +#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
53108 +#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
53109 +#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
53110 +
53111 +/* plugin field of node header should be read/set by
53112 + plugin_by_disk_id/save_disk_plugin */
53113 +
53114 +/* array of item headers is at the end of node */
53115 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
53116 +{
53117 + return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
53118 +}
53119 +
53120 +/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
53121 + */
53122 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
53123 +{
53124 + return (item_header40 *) (zdata(coord->node) +
53125 + znode_size(coord->node)) - (coord->item_pos) -
53126 + 1;
53127 +}
53128 +
53129 +/* functions to get/set fields of item_header40 */
53130 +#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
53131 +
53132 +#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
53133 +
53134 +/* plugin field of item header should be read/set by
53135 + plugin_by_disk_id/save_disk_plugin */
53136 +
53137 +/* plugin methods */
53138 +
53139 +/* plugin->u.node.item_overhead
53140 + look for description of this method in plugin/node/node.h */
53141 +size_t
53142 +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
53143 +{
53144 + return sizeof(item_header40);
53145 +}
53146 +
53147 +/* plugin->u.node.free_space
53148 + look for description of this method in plugin/node/node.h */
53149 +size_t free_space_node40(znode * node)
53150 +{
53151 + assert("nikita-577", node != NULL);
53152 + assert("nikita-578", znode_is_loaded(node));
53153 + assert("nikita-579", zdata(node) != NULL);
53154 +
53155 + return nh40_get_free_space(node40_node_header(node));
53156 +}
53157 +
53158 +/* private inline version of node40_num_of_items() for use in this file. This
53159 + is necessary, because address of node40_num_of_items() is taken and it is
53160 + never inlined as a result. */
53161 +static inline short node40_num_of_items_internal(const znode * node)
53162 +{
53163 + return nh40_get_num_items(node40_node_header(node));
53164 +}
53165 +
53166 +#if REISER4_DEBUG
53167 +static inline void check_num_items(const znode * node)
53168 +{
53169 + assert("nikita-2749",
53170 + node40_num_of_items_internal(node) == node->nr_items);
53171 + assert("nikita-2746", znode_is_write_locked(node));
53172 +}
53173 +#else
53174 +#define check_num_items(node) noop
53175 +#endif
53176 +
53177 +/* plugin->u.node.num_of_items
53178 + look for description of this method in plugin/node/node.h */
53179 +int num_of_items_node40(const znode * node)
53180 +{
53181 + return node40_num_of_items_internal(node);
53182 +}
53183 +
53184 +static void
53185 +node40_set_num_items(znode * node, node40_header * nh, unsigned value)
53186 +{
53187 + assert("nikita-2751", node != NULL);
53188 + assert("nikita-2750", nh == node40_node_header(node));
53189 +
53190 + check_num_items(node);
53191 + nh40_set_num_items(nh, value);
53192 + node->nr_items = value;
53193 + check_num_items(node);
53194 +}
53195 +
53196 +/* plugin->u.node.item_by_coord
53197 + look for description of this method in plugin/node/node.h */
53198 +char *item_by_coord_node40(const coord_t * coord)
53199 +{
53200 + item_header40 *ih;
53201 + char *p;
53202 +
53203 + /* @coord is set to existing item */
53204 + assert("nikita-596", coord != NULL);
53205 + assert("vs-255", coord_is_existing_item(coord));
53206 +
53207 + ih = node40_ih_at_coord(coord);
53208 + p = zdata(coord->node) + ih40_get_offset(ih);
53209 + return p;
53210 +}
53211 +
53212 +/* plugin->u.node.length_by_coord
53213 + look for description of this method in plugin/node/node.h */
53214 +int length_by_coord_node40(const coord_t * coord)
53215 +{
53216 + item_header40 *ih;
53217 + int result;
53218 +
53219 + /* @coord is set to existing item */
53220 + assert("vs-256", coord != NULL);
53221 + assert("vs-257", coord_is_existing_item(coord));
53222 +
53223 + ih = node40_ih_at_coord(coord);
53224 + if ((int)coord->item_pos ==
53225 + node40_num_of_items_internal(coord->node) - 1)
53226 + result =
53227 + nh40_get_free_space_start(node40_node_header(coord->node)) -
53228 + ih40_get_offset(ih);
53229 + else
53230 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
53231 +
53232 + return result;
53233 +}
53234 +
53235 +static pos_in_node_t
53236 +node40_item_length(const znode * node, pos_in_node_t item_pos)
53237 +{
53238 + item_header40 *ih;
53239 + pos_in_node_t result;
53240 +
53241 + /* @coord is set to existing item */
53242 + assert("vs-256", node != NULL);
53243 + assert("vs-257", node40_num_of_items_internal(node) > item_pos);
53244 +
53245 + ih = node40_ih_at(node, item_pos);
53246 + if (item_pos == node40_num_of_items_internal(node) - 1)
53247 + result =
53248 + nh40_get_free_space_start(node40_node_header(node)) -
53249 + ih40_get_offset(ih);
53250 + else
53251 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
53252 +
53253 + return result;
53254 +}
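+
+/*
+ * Worked example (illustrative offsets): with item bodies starting at
+ * offsets 40, 300 and 500 and free_space_start == 700, item 0 has length
+ * 300 - 40 == 260, item 1 has 500 - 300 == 200, and the last item gets
+ * free_space_start - 500 == 200; item lengths are never stored, only
+ * derived from neighboring offsets.
+ */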
53255 +
53256 +/* plugin->u.node.plugin_by_coord
53257 + look for description of this method in plugin/node/node.h */
53258 +item_plugin *plugin_by_coord_node40(const coord_t * coord)
53259 +{
53260 + item_header40 *ih;
53261 + item_plugin *result;
53262 +
53263 + /* @coord is set to existing item */
53264 + assert("vs-258", coord != NULL);
53265 + assert("vs-259", coord_is_existing_item(coord));
53266 +
53267 + ih = node40_ih_at_coord(coord);
53268 +	/* pass NULL instead of the current tree. This is a time-critical call. */
53269 + result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
53270 + return result;
53271 +}
53272 +
53273 +/* plugin->u.node.key_at
53274 + look for description of this method in plugin/node/node.h */
53275 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
53276 +{
53277 + item_header40 *ih;
53278 +
53279 + assert("nikita-1765", coord_is_existing_item(coord));
53280 +
53281 + /* @coord is set to existing item */
53282 + ih = node40_ih_at_coord(coord);
53283 + memcpy(key, &ih->key, sizeof(reiser4_key));
53284 + return key;
53285 +}
53286 +
53287 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
53288 +
53289 +#define NODE_INCSTAT(n, counter) \
53290 + reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
53291 +
53292 +#define NODE_ADDSTAT(n, counter, val) \
53293 + reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
53294 +
53295 +/* plugin->u.node.lookup
53296 + look for description of this method in plugin/node/node.h */
53297 +node_search_result lookup_node40(znode * node /* node to query */ ,
53298 + const reiser4_key * key /* key to look for */ ,
53299 + lookup_bias bias /* search bias */ ,
53300 + coord_t * coord /* resulting coord */ )
53301 +{
53302 + int left;
53303 + int right;
53304 + int found;
53305 + int items;
53306 +
53307 + item_header40 *lefth;
53308 + item_header40 *righth;
53309 +
53310 + item_plugin *iplug;
53311 + item_header40 *bstop;
53312 + item_header40 *ih;
53313 + cmp_t order;
53314 +
53315 + assert("nikita-583", node != NULL);
53316 + assert("nikita-584", key != NULL);
53317 + assert("nikita-585", coord != NULL);
53318 + assert("nikita-2693", znode_is_any_locked(node));
53319 + cassert(REISER4_SEQ_SEARCH_BREAK > 2);
53320 +
53321 + items = node_num_items(node);
53322 +
53323 + if (unlikely(items == 0)) {
53324 + coord_init_first_unit(coord, node);
53325 + return NS_NOT_FOUND;
53326 + }
53327 +
53328 + /* binary search for item that can contain given key */
53329 + left = 0;
53330 + right = items - 1;
53331 + coord->node = node;
53332 + coord_clear_iplug(coord);
53333 + found = 0;
53334 +
53335 + lefth = node40_ih_at(node, left);
53336 + righth = node40_ih_at(node, right);
53337 +
53338 + /* It is known that for small arrays sequential search is on average
53339 + more efficient than binary. This is because sequential search is
53340 +	   coded as a tight loop that can be better optimized by compilers and
53341 + for small array size gain from this optimization makes sequential
53342 + search the winner. Another, maybe more important, reason for this,
53343 +	   is that sequential access is more CPU-cache friendly, whereas binary
53344 + search effectively destroys CPU caching.
53345 +
53346 + Critical here is the notion of "smallness". Reasonable value of
53347 + REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
53348 + fs/reiser4/ulevel/ulevel.c:test_search().
53349 +
53350 + Don't try to further optimize sequential search by scanning from
53351 +	   right to left in an attempt to use a more efficient loop termination
53352 + condition (comparison with 0). This doesn't work.
53353 +
53354 + */
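+
+	/*
+	 * Illustrative run (the REISER4_SEQ_SEARCH_BREAK value below is a
+	 * made-up example, not taken from the source): with items == 100 and
+	 * a break value of 3, the binary search narrows [left, right] until
+	 * right - left < 3, and the sequential scan that follows finds the
+	 * exact slot within that window.
+	 */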
53355 +
53356 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
53357 + int median;
53358 + item_header40 *medianh;
53359 +
53360 + median = (left + right) / 2;
53361 + medianh = node40_ih_at(node, median);
53362 +
53363 + assert("nikita-1084", median >= 0);
53364 + assert("nikita-1085", median < items);
53365 + switch (keycmp(key, &medianh->key)) {
53366 + case LESS_THAN:
53367 + right = median;
53368 + righth = medianh;
53369 + break;
53370 + default:
53371 + wrong_return_value("nikita-586", "keycmp");
53372 + case GREATER_THAN:
53373 + left = median;
53374 + lefth = medianh;
53375 + break;
53376 + case EQUAL_TO:
53377 + do {
53378 + --median;
53379 + /* headers are ordered from right to left */
53380 + ++medianh;
53381 + } while (median >= 0 && keyeq(key, &medianh->key));
53382 + right = left = median + 1;
53383 + ih = lefth = righth = medianh - 1;
53384 + found = 1;
53385 + break;
53386 + }
53387 + }
53388 + /* sequential scan. Item headers, and, therefore, keys are stored at
53389 + the rightmost part of a node from right to left. We are trying to
53390 + access memory from left to right, and hence, scan in _descending_
53391 + order of item numbers.
53392 + */
53393 + if (!found) {
53394 + for (left = right, ih = righth; left >= 0; ++ih, --left) {
53395 + cmp_t comparison;
53396 +
53397 + prefetchkey(&(ih + 1)->key);
53398 + comparison = keycmp(&ih->key, key);
53399 + if (comparison == GREATER_THAN)
53400 + continue;
53401 + if (comparison == EQUAL_TO) {
53402 + found = 1;
53403 + do {
53404 + --left;
53405 + ++ih;
53406 + } while (left >= 0 && keyeq(&ih->key, key));
53407 + ++left;
53408 + --ih;
53409 + } else {
53410 + assert("nikita-1256", comparison == LESS_THAN);
53411 + }
53412 + break;
53413 + }
53414 + if (unlikely(left < 0))
53415 + left = 0;
53416 + }
53417 +
53418 + assert("nikita-3212", right >= left);
53419 + assert("nikita-3214",
53420 + equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
53421 +
53422 + coord_set_item_pos(coord, left);
53423 + coord->unit_pos = 0;
53424 + coord->between = AT_UNIT;
53425 +
53426 +	/* key < leftmost key in the node, or the node is corrupted and keys
53427 + are not sorted */
53428 + bstop = node40_ih_at(node, (unsigned)left);
53429 + order = keycmp(&bstop->key, key);
53430 + if (unlikely(order == GREATER_THAN)) {
53431 + if (unlikely(left != 0)) {
53432 + /* screw up */
53433 + warning("nikita-587", "Key less than %i key in a node",
53434 + left);
53435 + reiser4_print_key("key", key);
53436 + reiser4_print_key("min", &bstop->key);
53437 + print_coord_content("coord", coord);
53438 + return RETERR(-EIO);
53439 + } else {
53440 + coord->between = BEFORE_UNIT;
53441 + return NS_NOT_FOUND;
53442 + }
53443 + }
53444 + /* left <= key, ok */
53445 + iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
53446 +
53447 + if (unlikely(iplug == NULL)) {
53448 + warning("nikita-588", "Unknown plugin %i",
53449 + le16_to_cpu(get_unaligned(&bstop->plugin_id)));
53450 + reiser4_print_key("key", key);
53451 + print_coord_content("coord", coord);
53452 + return RETERR(-EIO);
53453 + }
53454 +
53455 + coord_set_iplug(coord, iplug);
53456 +
53457 + /* if exact key from item header was found by binary search, no
53458 + further checks are necessary. */
53459 + if (found) {
53460 + assert("nikita-1259", order == EQUAL_TO);
53461 + return NS_FOUND;
53462 + }
53463 + if (iplug->b.max_key_inside != NULL) {
53464 + reiser4_key max_item_key;
53465 +
53466 + /* key > max_item_key --- outside of an item */
53467 + if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
53468 + coord->unit_pos = 0;
53469 + coord->between = AFTER_ITEM;
53470 +			/* FIXME-VS: the key we are looking for does not fit
53471 +			   into the found item, so return NS_NOT_FOUND. Without
53472 +			   that the following case does not work: there is an
53473 +			   extent of file 10000, 10001. File 10000, 10002 has
53474 +			   just been created. When writing to position 0 in that
53475 +			   file, traverse_tree would stop here on the twig
53476 +			   level, whereas we want it to go down to the leaf level
53477 + */
53478 + return NS_NOT_FOUND;
53479 + }
53480 + }
53481 +
53482 + if (iplug->b.lookup != NULL) {
53483 + return iplug->b.lookup(key, bias, coord);
53484 + } else {
53485 + assert("nikita-1260", order == LESS_THAN);
53486 + coord->between = AFTER_UNIT;
53487 + return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
53488 + }
53489 +}
53490 +
53491 +#undef NODE_ADDSTAT
53492 +#undef NODE_INCSTAT
53493 +
53494 +/* plugin->u.node.estimate
53495 + look for description of this method in plugin/node/node.h */
53496 +size_t estimate_node40(znode * node)
53497 +{
53498 + size_t result;
53499 +
53500 + assert("nikita-597", node != NULL);
53501 +
53502 + result = free_space_node40(node) - sizeof(item_header40);
53503 +
53504 + return (result > 0) ? result : 0;
53505 +}
53506 +
53507 +/* plugin->u.node.check
53508 + look for description of this method in plugin/node/node.h */
53509 +int check_node40(const znode * node /* node to check */ ,
53510 + __u32 flags /* check flags */ ,
53511 + const char **error /* where to store error message */ )
53512 +{
53513 + int nr_items;
53514 + int i;
53515 + reiser4_key prev;
53516 + unsigned old_offset;
53517 + tree_level level;
53518 + coord_t coord;
53519 + int result;
53520 +
53521 + assert("nikita-580", node != NULL);
53522 + assert("nikita-581", error != NULL);
53523 + assert("nikita-2948", znode_is_loaded(node));
53524 +
53525 + if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
53526 + return 0;
53527 +
53528 + assert("nikita-582", zdata(node) != NULL);
53529 +
53530 + nr_items = node40_num_of_items_internal(node);
53531 + if (nr_items < 0) {
53532 + *error = "Negative number of items";
53533 + return -1;
53534 + }
53535 +
53536 + if (flags & REISER4_NODE_DKEYS)
53537 + prev = *znode_get_ld_key((znode *) node);
53538 + else
53539 + prev = *reiser4_min_key();
53540 +
53541 + old_offset = 0;
53542 + coord_init_zero(&coord);
53543 + coord.node = (znode *) node;
53544 + coord.unit_pos = 0;
53545 + coord.between = AT_UNIT;
53546 + level = znode_get_level(node);
53547 + for (i = 0; i < nr_items; i++) {
53548 + item_header40 *ih;
53549 + reiser4_key unit_key;
53550 + unsigned j;
53551 +
53552 + ih = node40_ih_at(node, (unsigned)i);
53553 + coord_set_item_pos(&coord, i);
53554 + if ((ih40_get_offset(ih) >=
53555 + znode_size(node) - nr_items * sizeof(item_header40)) ||
53556 + (ih40_get_offset(ih) < sizeof(node40_header))) {
53557 + *error = "Offset is out of bounds";
53558 + return -1;
53559 + }
53560 + if (ih40_get_offset(ih) <= old_offset) {
53561 + *error = "Offsets are in wrong order";
53562 + return -1;
53563 + }
53564 + if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
53565 + *error = "Wrong offset of first item";
53566 + return -1;
53567 + }
53568 + old_offset = ih40_get_offset(ih);
53569 +
53570 + if (keygt(&prev, &ih->key)) {
53571 + *error = "Keys are in wrong order";
53572 + return -1;
53573 + }
53574 + if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
53575 + *error = "Wrong key of first unit";
53576 + return -1;
53577 + }
53578 + prev = ih->key;
53579 + for (j = 0; j < coord_num_units(&coord); ++j) {
53580 + coord.unit_pos = j;
53581 + unit_key_by_coord(&coord, &unit_key);
53582 + if (keygt(&prev, &unit_key)) {
53583 + *error = "Unit keys are in wrong order";
53584 + return -1;
53585 + }
53586 + prev = unit_key;
53587 + }
53588 + coord.unit_pos = 0;
53589 + if (level != TWIG_LEVEL && item_is_extent(&coord)) {
53590 + *error = "extent on the wrong level";
53591 + return -1;
53592 + }
53593 + if (level == LEAF_LEVEL && item_is_internal(&coord)) {
53594 + *error = "internal item on the wrong level";
53595 + return -1;
53596 + }
53597 + if (level != LEAF_LEVEL &&
53598 + !item_is_internal(&coord) && !item_is_extent(&coord)) {
53599 + *error = "wrong item on the internal level";
53600 + return -1;
53601 + }
53602 + if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
53603 + *error = "non-internal item on the internal level";
53604 + return -1;
53605 + }
53606 +#if REISER4_DEBUG
53607 + if (item_plugin_by_coord(&coord)->b.check
53608 + && item_plugin_by_coord(&coord)->b.check(&coord, error))
53609 + return -1;
53610 +#endif
53611 + if (i) {
53612 + coord_t prev_coord;
53613 +			/* two neighboring items cannot be mergeable */
53614 + coord_dup(&prev_coord, &coord);
53615 + coord_prev_item(&prev_coord);
53616 + if (are_items_mergeable(&prev_coord, &coord)) {
53617 + *error = "mergeable items in one node";
53618 + return -1;
53619 + }
53620 +
53621 + }
53622 + }
53623 +
53624 + if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
53625 + coord_t coord;
53626 + item_plugin *iplug;
53627 +
53628 + coord_init_last_unit(&coord, node);
53629 + iplug = item_plugin_by_coord(&coord);
53630 + if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
53631 + iplug->s.file.append_key != NULL) {
53632 + reiser4_key mkey;
53633 +
53634 + iplug->s.file.append_key(&coord, &mkey);
53635 + set_key_offset(&mkey, get_key_offset(&mkey) - 1);
53636 + read_lock_dk(current_tree);
53637 + result = keygt(&mkey, znode_get_rd_key((znode *) node));
53638 + read_unlock_dk(current_tree);
53639 + if (result) {
53640 + *error = "key of rightmost item is too large";
53641 + return -1;
53642 + }
53643 + }
53644 + }
53645 + if (flags & REISER4_NODE_DKEYS) {
53646 + read_lock_tree(current_tree);
53647 + read_lock_dk(current_tree);
53648 +
53649 + flags |= REISER4_NODE_TREE_STABLE;
53650 +
53651 + if (keygt(&prev, znode_get_rd_key((znode *) node))) {
53652 + if (flags & REISER4_NODE_TREE_STABLE) {
53653 + *error = "Last key is greater than rdkey";
53654 + read_unlock_dk(current_tree);
53655 + read_unlock_tree(current_tree);
53656 + return -1;
53657 + }
53658 + }
53659 + if (keygt
53660 + (znode_get_ld_key((znode *) node),
53661 + znode_get_rd_key((znode *) node))) {
53662 + *error = "ldkey is greater than rdkey";
53663 + read_unlock_dk(current_tree);
53664 + read_unlock_tree(current_tree);
53665 + return -1;
53666 + }
53667 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
53668 + (node->left != NULL) &&
53669 + !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
53670 + ergo(flags & REISER4_NODE_TREE_STABLE,
53671 + !keyeq(znode_get_rd_key(node->left),
53672 + znode_get_ld_key((znode *) node)))
53673 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53674 + keygt(znode_get_rd_key(node->left),
53675 + znode_get_ld_key((znode *) node)))) {
53676 + *error = "left rdkey or ldkey is wrong";
53677 + read_unlock_dk(current_tree);
53678 + read_unlock_tree(current_tree);
53679 + return -1;
53680 + }
53681 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
53682 + (node->right != NULL) &&
53683 + !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
53684 + ergo(flags & REISER4_NODE_TREE_STABLE,
53685 + !keyeq(znode_get_rd_key((znode *) node),
53686 + znode_get_ld_key(node->right)))
53687 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53688 + keygt(znode_get_rd_key((znode *) node),
53689 + znode_get_ld_key(node->right)))) {
53690 + *error = "rdkey or right ldkey is wrong";
53691 + read_unlock_dk(current_tree);
53692 + read_unlock_tree(current_tree);
53693 + return -1;
53694 + }
53695 +
53696 + read_unlock_dk(current_tree);
53697 + read_unlock_tree(current_tree);
53698 + }
53699 +
53700 + return 0;
53701 +}
53702 +
53703 +/* plugin->u.node.parse
53704 + look for description of this method in plugin/node/node.h */
53705 +int parse_node40(znode * node /* node to parse */ )
53706 +{
53707 + node40_header *header;
53708 + int result;
53709 + d8 level;
53710 +
53711 + header = node40_node_header((znode *) node);
53712 + result = -EIO;
53713 + level = nh40_get_level(header);
53714 + if (unlikely(((__u8) znode_get_level(node)) != level))
53715 + warning("nikita-494", "Wrong level found in node: %i != %i",
53716 + znode_get_level(node), level);
53717 + else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
53718 + warning("nikita-495",
53719 + "Wrong magic in tree node: want %x, got %x",
53720 + REISER4_NODE_MAGIC, nh40_get_magic(header));
53721 + else {
53722 + node->nr_items = node40_num_of_items_internal(node);
53723 + result = 0;
53724 + }
53725 + return RETERR(result);
53726 +}
53727 +
53728 +/* plugin->u.node.init
53729 + look for description of this method in plugin/node/node.h */
53730 +int init_node40(znode * node /* node to initialise */ )
53731 +{
53732 + node40_header *header;
53733 +
53734 + assert("nikita-570", node != NULL);
53735 + assert("nikita-572", zdata(node) != NULL);
53736 +
53737 + header = node40_node_header(node);
53738 + memset(header, 0, sizeof(node40_header));
53739 + nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
53740 + nh40_set_free_space_start(header, sizeof(node40_header));
53741 + /* sane hypothesis: 0 in CPU format is 0 in disk format */
53742 + /* items: 0 */
53743 + save_plugin_id(node_plugin_to_plugin(node->nplug),
53744 + &header->common_header.plugin_id);
53745 + nh40_set_level(header, znode_get_level(node));
53746 + nh40_set_magic(header, REISER4_NODE_MAGIC);
53747 + node->nr_items = 0;
53748 + nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
53749 +
53750 + /* flags: 0 */
53751 + return 0;
53752 +}
53753 +
53754 +#ifdef GUESS_EXISTS
53755 +int guess_node40(const znode * node /* node to guess plugin of */ )
53756 +{
53757 + node40_header *nethack;
53758 +
53759 + assert("nikita-1058", node != NULL);
53760 + nethack = node40_node_header(node);
53761 + return
53762 + (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
53763 + (plugin_by_disk_id(znode_get_tree(node),
53764 + REISER4_NODE_PLUGIN_TYPE,
53765 + &nethack->common_header.plugin_id)->h.id ==
53766 + NODE40_ID);
53767 +}
53768 +#endif
53769 +
53770 +/* plugin->u.node.change_item_size
53771 + look for description of this method in plugin/node/node.h */
53772 +void change_item_size_node40(coord_t * coord, int by)
53773 +{
53774 + node40_header *nh;
53775 + item_header40 *ih;
53776 + char *item_data;
53777 + int item_length;
53778 + unsigned i;
53779 +
53780 + /* make sure that @item is coord of existing item */
53781 + assert("vs-210", coord_is_existing_item(coord));
53782 +
53783 + nh = node40_node_header(coord->node);
53784 +
53785 + item_data = item_by_coord_node40(coord);
53786 + item_length = length_by_coord_node40(coord);
53787 +
53788 + /* move item bodies */
53789 + ih = node40_ih_at_coord(coord);
53790 + memmove(item_data + item_length + by, item_data + item_length,
53791 + nh40_get_free_space_start(node40_node_header(coord->node)) -
53792 + (ih40_get_offset(ih) + item_length));
53793 +
53794 + /* update offsets of moved items */
53795 + for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
53796 + ih = node40_ih_at(coord->node, i);
53797 + ih40_set_offset(ih, ih40_get_offset(ih) + by);
53798 + }
53799 +
53800 + /* update node header */
53801 + nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
53802 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
53803 +}
53804 +
53805 +static int should_notify_parent(const znode * node)
53806 +{
53807 + /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
53808 + return !disk_addr_eq(znode_get_block(node),
53809 + &znode_get_tree(node)->root_block);
53810 +}
53811 +
53812 +/* plugin->u.node.create_item
53813 + look for description of this method in plugin/node/node.h */
53814 +int
53815 +create_item_node40(coord_t *target, const reiser4_key *key,
53816 + reiser4_item_data *data, carry_plugin_info *info)
53817 +{
53818 + node40_header *nh;
53819 + item_header40 *ih;
53820 + unsigned offset;
53821 + unsigned i;
53822 +
53823 + nh = node40_node_header(target->node);
53824 +
53825 + assert("vs-212", coord_is_between_items(target));
53826 + /* node must have enough free space */
53827 + assert("vs-254",
53828 + free_space_node40(target->node) >=
53829 + data->length + sizeof(item_header40));
53830 + assert("vs-1410", data->length >= 0);
53831 +
53832 + if (coord_set_to_right(target))
53833 + /* there are not items to the right of @target, so, new item
53834 + will be inserted after last one */
53835 + coord_set_item_pos(target, nh40_get_num_items(nh));
53836 +
53837 + if (target->item_pos < nh40_get_num_items(nh)) {
53838 + /* there are items to be moved to prepare space for new
53839 + item */
53840 + ih = node40_ih_at_coord(target);
53841 + /* new item will start at this offset */
53842 + offset = ih40_get_offset(ih);
53843 +
53844 + memmove(zdata(target->node) + offset + data->length,
53845 + zdata(target->node) + offset,
53846 + nh40_get_free_space_start(nh) - offset);
53847 + /* update headers of moved items */
53848 + for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
53849 + ih = node40_ih_at(target->node, i);
53850 + ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
53851 + }
53852 +
53853 + /* @ih is set to item header of the last item, move item headers */
53854 + memmove(ih - 1, ih,
53855 + sizeof(item_header40) * (nh40_get_num_items(nh) -
53856 + target->item_pos));
53857 + } else {
53858 + /* new item will start at this offset */
53859 + offset = nh40_get_free_space_start(nh);
53860 + }
53861 +
53862 + /* make item header for the new item */
53863 + ih = node40_ih_at_coord(target);
53864 + memcpy(&ih->key, key, sizeof(reiser4_key));
53865 + ih40_set_offset(ih, offset);
53866 + save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
53867 +
53868 + /* update node header */
53869 + nh40_set_free_space(nh,
53870 + nh40_get_free_space(nh) - data->length -
53871 + sizeof(item_header40));
53872 + nh40_set_free_space_start(nh,
53873 + nh40_get_free_space_start(nh) + data->length);
53874 + node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
53875 +
53876 +	/* FIXME: check how create_item works when between is set to BEFORE_UNIT */
53877 + target->unit_pos = 0;
53878 + target->between = AT_UNIT;
53879 + coord_clear_iplug(target);
53880 +
53881 + /* initialize item */
53882 + if (data->iplug->b.init != NULL) {
53883 + data->iplug->b.init(target, NULL, data);
53884 + }
53885 + /* copy item body */
53886 + if (data->iplug->b.paste != NULL) {
53887 + data->iplug->b.paste(target, data, info);
53888 + } else if (data->data != NULL) {
53889 + if (data->user) {
53890 +			/* AUDIT: Should we really not check that the pointer
53891 +			   from userspace was valid and the data bytes were
53892 + available? How will we return -EFAULT of some kind
53893 + without this check? */
53894 + assert("nikita-3038", reiser4_schedulable());
53895 + /* copy data from user space */
53896 + __copy_from_user(zdata(target->node) + offset,
53897 + (const char __user *)data->data,
53898 + (unsigned)data->length);
53899 + } else
53900 + /* copy from kernel space */
53901 + memcpy(zdata(target->node) + offset, data->data,
53902 + (unsigned)data->length);
53903 + }
53904 +
53905 + if (target->item_pos == 0) {
53906 + /* left delimiting key has to be updated */
53907 + prepare_for_update(NULL, target->node, info);
53908 + }
53909 +
53910 + if (item_plugin_by_coord(target)->b.create_hook != NULL) {
53911 + item_plugin_by_coord(target)->b.create_hook(target, data->arg);
53912 + }
53913 +
53914 + return 0;
53915 +}
53916 +
53917 +/* plugin->u.node.update_item_key
53918 + look for description of this method in plugin/node/node.h */
53919 +void
53920 +update_item_key_node40(coord_t * target, const reiser4_key * key,
53921 + carry_plugin_info * info)
53922 +{
53923 + item_header40 *ih;
53924 +
53925 + ih = node40_ih_at_coord(target);
53926 + memcpy(&ih->key, key, sizeof(reiser4_key));
53927 +
53928 + if (target->item_pos == 0) {
53929 + prepare_for_update(NULL, target->node, info);
53930 + }
53931 +}
53932 +
53933 +/* these bits encode the cut mode */
53934 +#define CMODE_TAIL 1
53935 +#define CMODE_WHOLE 2
53936 +#define CMODE_HEAD 4
53937 +
53938 +struct cut40_info {
53939 + int mode;
53940 + pos_in_node_t tail_removed; /* position of item which gets tail removed */
53941 +	pos_in_node_t first_removed;	/* position of the leftmost item among items removed completely */
53942 + pos_in_node_t removed_count; /* number of items removed completely */
53943 + pos_in_node_t head_removed; /* position of item which gets head removed */
53944 +
53945 + pos_in_node_t freed_space_start;
53946 + pos_in_node_t freed_space_end;
53947 + pos_in_node_t first_moved;
53948 + pos_in_node_t head_removed_location;
53949 +};
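+
+/*
+ * Example (illustrative): a cut covering the tail of item 2, all of items
+ * 3 and 4, and the head of item 5 would set mode to
+ * CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD with tail_removed == 2,
+ * first_removed == 3, removed_count == 2 and head_removed == 5.
+ */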
53950 +
53951 +static void init_cinfo(struct cut40_info *cinfo)
53952 +{
53953 + cinfo->mode = 0;
53954 + cinfo->tail_removed = MAX_POS_IN_NODE;
53955 + cinfo->first_removed = MAX_POS_IN_NODE;
53956 + cinfo->removed_count = MAX_POS_IN_NODE;
53957 + cinfo->head_removed = MAX_POS_IN_NODE;
53958 + cinfo->freed_space_start = MAX_POS_IN_NODE;
53959 + cinfo->freed_space_end = MAX_POS_IN_NODE;
53960 + cinfo->first_moved = MAX_POS_IN_NODE;
53961 + cinfo->head_removed_location = MAX_POS_IN_NODE;
53962 +}
53963 +
53964 +/* complete cut_node40/kill_node40 by removing the gap created by the removal */
53965 +static void compact(znode * node, struct cut40_info *cinfo)
53966 +{
53967 + node40_header *nh;
53968 + item_header40 *ih;
53969 + pos_in_node_t freed;
53970 + pos_in_node_t pos, nr_items;
53971 +
53972 + assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
53973 + cinfo->freed_space_end != MAX_POS_IN_NODE &&
53974 + cinfo->first_moved != MAX_POS_IN_NODE));
53975 + assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
53976 +
53977 + nh = node40_node_header(node);
53978 + nr_items = nh40_get_num_items(nh);
53979 +
53980 +	/* remove the gap left by the removal */
53981 + memmove(zdata(node) + cinfo->freed_space_start,
53982 + zdata(node) + cinfo->freed_space_end,
53983 + nh40_get_free_space_start(nh) - cinfo->freed_space_end);
53984 +
53985 + /* update item headers of moved items - change their locations */
53986 + pos = cinfo->first_moved;
53987 + ih = node40_ih_at(node, pos);
53988 + if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
53989 + assert("vs-1580", pos == cinfo->head_removed);
53990 + ih40_set_offset(ih, cinfo->head_removed_location);
53991 + pos++;
53992 + ih--;
53993 + }
53994 +
53995 + freed = cinfo->freed_space_end - cinfo->freed_space_start;
53996 + for (; pos < nr_items; pos++, ih--) {
53997 + assert("vs-1581", ih == node40_ih_at(node, pos));
53998 + ih40_set_offset(ih, ih40_get_offset(ih) - freed);
53999 + }
54000 +
54001 +	/* free space start moved to left */
54002 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
54003 +
54004 + if (cinfo->removed_count != MAX_POS_IN_NODE) {
54005 + /* number of items changed. Remove item headers of those items */
54006 + ih = node40_ih_at(node, nr_items - 1);
54007 + memmove(ih + cinfo->removed_count, ih,
54008 + sizeof(item_header40) * (nr_items -
54009 + cinfo->removed_count -
54010 + cinfo->first_removed));
54011 + freed += sizeof(item_header40) * cinfo->removed_count;
54012 + node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
54013 + }
54014 +
54015 + /* total amount of free space increased */
54016 + nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
54017 +}
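+
+/* Editor's sketch of the node40 layout assumed by compact() (not from the
+   original sources): item bodies grow up from the node header while item
+   headers grow down from the end of the node,
+
+	[node40_header | body0 body1 ... free space ... ih_N ... ih_1 ih_0]
+
+   so one memmove() closes the gap in the bodies, the ih-- walk lowers the
+   offsets of the moved items by @freed, and a second memmove() closes the
+   gap in the header array when whole items were removed */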
54018 +
54019 +int shrink_item_node40(coord_t * coord, int delta)
54020 +{
54021 + node40_header *nh;
54022 + item_header40 *ih;
54023 + pos_in_node_t pos;
54024 + pos_in_node_t nr_items;
54025 + char *end;
54026 + znode *node;
54027 + int off;
54028 +
54029 + assert("nikita-3487", coord != NULL);
54030 + assert("nikita-3488", delta >= 0);
54031 +
54032 + node = coord->node;
54033 + nh = node40_node_header(node);
54034 + nr_items = nh40_get_num_items(nh);
54035 +
54036 + ih = node40_ih_at_coord(coord);
54037 + assert("nikita-3489", delta <= length_by_coord_node40(coord));
54038 + off = ih40_get_offset(ih) + length_by_coord_node40(coord);
54039 + end = zdata(node) + off;
54040 +
54041 +	/* remove the gap left by the removal */
54042 + memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
54043 +
54044 + /* update item headers of moved items - change their locations */
54045 + pos = coord->item_pos + 1;
54046 + ih = node40_ih_at(node, pos);
54047 + for (; pos < nr_items; pos++, ih--) {
54048 + assert("nikita-3490", ih == node40_ih_at(node, pos));
54049 + ih40_set_offset(ih, ih40_get_offset(ih) - delta);
54050 + }
54051 +
54052 + /* free space start moved to left */
54053 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
54054 + /* total amount of free space increased */
54055 + nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
54056 + /*
54057 +	 * This method does _not_ change the number of items. Hence, it cannot
54058 +	 * make the node empty. Also it does not remove items at all, which means
54059 + * that no keys have to be updated either.
54060 + */
54061 + return 0;
54062 +}
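+
+/* Editor's usage sketch (assumed caller, not from the original sources): an
+   item plugin that has dropped @delta bytes from the end of the item at
+   @coord would call
+
+	shrink_item_node40(coord, delta);
+
+   to slide the following item bodies left and fix up their header offsets */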
54063 +
54064 +/* this is used by cut_node40 and kill_node40. It analyses the input parameters and calculates the cut mode. There are
54065 +   two types of cut. The first is when units are removed from the middle of an item; in this case this function returns
54066 +   1. Everything else fits into the second case: 0 or 1 items getting their tail cut, 0 or more items removed
54067 +   completely, and 0 or 1 item getting its head cut. The function returns 0 in this case */
54068 +static int
54069 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
54070 +{
54071 + reiser4_key left_key, right_key;
54072 + reiser4_key min_from_key, max_to_key;
54073 + const reiser4_key *from_key, *to_key;
54074 +
54075 + init_cinfo(cinfo);
54076 +
54077 + /* calculate minimal key stored in first item of items to be cut (params->from) */
54078 + item_key_by_coord(params->from, &min_from_key);
54079 + /* and max key stored in last item of items to be cut (params->to) */
54080 + max_item_key_by_coord(params->to, &max_to_key);
54081 +
54082 + /* if cut key range is not defined in input parameters - define it using cut coord range */
54083 + if (params->from_key == NULL) {
54084 + assert("vs-1513", params->to_key == NULL);
54085 + unit_key_by_coord(params->from, &left_key);
54086 + from_key = &left_key;
54087 + max_unit_key_by_coord(params->to, &right_key);
54088 + to_key = &right_key;
54089 + } else {
54090 + from_key = params->from_key;
54091 + to_key = params->to_key;
54092 + }
54093 +
54094 + if (params->from->item_pos == params->to->item_pos) {
54095 + if (keylt(&min_from_key, from_key)
54096 + && keylt(to_key, &max_to_key))
54097 + return 1;
54098 +
54099 + if (keygt(from_key, &min_from_key)) {
54100 +			/* tail of item is to be cut */
54101 + cinfo->tail_removed = params->from->item_pos;
54102 + cinfo->mode |= CMODE_TAIL;
54103 + } else if (keylt(to_key, &max_to_key)) {
54104 + /* head of item is to be cut */
54105 + cinfo->head_removed = params->from->item_pos;
54106 + cinfo->mode |= CMODE_HEAD;
54107 + } else {
54108 + /* item is removed completely */
54109 + cinfo->first_removed = params->from->item_pos;
54110 + cinfo->removed_count = 1;
54111 + cinfo->mode |= CMODE_WHOLE;
54112 + }
54113 + } else {
54114 + cinfo->first_removed = params->from->item_pos + 1;
54115 + cinfo->removed_count =
54116 + params->to->item_pos - params->from->item_pos - 1;
54117 +
54118 + if (keygt(from_key, &min_from_key)) {
54119 + /* first item is not cut completely */
54120 + cinfo->tail_removed = params->from->item_pos;
54121 + cinfo->mode |= CMODE_TAIL;
54122 + } else {
54123 + cinfo->first_removed--;
54124 + cinfo->removed_count++;
54125 + }
54126 + if (keylt(to_key, &max_to_key)) {
54127 + /* last item is not cut completely */
54128 + cinfo->head_removed = params->to->item_pos;
54129 + cinfo->mode |= CMODE_HEAD;
54130 + } else {
54131 + cinfo->removed_count++;
54132 + }
54133 + if (cinfo->removed_count)
54134 + cinfo->mode |= CMODE_WHOLE;
54135 + }
54136 +
54137 + return 0;
54138 +}
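+
+/* Editor's note: a return value of 1 means the cut is strictly interior to a
+   single item (both the item's first and last keys survive), so no CMODE_*
+   bit is set and prepare_for_compact() below calls the kill/cut units method
+   directly instead of going through the mode switch */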
54139 +
54140 +static void
54141 +call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
54142 + carry_kill_data * kdata)
54143 +{
54144 + coord_t coord;
54145 + item_plugin *iplug;
54146 + pos_in_node_t pos;
54147 +
54148 + coord.node = node;
54149 + coord.unit_pos = 0;
54150 + coord.between = AT_UNIT;
54151 + for (pos = 0; pos < count; pos++) {
54152 + coord_set_item_pos(&coord, from + pos);
54153 + coord.unit_pos = 0;
54154 + coord.between = AT_UNIT;
54155 + iplug = item_plugin_by_coord(&coord);
54156 + if (iplug->b.kill_hook) {
54157 + iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
54158 + kdata);
54159 + }
54160 + }
54161 +}
54162 +
54163 +/* this is used to kill item partially */
54164 +static pos_in_node_t
54165 +kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
54166 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
54167 +{
54168 + struct carry_kill_data *kdata;
54169 + item_plugin *iplug;
54170 +
54171 + kdata = data;
54172 + iplug = item_plugin_by_coord(coord);
54173 +
54174 + assert("vs-1524", iplug->b.kill_units);
54175 + return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
54176 + new_first_key);
54177 +}
54178 +
54179 +/* call item plugin to cut the tail of an item */
54180 +static pos_in_node_t
54181 +kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
54182 +{
54183 + struct carry_kill_data *kdata;
54184 + pos_in_node_t to;
54185 +
54186 + kdata = data;
54187 + to = coord_last_unit_pos(coord);
54188 + return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
54189 + NULL);
54190 +}
54191 +
54192 +/* call item plugin to cut head of item */
54193 +static pos_in_node_t
54194 +kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
54195 + reiser4_key * new_first_key)
54196 +{
54197 + return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
54198 + new_first_key);
54199 +}
54200 +
54201 +/* this is used to cut item partially */
54202 +static pos_in_node_t
54203 +cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
54204 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
54205 +{
54206 + carry_cut_data *cdata;
54207 + item_plugin *iplug;
54208 +
54209 + cdata = data;
54210 + iplug = item_plugin_by_coord(coord);
54211 + assert("vs-302", iplug->b.cut_units);
54212 + return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
54213 + new_first_key);
54214 +}
54215 +
54216 +/* call item plugin to cut the tail of an item */
54217 +static pos_in_node_t
54218 +cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
54219 +{
54220 + carry_cut_data *cdata;
54221 + pos_in_node_t to;
54222 +
54223 + cdata = data;
54224 + to = coord_last_unit_pos(cdata->params.from);
54225 + return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
54226 +}
54227 +
54228 +/* call item plugin to cut head of item */
54229 +static pos_in_node_t
54230 +cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
54231 + reiser4_key * new_first_key)
54232 +{
54233 + return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
54234 + new_first_key);
54235 +}
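+
+/* Editor's note: the kill_* and cut_* wrappers above differ only in the type
+   of the opaque @data they forward (carry_kill_data vs carry_cut_data) and
+   in the item plugin method they dispatch to (->kill_units vs ->cut_units);
+   prepare_for_compact() selects one family or the other through function
+   pointers */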
54236 +
54237 +/* this returns 1 if the key of the first item changed, 0 if it did not */
54238 +static int
54239 +prepare_for_compact(struct cut40_info *cinfo,
54240 + const struct cut_kill_params *params, int is_cut,
54241 + void *data, carry_plugin_info * info)
54242 +{
54243 + znode *node;
54244 + item_header40 *ih;
54245 + pos_in_node_t freed;
54246 + pos_in_node_t item_pos;
54247 + coord_t coord;
54248 + reiser4_key new_first_key;
54249 + pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
54250 + void *, reiser4_key *, reiser4_key *);
54251 + pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
54252 + pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
54253 + reiser4_key *);
54254 + int retval;
54255 +
54256 + retval = 0;
54257 +
54258 + node = params->from->node;
54259 +
54260 + assert("vs-184", node == params->to->node);
54261 + assert("vs-312", !node_is_empty(node));
54262 + assert("vs-297",
54263 + coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
54264 +
54265 + if (is_cut) {
54266 + kill_units_f = cut_units;
54267 + kill_tail_f = cut_tail;
54268 + kill_head_f = cut_head;
54269 + } else {
54270 + kill_units_f = kill_units;
54271 + kill_tail_f = kill_tail;
54272 + kill_head_f = kill_head;
54273 + }
54274 +
54275 + if (parse_cut(cinfo, params) == 1) {
54276 + /* cut from the middle of item */
54277 + freed =
54278 + kill_units_f(params->from, params->from->unit_pos,
54279 + params->to->unit_pos, data,
54280 + params->smallest_removed, NULL);
54281 +
54282 + item_pos = params->from->item_pos;
54283 + ih = node40_ih_at(node, item_pos);
54284 + cinfo->freed_space_start =
54285 + ih40_get_offset(ih) + node40_item_length(node,
54286 + item_pos) - freed;
54287 + cinfo->freed_space_end = cinfo->freed_space_start + freed;
54288 + cinfo->first_moved = item_pos + 1;
54289 + } else {
54290 + assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
54291 + cinfo->first_removed != MAX_POS_IN_NODE ||
54292 + cinfo->head_removed != MAX_POS_IN_NODE));
54293 +
54294 + switch (cinfo->mode) {
54295 + case CMODE_TAIL:
54296 + /* one item gets cut partially from its end */
54297 + assert("vs-1562",
54298 + cinfo->tail_removed == params->from->item_pos);
54299 +
54300 + freed =
54301 + kill_tail_f(params->from, data,
54302 + params->smallest_removed);
54303 +
54304 + item_pos = cinfo->tail_removed;
54305 + ih = node40_ih_at(node, item_pos);
54306 + cinfo->freed_space_start =
54307 + ih40_get_offset(ih) + node40_item_length(node,
54308 + item_pos) -
54309 + freed;
54310 + cinfo->freed_space_end =
54311 + cinfo->freed_space_start + freed;
54312 + cinfo->first_moved = cinfo->tail_removed + 1;
54313 + break;
54314 +
54315 + case CMODE_WHOLE:
54316 + /* one or more items get removed completely */
54317 + assert("vs-1563",
54318 + cinfo->first_removed == params->from->item_pos);
54319 + assert("vs-1564", cinfo->removed_count > 0
54320 + && cinfo->removed_count != MAX_POS_IN_NODE);
54321 +
54322 + /* call kill hook for all items removed completely */
54323 + if (is_cut == 0)
54324 + call_kill_hooks(node, cinfo->first_removed,
54325 + cinfo->removed_count, data);
54326 +
54327 + item_pos = cinfo->first_removed;
54328 + ih = node40_ih_at(node, item_pos);
54329 +
54330 + if (params->smallest_removed)
54331 + memcpy(params->smallest_removed, &ih->key,
54332 + sizeof(reiser4_key));
54333 +
54334 + cinfo->freed_space_start = ih40_get_offset(ih);
54335 +
54336 + item_pos += (cinfo->removed_count - 1);
54337 + ih -= (cinfo->removed_count - 1);
54338 + cinfo->freed_space_end =
54339 + ih40_get_offset(ih) + node40_item_length(node,
54340 + item_pos);
54341 + cinfo->first_moved = item_pos + 1;
54342 + if (cinfo->first_removed == 0)
54343 + /* key of first item of the node changes */
54344 + retval = 1;
54345 + break;
54346 +
54347 + case CMODE_HEAD:
54348 + /* one item gets cut partially from its head */
54349 + assert("vs-1565",
54350 + cinfo->head_removed == params->from->item_pos);
54351 +
54352 + freed =
54353 + kill_head_f(params->to, data,
54354 + params->smallest_removed,
54355 + &new_first_key);
54356 +
54357 + item_pos = cinfo->head_removed;
54358 + ih = node40_ih_at(node, item_pos);
54359 + cinfo->freed_space_start = ih40_get_offset(ih);
54360 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
54361 + cinfo->first_moved = cinfo->head_removed + 1;
54362 +
54363 + /* item head is removed, therefore, item key changed */
54364 + coord.node = node;
54365 + coord_set_item_pos(&coord, item_pos);
54366 + coord.unit_pos = 0;
54367 + coord.between = AT_UNIT;
54368 + update_item_key_node40(&coord, &new_first_key, NULL);
54369 + if (item_pos == 0)
54370 + /* key of first item of the node changes */
54371 + retval = 1;
54372 + break;
54373 +
54374 + case CMODE_TAIL | CMODE_WHOLE:
54375 + /* one item gets cut from its end and one or more items get removed completely */
54376 + assert("vs-1566",
54377 + cinfo->tail_removed == params->from->item_pos);
54378 + assert("vs-1567",
54379 + cinfo->first_removed == cinfo->tail_removed + 1);
54380 + assert("vs-1564", cinfo->removed_count > 0
54381 + && cinfo->removed_count != MAX_POS_IN_NODE);
54382 +
54383 + freed =
54384 + kill_tail_f(params->from, data,
54385 + params->smallest_removed);
54386 +
54387 + item_pos = cinfo->tail_removed;
54388 + ih = node40_ih_at(node, item_pos);
54389 + cinfo->freed_space_start =
54390 + ih40_get_offset(ih) + node40_item_length(node,
54391 + item_pos) -
54392 + freed;
54393 +
54394 + /* call kill hook for all items removed completely */
54395 + if (is_cut == 0)
54396 + call_kill_hooks(node, cinfo->first_removed,
54397 + cinfo->removed_count, data);
54398 +
54399 + item_pos += cinfo->removed_count;
54400 + ih -= cinfo->removed_count;
54401 + cinfo->freed_space_end =
54402 + ih40_get_offset(ih) + node40_item_length(node,
54403 + item_pos);
54404 + cinfo->first_moved = item_pos + 1;
54405 + break;
54406 +
54407 + case CMODE_WHOLE | CMODE_HEAD:
54408 + /* one or more items get removed completely and one item gets cut partially from its head */
54409 + assert("vs-1568",
54410 + cinfo->first_removed == params->from->item_pos);
54411 + assert("vs-1564", cinfo->removed_count > 0
54412 + && cinfo->removed_count != MAX_POS_IN_NODE);
54413 + assert("vs-1569",
54414 + cinfo->head_removed ==
54415 + cinfo->first_removed + cinfo->removed_count);
54416 +
54417 + /* call kill hook for all items removed completely */
54418 + if (is_cut == 0)
54419 + call_kill_hooks(node, cinfo->first_removed,
54420 + cinfo->removed_count, data);
54421 +
54422 + item_pos = cinfo->first_removed;
54423 + ih = node40_ih_at(node, item_pos);
54424 +
54425 + if (params->smallest_removed)
54426 + memcpy(params->smallest_removed, &ih->key,
54427 + sizeof(reiser4_key));
54428 +
54429 + freed =
54430 + kill_head_f(params->to, data, NULL, &new_first_key);
54431 +
54432 + cinfo->freed_space_start = ih40_get_offset(ih);
54433 +
54434 + ih = node40_ih_at(node, cinfo->head_removed);
54435 +			/* this is the most complex case. The item whose head was removed and the items to be
54436 +			   moved intact change their locations differently. */
54437 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
54438 + cinfo->first_moved = cinfo->head_removed;
54439 + cinfo->head_removed_location = cinfo->freed_space_start;
54440 +
54441 + /* item head is removed, therefore, item key changed */
54442 + coord.node = node;
54443 + coord_set_item_pos(&coord, cinfo->head_removed);
54444 + coord.unit_pos = 0;
54445 + coord.between = AT_UNIT;
54446 + update_item_key_node40(&coord, &new_first_key, NULL);
54447 +
54448 + assert("vs-1579", cinfo->first_removed == 0);
54449 + /* key of first item of the node changes */
54450 + retval = 1;
54451 + break;
54452 +
54453 + case CMODE_TAIL | CMODE_HEAD:
54454 +			/* one item gets cut from its end and its neighbor gets cut from its head */
54455 + impossible("vs-1576", "this can not happen currently");
54456 + break;
54457 +
54458 + case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
54459 + impossible("vs-1577", "this can not happen currently");
54460 + break;
54461 + default:
54462 + impossible("vs-1578", "unexpected cut mode");
54463 + break;
54464 + }
54465 + }
54466 + return retval;
54467 +}
54468 +
54469 +/* plugin->u.node.kill
54470 + return value is number of items removed completely */
54471 +int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
54472 +{
54473 + znode *node;
54474 + struct cut40_info cinfo;
54475 + int first_key_changed;
54476 +
54477 + node = kdata->params.from->node;
54478 +
54479 + first_key_changed =
54480 + prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
54481 + info);
54482 + compact(node, &cinfo);
54483 +
54484 + if (info) {
54485 + /* it is not called by node40_shift, so we have to take care
54486 + of changes on upper levels */
54487 + if (node_is_empty(node)
54488 + && !(kdata->flags & DELETE_RETAIN_EMPTY))
54489 +			/* all contents of the node are deleted */
54490 + prepare_removal_node40(node, info);
54491 + else if (first_key_changed) {
54492 + prepare_for_update(NULL, node, info);
54493 + }
54494 + }
54495 +
54496 + coord_clear_iplug(kdata->params.from);
54497 + coord_clear_iplug(kdata->params.to);
54498 +
54499 + znode_make_dirty(node);
54500 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54501 +}
54502 +
54503 +/* plugin->u.node.cut
54504 + return value is number of items removed completely */
54505 +int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
54506 +{
54507 + znode *node;
54508 + struct cut40_info cinfo;
54509 + int first_key_changed;
54510 +
54511 + node = cdata->params.from->node;
54512 +
54513 + first_key_changed =
54514 +	    prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
54515 + info);
54516 + compact(node, &cinfo);
54517 +
54518 + if (info) {
54519 + /* it is not called by node40_shift, so we have to take care
54520 + of changes on upper levels */
54521 + if (node_is_empty(node))
54522 +			/* all contents of the node are deleted */
54523 + prepare_removal_node40(node, info);
54524 + else if (first_key_changed) {
54525 + prepare_for_update(NULL, node, info);
54526 + }
54527 + }
54528 +
54529 + coord_clear_iplug(cdata->params.from);
54530 + coord_clear_iplug(cdata->params.to);
54531 +
54532 + znode_make_dirty(node);
54533 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54534 +}
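+
+/* Editor's usage sketch (assumed caller, not from the original sources):
+
+	struct carry_cut_data cdata;
+
+	cdata.params.from = &from_coord;
+	cdata.params.to = &to_coord;
+	cdata.params.from_key = NULL;	(keys are derived from the coords)
+	cdata.params.to_key = NULL;
+	cdata.params.smallest_removed = NULL;
+	removed = cut_node40(&cdata, info);
+
+   kill_node40() is used instead when kill hooks have to run, e.g. for items
+   that reference disk blocks which must be released; delete_copied() below
+   is an in-tree example of this calling pattern */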
54535 +
54536 +/* this structure is used by shift method of node40 plugin */
54537 +struct shift_params {
54538 + shift_direction pend; /* when @pend == append - we are shifting to
54539 + left, when @pend == prepend - to right */
54540 + coord_t wish_stop; /* when shifting to left this is last unit we
54541 + want shifted, when shifting to right - this
54542 + is set to unit we want to start shifting
54543 + from */
54544 + znode *target;
54545 + int everything; /* it is set to 1 if everything we have to shift is
54546 + shifted, 0 - otherwise */
54547 +
54548 + /* FIXME-VS: get rid of read_stop */
54549 +
54550 + /* these are set by estimate_shift */
54551 + coord_t real_stop; /* this will be set to last unit which will be
54552 + really shifted */
54553 +
54554 + /* coordinate in source node before operation of unit which becomes
54555 +	   first after shift to left or last after shift to right */
54556 + union {
54557 + coord_t future_first;
54558 + coord_t future_last;
54559 + } u;
54560 +
54561 + unsigned merging_units; /* number of units of first item which have to
54562 + be merged with last item of target node */
54563 + unsigned merging_bytes; /* number of bytes in those units */
54564 +
54565 + unsigned entire; /* items shifted in their entirety */
54566 + unsigned entire_bytes; /* number of bytes in those items */
54567 +
54568 + unsigned part_units; /* number of units of partially copied item */
54569 + unsigned part_bytes; /* number of bytes in those units */
54570 +
54571 + unsigned shift_bytes; /* total number of bytes in items shifted (item
54572 + headers not included) */
54573 +
54574 +};
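+
+/* Editor's note: for a shift to the left @pend is SHIFT_LEFT and @wish_stop
+   is the last unit the caller would like moved; estimate_shift() fills in
+   @real_stop, @merging_units/@merging_bytes (units merged into the boundary
+   item of @target), @entire/@entire_bytes (items moved in their entirety)
+   and @part_units/@part_bytes (part of one more item that still fits) */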
54575 +
54576 +static int item_creation_overhead(coord_t *item)
54577 +{
54578 + return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
54579 +}
54580 +
54581 +/* how many units are there in @source starting from source->unit_pos
54582 + but not further than @stop_coord */
54583 +static int
54584 +wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
54585 +{
54586 + if (pend == SHIFT_LEFT) {
54587 + assert("vs-181", source->unit_pos == 0);
54588 + } else {
54589 + assert("vs-182",
54590 + source->unit_pos == coord_last_unit_pos(source));
54591 + }
54592 +
54593 + if (source->item_pos != stop_coord->item_pos) {
54594 + /* @source and @stop_coord are different items */
54595 + return coord_last_unit_pos(source) + 1;
54596 + }
54597 +
54598 + if (pend == SHIFT_LEFT) {
54599 + return stop_coord->unit_pos + 1;
54600 + } else {
54601 + return source->unit_pos - stop_coord->unit_pos + 1;
54602 + }
54603 +}
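+
+/* Editor's example (not from the original sources): when shifting left with
+   @stop_coord at unit 3 of the same item, wanted_units() returns 4, i.e.
+   units 0..3 inclusive; when @stop_coord is in a different item, the whole
+   unit count of @source is returned */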
54604 +
54605 +/* this calculates what can be copied from @shift->wish_stop.node to
54606 + @shift->target */
54607 +static void
54608 +estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
54609 +{
54610 + unsigned target_free_space, size;
54611 + pos_in_node_t stop_item; /* item which estimating should not consider */
54612 + unsigned want; /* number of units of item we want shifted */
54613 + coord_t source; /* item being estimated */
54614 + item_plugin *iplug;
54615 +
54616 + /* shifting to left/right starts from first/last units of
54617 + @shift->wish_stop.node */
54618 + if (shift->pend == SHIFT_LEFT) {
54619 + coord_init_first_unit(&source, shift->wish_stop.node);
54620 + } else {
54621 + coord_init_last_unit(&source, shift->wish_stop.node);
54622 + }
54623 + shift->real_stop = source;
54624 +
54625 +	/* free space in target node */
54626 + target_free_space = znode_free_space(shift->target);
54627 +
54628 + shift->everything = 0;
54629 + if (!node_is_empty(shift->target)) {
54630 +		/* target node is not empty, check whether boundary
54631 +		   items are mergeable */
54632 + coord_t to;
54633 +
54634 + /* item we try to merge @source with */
54635 + if (shift->pend == SHIFT_LEFT) {
54636 + coord_init_last_unit(&to, shift->target);
54637 + } else {
54638 + coord_init_first_unit(&to, shift->target);
54639 + }
54640 +
54641 + if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
54642 + &source) :
54643 + are_items_mergeable(&source, &to)) {
54644 + /* how many units of @source do we want to merge to
54645 + item @to */
54646 + want =
54647 + wanted_units(&source, &shift->wish_stop,
54648 + shift->pend);
54649 +
54650 + /* how many units of @source we can merge to item
54651 + @to */
54652 + iplug = item_plugin_by_coord(&source);
54653 + if (iplug->b.can_shift != NULL)
54654 + shift->merging_units =
54655 + iplug->b.can_shift(target_free_space,
54656 + &source, shift->target,
54657 + shift->pend, &size,
54658 + want);
54659 + else {
54660 + shift->merging_units = 0;
54661 + size = 0;
54662 + }
54663 + shift->merging_bytes = size;
54664 + shift->shift_bytes += size;
54665 + /* update stop coord to be set to last unit of @source
54666 + we can merge to @target */
54667 + if (shift->merging_units)
54668 + /* at least one unit can be shifted */
54669 + shift->real_stop.unit_pos =
54670 + (shift->merging_units - source.unit_pos -
54671 + 1) * shift->pend;
54672 + else {
54673 + /* nothing can be shifted */
54674 + if (shift->pend == SHIFT_LEFT)
54675 + coord_init_before_first_item(&shift->
54676 + real_stop,
54677 + source.
54678 + node);
54679 + else
54680 + coord_init_after_last_item(&shift->
54681 + real_stop,
54682 + source.node);
54683 + }
54684 + assert("nikita-2081", shift->real_stop.unit_pos + 1);
54685 +
54686 + if (shift->merging_units != want) {
54687 +				/* we could not copy as many units as we
54688 +				   wanted, so there is no reason to continue
54689 +				   estimating */
54690 + return;
54691 + }
54692 +
54693 + target_free_space -= size;
54694 + coord_add_item_pos(&source, shift->pend);
54695 + }
54696 + }
54697 +
54698 +	/* position of the item nothing of which we want to shift */
54699 + stop_item = shift->wish_stop.item_pos + shift->pend;
54700 +
54701 + /* calculate how many items can be copied into given free
54702 + space as whole */
54703 + for (; source.item_pos != stop_item;
54704 + coord_add_item_pos(&source, shift->pend)) {
54705 + if (shift->pend == SHIFT_RIGHT)
54706 + source.unit_pos = coord_last_unit_pos(&source);
54707 +
54708 + /* how many units of @source do we want to copy */
54709 + want = wanted_units(&source, &shift->wish_stop, shift->pend);
54710 +
54711 + if (want == coord_last_unit_pos(&source) + 1) {
54712 + /* we want this item to be copied entirely */
54713 + size =
54714 + item_length_by_coord(&source) +
54715 + item_creation_overhead(&source);
54716 + if (size <= target_free_space) {
54717 + /* item fits into target node as whole */
54718 + target_free_space -= size;
54719 + shift->shift_bytes +=
54720 + size - item_creation_overhead(&source);
54721 + shift->entire_bytes +=
54722 + size - item_creation_overhead(&source);
54723 + shift->entire++;
54724 +
54725 + /* update shift->real_stop coord to be set to
54726 + last unit of @source we can merge to
54727 + @target */
54728 + shift->real_stop = source;
54729 + if (shift->pend == SHIFT_LEFT)
54730 + shift->real_stop.unit_pos =
54731 + coord_last_unit_pos(&shift->
54732 + real_stop);
54733 + else
54734 + shift->real_stop.unit_pos = 0;
54735 + continue;
54736 + }
54737 + }
54738 +
54739 + /* we reach here only for an item which does not fit into
54740 + target node in its entirety. This item may be either
54741 + partially shifted, or not shifted at all. We will have to
54742 +		   create a new item in target node, so decrease the amount of free
54743 + space by an item creation overhead. We can reach here also
54744 + if stop coord is in this item */
54745 + if (target_free_space >=
54746 + (unsigned)item_creation_overhead(&source)) {
54747 + target_free_space -= item_creation_overhead(&source);
54748 + iplug = item_plugin_by_coord(&source);
54749 + if (iplug->b.can_shift) {
54750 + shift->part_units = iplug->b.can_shift(target_free_space,
54751 + &source,
54752 + NULL, /* target */
54753 + shift->pend,
54754 + &size,
54755 + want);
54756 + } else {
54757 + target_free_space = 0;
54758 + shift->part_units = 0;
54759 + size = 0;
54760 + }
54761 + } else {
54762 + target_free_space = 0;
54763 + shift->part_units = 0;
54764 + size = 0;
54765 + }
54766 + shift->part_bytes = size;
54767 + shift->shift_bytes += size;
54768 +
54769 + /* set @shift->real_stop to last unit of @source we can merge
54770 + to @shift->target */
54771 + if (shift->part_units) {
54772 + shift->real_stop = source;
54773 + shift->real_stop.unit_pos =
54774 + (shift->part_units - source.unit_pos -
54775 + 1) * shift->pend;
54776 + assert("nikita-2082", shift->real_stop.unit_pos + 1);
54777 + }
54778 +
54779 + if (want != shift->part_units)
54780 +			/* not everything wanted was shifted */
54781 + return;
54782 + break;
54783 + }
54784 +
54785 + shift->everything = 1;
54786 +}
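+
+/* Editor's summary: estimate_shift() accounts for the shift in three
+   buckets: units merged into an existing boundary item of @target
+   (merging_units), items copied in their entirety (entire), and part of one
+   more item (part_units); copy() below replays exactly these three steps
+   against the target node */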
54787 +
54788 +static void
54789 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
54790 + shift_direction dir, unsigned free_space)
54791 +{
54792 + item_plugin *iplug;
54793 +
54794 + assert("nikita-1463", target != NULL);
54795 + assert("nikita-1464", source != NULL);
54796 + assert("nikita-1465", from + count <= coord_num_units(source));
54797 +
54798 + iplug = item_plugin_by_coord(source);
54799 + assert("nikita-1468", iplug == item_plugin_by_coord(target));
54800 + iplug->b.copy_units(target, source, from, count, dir, free_space);
54801 +
54802 + if (dir == SHIFT_RIGHT) {
54803 +		/* FIXME-VS: this looks unnecessary. update_item_key was
54804 +		   already called by the copy_units method */
54805 + reiser4_key split_key;
54806 +
54807 + assert("nikita-1469", target->unit_pos == 0);
54808 +
54809 + unit_key_by_coord(target, &split_key);
54810 + node_plugin_by_coord(target)->update_item_key(target,
54811 + &split_key, NULL);
54812 + }
54813 +}
54814 +
54815 +/* copy part of @shift->real_stop.node starting either from its beginning or
54816 + from its end and ending at @shift->real_stop to either the end or the
54817 + beginning of @shift->target */
54818 +static void copy(struct shift_params *shift)
54819 +{
54820 + node40_header *nh;
54821 + coord_t from;
54822 + coord_t to;
54823 + item_header40 *from_ih, *to_ih;
54824 + int free_space_start;
54825 + int new_items;
54826 + unsigned old_items;
54827 + int old_offset;
54828 + unsigned i;
54829 +
54830 + nh = node40_node_header(shift->target);
54831 + free_space_start = nh40_get_free_space_start(nh);
54832 + old_items = nh40_get_num_items(nh);
54833 + new_items = shift->entire + (shift->part_units ? 1 : 0);
54834 + assert("vs-185",
54835 + shift->shift_bytes ==
54836 + shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
54837 +
54838 + from = shift->wish_stop;
54839 +
54840 + coord_init_first_unit(&to, shift->target);
54841 +
54842 + /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
54843 + hence to.between is set to EMPTY_NODE above. Looks like we want it
54844 + to be AT_UNIT.
54845 +
54846 + Oh, wonders of ->betweeness...
54847 +
54848 + */
54849 + to.between = AT_UNIT;
54850 +
54851 + if (shift->pend == SHIFT_LEFT) {
54852 + /* copying to left */
54853 +
54854 + coord_set_item_pos(&from, 0);
54855 + from_ih = node40_ih_at(from.node, 0);
54856 +
54857 + coord_set_item_pos(&to,
54858 + node40_num_of_items_internal(to.node) - 1);
54859 + if (shift->merging_units) {
54860 + /* expand last item, so that plugin methods will see
54861 + correct data */
54862 + free_space_start += shift->merging_bytes;
54863 + nh40_set_free_space_start(nh,
54864 + (unsigned)free_space_start);
54865 + nh40_set_free_space(nh,
54866 + nh40_get_free_space(nh) -
54867 + shift->merging_bytes);
54868 +
54869 + /* appending last item of @target */
54870 + copy_units(&to, &from, 0, /* starting from 0-th unit */
54871 + shift->merging_units, SHIFT_LEFT,
54872 + shift->merging_bytes);
54873 + coord_inc_item_pos(&from);
54874 + from_ih--;
54875 + coord_inc_item_pos(&to);
54876 + }
54877 +
54878 + to_ih = node40_ih_at(shift->target, old_items);
54879 + if (shift->entire) {
54880 + /* copy @entire items entirely */
54881 +
54882 + /* copy item headers */
54883 + memcpy(to_ih - shift->entire + 1,
54884 + from_ih - shift->entire + 1,
54885 + shift->entire * sizeof(item_header40));
54886 + /* update item header offset */
54887 + old_offset = ih40_get_offset(from_ih);
54888 + /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
54889 + for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
54890 + ih40_set_offset(to_ih,
54891 + ih40_get_offset(from_ih) -
54892 + old_offset + free_space_start);
54893 +
54894 + /* copy item bodies */
54895 + memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
54896 + shift->entire_bytes);
54897 +
54898 + coord_add_item_pos(&from, (int)shift->entire);
54899 + coord_add_item_pos(&to, (int)shift->entire);
54900 + }
54901 +
54902 + nh40_set_free_space_start(nh,
54903 + free_space_start +
54904 + shift->shift_bytes -
54905 + shift->merging_bytes);
54906 + nh40_set_free_space(nh,
54907 + nh40_get_free_space(nh) -
54908 + (shift->shift_bytes - shift->merging_bytes +
54909 + sizeof(item_header40) * new_items));
54910 +
54911 + /* update node header */
54912 + node40_set_num_items(shift->target, nh, old_items + new_items);
54913 + assert("vs-170",
54914 + nh40_get_free_space(nh) < znode_size(shift->target));
54915 +
54916 + if (shift->part_units) {
54917 + /* copy heading part (@part units) of @source item as
54918 + a new item into @target->node */
54919 +
54920 + /* copy item header of partially copied item */
54921 + coord_set_item_pos(&to,
54922 + node40_num_of_items_internal(to.node)
54923 + - 1);
54924 + memcpy(to_ih, from_ih, sizeof(item_header40));
54925 + ih40_set_offset(to_ih,
54926 + nh40_get_free_space_start(nh) -
54927 + shift->part_bytes);
54928 + if (item_plugin_by_coord(&to)->b.init)
54929 + item_plugin_by_coord(&to)->b.init(&to, &from,
54930 + NULL);
54931 + copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
54932 + shift->part_bytes);
54933 + }
54934 +
54935 + } else {
54936 + /* copying to right */
54937 +
54938 + coord_set_item_pos(&from,
54939 + node40_num_of_items_internal(from.node) - 1);
54940 + from_ih = node40_ih_at_coord(&from);
54941 +
54942 + coord_set_item_pos(&to, 0);
54943 +
54944 + /* prepare space for new items */
54945 + memmove(zdata(to.node) + sizeof(node40_header) +
54946 + shift->shift_bytes,
54947 + zdata(to.node) + sizeof(node40_header),
54948 + free_space_start - sizeof(node40_header));
54949 + /* update item headers of moved items */
54950 + to_ih = node40_ih_at(to.node, 0);
54951 + /* first item gets @merging_bytes longer. free space appears
54952 + at its beginning */
54953 + if (!node_is_empty(to.node))
54954 + ih40_set_offset(to_ih,
54955 + ih40_get_offset(to_ih) +
54956 + shift->shift_bytes -
54957 + shift->merging_bytes);
54958 +
54959 + for (i = 1; i < old_items; i++)
54960 + ih40_set_offset(to_ih - i,
54961 + ih40_get_offset(to_ih - i) +
54962 + shift->shift_bytes);
54963 +
54964 + /* move item headers to make space for new items */
54965 + memmove(to_ih - old_items + 1 - new_items,
54966 + to_ih - old_items + 1,
54967 + sizeof(item_header40) * old_items);
54968 + to_ih -= (new_items - 1);
54969 +
54970 + nh40_set_free_space_start(nh,
54971 + free_space_start +
54972 + shift->shift_bytes);
54973 + nh40_set_free_space(nh,
54974 + nh40_get_free_space(nh) -
54975 + (shift->shift_bytes +
54976 + sizeof(item_header40) * new_items));
54977 +
54978 + /* update node header */
54979 + node40_set_num_items(shift->target, nh, old_items + new_items);
54980 + assert("vs-170",
54981 + nh40_get_free_space(nh) < znode_size(shift->target));
54982 +
54983 + if (shift->merging_units) {
54984 + coord_add_item_pos(&to, new_items);
54985 + to.unit_pos = 0;
54986 + to.between = AT_UNIT;
54987 + /* prepend first item of @to */
54988 + copy_units(&to, &from,
54989 + coord_last_unit_pos(&from) -
54990 + shift->merging_units + 1,
54991 + shift->merging_units, SHIFT_RIGHT,
54992 + shift->merging_bytes);
54993 + coord_dec_item_pos(&from);
54994 + from_ih++;
54995 + }
54996 +
54997 + if (shift->entire) {
54998 + /* copy @entire items entirely */
54999 +
55000 + /* copy item headers */
55001 + memcpy(to_ih, from_ih,
55002 + shift->entire * sizeof(item_header40));
55003 +
55004 + /* update item header offset */
55005 + old_offset =
55006 + ih40_get_offset(from_ih + shift->entire - 1);
55007 + /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
55008 + for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
55009 + ih40_set_offset(to_ih,
55010 + ih40_get_offset(from_ih) -
55011 + old_offset +
55012 + sizeof(node40_header) +
55013 + shift->part_bytes);
55014 + /* copy item bodies */
55015 + coord_add_item_pos(&from, -(int)(shift->entire - 1));
55016 + memcpy(zdata(to.node) + sizeof(node40_header) +
55017 + shift->part_bytes, item_by_coord_node40(&from),
55018 + shift->entire_bytes);
55019 + coord_dec_item_pos(&from);
55020 + }
55021 +
55022 + if (shift->part_units) {
55023 + coord_set_item_pos(&to, 0);
55024 + to.unit_pos = 0;
55025 + to.between = AT_UNIT;
55026 +			/* copy the tail part (@part units) of @source item as
55027 +			   a new item into @target->node */
55028 +
55029 + /* copy item header of partially copied item */
55030 + memcpy(to_ih, from_ih, sizeof(item_header40));
55031 + ih40_set_offset(to_ih, sizeof(node40_header));
55032 + if (item_plugin_by_coord(&to)->b.init)
55033 + item_plugin_by_coord(&to)->b.init(&to, &from,
55034 + NULL);
55035 + copy_units(&to, &from,
55036 + coord_last_unit_pos(&from) -
55037 + shift->part_units + 1, shift->part_units,
55038 + SHIFT_RIGHT, shift->part_bytes);
55039 + }
55040 + }
55041 +}
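+
+/* Editor's note: the SHIFT_RIGHT branch above first has to memmove() the
+   existing bodies toward the end of @target and renumber their headers,
+   because the new material lands at the front of the node; the SHIFT_LEFT
+   branch simply appends at the current free space start */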
55042 +
55043 +/* remove everything either before or after @shift->real_stop. The number of
55044 +   items removed completely is returned */
55045 +static int delete_copied(struct shift_params *shift)
55046 +{
55047 + coord_t from;
55048 + coord_t to;
55049 + struct carry_cut_data cdata;
55050 +
55051 + if (shift->pend == SHIFT_LEFT) {
55052 + /* we were shifting to left, remove everything from the
55053 +		   beginning of @shift->wish_stop.node up to
55054 +		   @shift->wish_stop */
55055 + coord_init_first_unit(&from, shift->real_stop.node);
55056 + to = shift->real_stop;
55057 +
55058 + /* store old coordinate of unit which will be first after
55059 + shift to left */
55060 + shift->u.future_first = to;
55061 + coord_next_unit(&shift->u.future_first);
55062 + } else {
55063 + /* we were shifting to right, remove everything from
55064 +		   @shift->real_stop up to the end of
55065 +		   @shift->real_stop.node */
55066 + from = shift->real_stop;
55067 + coord_init_last_unit(&to, from.node);
55068 +
55069 + /* store old coordinate of unit which will be last after
55070 + shift to right */
55071 + shift->u.future_last = from;
55072 + coord_prev_unit(&shift->u.future_last);
55073 + }
55074 +
55075 + cdata.params.from = &from;
55076 + cdata.params.to = &to;
55077 + cdata.params.from_key = NULL;
55078 + cdata.params.to_key = NULL;
55079 + cdata.params.smallest_removed = NULL;
55080 + return cut_node40(&cdata, NULL);
55081 +}
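+
+/* Editor's note: delete_copied() passes a NULL carry_plugin_info to
+   cut_node40(), so no carry operations are queued for the upper levels; the
+   code driving the shift takes care of updating delimiting keys via
+   prepare_for_update() below */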
55082 +
55083 +/* something was moved between @left and @right. Add carry operation to @info
55084 + list to have carry to update delimiting key between them */
55085 +static int
55086 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
55087 +{
55088 + carry_op *op;
55089 + carry_node *cn;
55090 +
55091 + if (info == NULL)
55092 + /* nowhere to send operation to. */
55093 + return 0;
55094 +
55095 + if (!should_notify_parent(right))
55096 + return 0;
55097 +
55098 + op = node_post_carry(info, COP_UPDATE, right, 1);
55099 + if (IS_ERR(op) || op == NULL)
55100 + return op ? PTR_ERR(op) : -EIO;
55101 +
55102 + if (left != NULL) {
55103 + carry_node *reference;
55104 +
55105 + if (info->doing)
55106 + reference = insert_carry_node(info->doing,
55107 + info->todo, left);
55108 + else
55109 + reference = op->node;
55110 + assert("nikita-2992", reference != NULL);
55111 + cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
55112 + if (IS_ERR(cn))
55113 + return PTR_ERR(cn);
55114 + cn->parent = 1;
55115 + cn->node = left;
55116 + if (ZF_ISSET(left, JNODE_ORPHAN))
55117 + cn->left_before = 1;
55118 + op->u.update.left = cn;
55119 + } else
55120 + op->u.update.left = NULL;
55121 + return 0;
55122 +}
55123 +
55124 +/* plugin->u.node.prepare_removal
55125 +   to delete a pointer to @empty from the tree, add the corresponding carry
55126 +   operation (delete) to the @info list */
55127 +int prepare_removal_node40(znode * empty, carry_plugin_info * info)
55128 +{
55129 + carry_op *op;
55130 + reiser4_tree *tree;
55131 +
55132 + if (!should_notify_parent(empty))
55133 + return 0;
55134 + /* already on a road to Styx */
55135 + if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
55136 + return 0;
55137 + op = node_post_carry(info, COP_DELETE, empty, 1);
55138 + if (IS_ERR(op) || op == NULL)
55139 + return RETERR(op ? PTR_ERR(op) : -EIO);
55140 +
55141 + op->u.delete.child = NULL;
55142 + op->u.delete.flags = 0;
55143 +
55144 + /* fare thee well */
55145 + tree = znode_get_tree(empty);
55146 + read_lock_tree(tree);
55147 + write_lock_dk(tree);
55148 + znode_set_ld_key(empty, znode_get_rd_key(empty));
55149 + if (znode_is_left_connected(empty) && empty->left)
55150 + znode_set_rd_key(empty->left, znode_get_rd_key(empty));
55151 + write_unlock_dk(tree);
55152 + read_unlock_tree(tree);
55153 +
55154 + ZF_SET(empty, JNODE_HEARD_BANSHEE);
55155 + return 0;
55156 +}
55157 +
55158 +/* something was shifted from @insert_coord->node to @shift->target, update
55159 + @insert_coord correspondingly */
55160 +static void
55161 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
55162 + int including_insert_coord)
55163 +{
55164 + /* item plugin was invalidated by shifting */
55165 + coord_clear_iplug(insert_coord);
55166 +
55167 + if (node_is_empty(shift->wish_stop.node)) {
55168 + assert("vs-242", shift->everything);
55169 + if (including_insert_coord) {
55170 + if (shift->pend == SHIFT_RIGHT) {
55171 + /* set @insert_coord before first unit of
55172 + @shift->target node */
55173 + coord_init_before_first_item(insert_coord,
55174 + shift->target);
55175 + } else {
55176 + /* set @insert_coord after last in target node */
55177 + coord_init_after_last_item(insert_coord,
55178 + shift->target);
55179 + }
55180 + } else {
55181 + /* set @insert_coord inside of empty node. There is
55182 + only one possible coord within an empty
55183 + node. init_first_unit will set that coord */
55184 + coord_init_first_unit(insert_coord,
55185 + shift->wish_stop.node);
55186 + }
55187 + return;
55188 + }
55189 +
55190 + if (shift->pend == SHIFT_RIGHT) {
55191 + /* there was shifting to right */
55192 + if (shift->everything) {
55193 + /* everything wanted was shifted */
55194 + if (including_insert_coord) {
55195 + /* @insert_coord is set before first unit of
55196 + @to node */
55197 + coord_init_before_first_item(insert_coord,
55198 + shift->target);
55199 + insert_coord->between = BEFORE_UNIT;
55200 + } else {
55201 + /* @insert_coord is set after last unit of
55202 + @insert->node */
55203 + coord_init_last_unit(insert_coord,
55204 + shift->wish_stop.node);
55205 + insert_coord->between = AFTER_UNIT;
55206 + }
55207 + }
55208 + return;
55209 + }
55210 +
55211 + /* there was shifting to left */
55212 + if (shift->everything) {
55213 + /* everything wanted was shifted */
55214 + if (including_insert_coord) {
55215 + /* @insert_coord is set after last unit in @to node */
55216 + coord_init_after_last_item(insert_coord, shift->target);
55217 + } else {
55218 + /* @insert_coord is set before first unit in the same
55219 + node */
55220 + coord_init_before_first_item(insert_coord,
55221 + shift->wish_stop.node);
55222 + }
55223 + return;
55224 + }
55225 +
55226 + /* FIXME-VS: the code below is complicated because with between ==
55227 + AFTER_ITEM unit_pos is set to 0 */
55228 +
55229 + if (!removed) {
55230 + /* no items were shifted entirely */
55231 + assert("vs-195", shift->merging_units == 0
55232 + || shift->part_units == 0);
55233 +
55234 + if (shift->real_stop.item_pos == insert_coord->item_pos) {
55235 + if (shift->merging_units) {
55236 + if (insert_coord->between == AFTER_UNIT) {
55237 + assert("nikita-1441",
55238 + insert_coord->unit_pos >=
55239 + shift->merging_units);
55240 + insert_coord->unit_pos -=
55241 + shift->merging_units;
55242 + } else if (insert_coord->between == BEFORE_UNIT) {
55243 + assert("nikita-2090",
55244 + insert_coord->unit_pos >
55245 + shift->merging_units);
55246 + insert_coord->unit_pos -=
55247 + shift->merging_units;
55248 + }
55249 +
55250 + assert("nikita-2083",
55251 + insert_coord->unit_pos + 1);
55252 + } else {
55253 + if (insert_coord->between == AFTER_UNIT) {
55254 + assert("nikita-1442",
55255 + insert_coord->unit_pos >=
55256 + shift->part_units);
55257 + insert_coord->unit_pos -=
55258 + shift->part_units;
55259 + } else if (insert_coord->between == BEFORE_UNIT) {
55260 + assert("nikita-2089",
55261 + insert_coord->unit_pos >
55262 + shift->part_units);
55263 + insert_coord->unit_pos -=
55264 + shift->part_units;
55265 + }
55266 +
55267 + assert("nikita-2084",
55268 + insert_coord->unit_pos + 1);
55269 + }
55270 + }
55271 + return;
55272 + }
55273 +
55274 +	/* we shifted to left and there was not enough space for everything */
55275 + switch (insert_coord->between) {
55276 + case AFTER_UNIT:
55277 + case BEFORE_UNIT:
55278 + if (shift->real_stop.item_pos == insert_coord->item_pos)
55279 + insert_coord->unit_pos -= shift->part_units;
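+		/* fall through: item_pos has to be adjusted as well */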
55280 + case AFTER_ITEM:
55281 + coord_add_item_pos(insert_coord, -removed);
55282 + break;
55283 + default:
55284 + impossible("nikita-2087", "not ready");
55285 + }
55286 + assert("nikita-2085", insert_coord->unit_pos + 1);
55287 +}
55288 +
55289 +static int call_shift_hooks(struct shift_params *shift)
55290 +{
55291 + unsigned i, shifted;
55292 + coord_t coord;
55293 + item_plugin *iplug;
55294 +
55295 + assert("vs-275", !node_is_empty(shift->target));
55296 +
55297 + /* number of items shift touches */
55298 + shifted =
55299 + shift->entire + (shift->merging_units ? 1 : 0) +
55300 + (shift->part_units ? 1 : 0);
55301 +
55302 + if (shift->pend == SHIFT_LEFT) {
55303 + /* moved items are at the end */
55304 + coord_init_last_unit(&coord, shift->target);
55305 + coord.unit_pos = 0;
55306 +
55307 + assert("vs-279", shift->pend == 1);
55308 + for (i = 0; i < shifted; i++) {
55309 + unsigned from, count;
55310 +
55311 + iplug = item_plugin_by_coord(&coord);
55312 + if (i == 0 && shift->part_units) {
55313 + assert("vs-277",
55314 + coord_num_units(&coord) ==
55315 + shift->part_units);
55316 + count = shift->part_units;
55317 + from = 0;
55318 + } else if (i == shifted - 1 && shift->merging_units) {
55319 + count = shift->merging_units;
55320 + from = coord_num_units(&coord) - count;
55321 + } else {
55322 + count = coord_num_units(&coord);
55323 + from = 0;
55324 + }
55325 +
55326 + if (iplug->b.shift_hook) {
55327 + iplug->b.shift_hook(&coord, from, count,
55328 + shift->wish_stop.node);
55329 + }
55330 + coord_add_item_pos(&coord, -shift->pend);
55331 + }
55332 + } else {
55333 + /* moved items are at the beginning */
55334 + coord_init_first_unit(&coord, shift->target);
55335 +
55336 + assert("vs-278", shift->pend == -1);
55337 + for (i = 0; i < shifted; i++) {
55338 + unsigned from, count;
55339 +
55340 + iplug = item_plugin_by_coord(&coord);
55341 + if (i == 0 && shift->part_units) {
55342 + assert("vs-277",
55343 + coord_num_units(&coord) ==
55344 + shift->part_units);
55345 + count = coord_num_units(&coord);
55346 + from = 0;
55347 + } else if (i == shifted - 1 && shift->merging_units) {
55348 + count = shift->merging_units;
55349 + from = 0;
55350 + } else {
55351 + count = coord_num_units(&coord);
55352 + from = 0;
55353 + }
55354 +
55355 + if (iplug->b.shift_hook) {
55356 + iplug->b.shift_hook(&coord, from, count,
55357 + shift->wish_stop.node);
55358 + }
55359 + coord_add_item_pos(&coord, -shift->pend);
55360 + }
55361 + }
55362 +
55363 + return 0;
55364 +}
55365 +
55366 +/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
55367 +static int
55368 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
55369 +{
55370 + assert("vs-944", shift->real_stop.node == old->node);
55371 +
55372 + if (shift->real_stop.item_pos < old->item_pos)
55373 + return 0;
55374 + if (shift->real_stop.item_pos == old->item_pos) {
55375 + if (shift->real_stop.unit_pos < old->unit_pos)
55376 + return 0;
55377 + }
55378 + return 1;
55379 +}
55380 +
55381 +/* shift to right is completed. Return 1 if unit @old was moved to right
55382 + neighbor */
55383 +static int
55384 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
55385 +{
55386 + assert("vs-944", shift->real_stop.node == old->node);
55387 +
55388 + if (shift->real_stop.item_pos > old->item_pos)
55389 + return 0;
55390 + if (shift->real_stop.item_pos == old->item_pos) {
55391 + if (shift->real_stop.unit_pos > old->unit_pos)
55392 + return 0;
55393 + }
55394 + return 1;
55395 +}
55396 +
55397 +/* coord @old was set in node from which shift was performed. What was shifted
55398 +   is stored in @shift. Compute @new from @old to reflect the performed shift */
55399 +static coord_t *adjust_coord2(const struct shift_params *shift,
55400 + const coord_t * old, coord_t * new)
55401 +{
55402 + coord_clear_iplug(new);
55403 + new->between = old->between;
55404 +
55406 + if (old->node == shift->target) {
55407 + if (shift->pend == SHIFT_LEFT) {
55408 + /* coord which is set inside of left neighbor does not
55409 + change during shift to left */
55410 + coord_dup(new, old);
55411 + return new;
55412 + }
55413 + new->node = old->node;
55414 + coord_set_item_pos(new,
55415 + old->item_pos + shift->entire +
55416 + (shift->part_units ? 1 : 0));
55417 + new->unit_pos = old->unit_pos;
55418 + if (old->item_pos == 0 && shift->merging_units)
55419 + new->unit_pos += shift->merging_units;
55420 + return new;
55421 + }
55422 +
55423 + assert("vs-977", old->node == shift->wish_stop.node);
55424 + if (shift->pend == SHIFT_LEFT) {
55425 + if (unit_moved_left(shift, old)) {
55426 + /* unit @old moved to left neighbor. Calculate its
55427 + coordinate there */
55428 + new->node = shift->target;
55429 + coord_set_item_pos(new,
55430 + node_num_items(shift->target) -
55431 + shift->entire -
55432 + (shift->part_units ? 1 : 0) +
55433 + old->item_pos);
55434 +
55435 + new->unit_pos = old->unit_pos;
55436 + if (shift->merging_units) {
55437 + coord_dec_item_pos(new);
55438 + if (old->item_pos == 0) {
55439 + /* unit_pos only changes if item got
55440 + merged */
55441 + new->unit_pos =
55442 + coord_num_units(new) -
55443 + (shift->merging_units -
55444 + old->unit_pos);
55445 + }
55446 + }
55447 + } else {
55448 + /* unit @old did not move to left neighbor.
55449 +
55450 + Use _nocheck, because @old is outside of its node.
55451 + */
55452 + coord_dup_nocheck(new, old);
55453 + coord_add_item_pos(new,
55454 + -shift->u.future_first.item_pos);
55455 + if (new->item_pos == 0)
55456 + new->unit_pos -= shift->u.future_first.unit_pos;
55457 + }
55458 + } else {
55459 + if (unit_moved_right(shift, old)) {
55460 + /* unit @old moved to right neighbor */
55461 + new->node = shift->target;
55462 + coord_set_item_pos(new,
55463 + old->item_pos -
55464 + shift->real_stop.item_pos);
55465 + if (new->item_pos == 0) {
55466 + /* unit @old might change unit pos */
55467 +				new->unit_pos = old->unit_pos -
55468 +				    shift->real_stop.unit_pos;
55470 + }
55471 + } else {
55472 + /* unit @old did not move to right neighbor, therefore
55473 + it did not change */
55474 + coord_dup(new, old);
55475 + }
55476 + }
55477 + coord_set_iplug(new, item_plugin_by_coord(new));
55478 + return new;
55479 +}
55480 +
55481 +/* this is called when shift is completed (part of the source node has been
55482 +   copied to the target and deleted in the source) to update all taps set in
55483 +   the current context */
55484 +static void update_taps(const struct shift_params *shift)
55485 +{
55486 + tap_t *tap;
55487 + coord_t new;
55488 +
55489 + for_all_taps(tap) {
55490 + /* update only taps set to nodes participating in shift */
55491 + if (tap->coord->node == shift->wish_stop.node
55492 + || tap->coord->node == shift->target)
55493 + tap_to_coord(tap,
55494 + adjust_coord2(shift, tap->coord, &new));
55495 + }
55496 +}
55497 +
55498 +#if REISER4_DEBUG
55499 +
55500 +struct shift_check {
55501 + reiser4_key key;
55502 + __u16 plugin_id;
55503 + union {
55504 + __u64 bytes;
55505 + __u64 entries;
55506 + void *unused;
55507 + } u;
55508 +};
55509 +
55510 +void *shift_check_prepare(const znode * left, const znode * right)
55511 +{
55512 + pos_in_node_t i, nr_items;
55513 + int mergeable;
55514 + struct shift_check *data;
55515 + item_header40 *ih;
55516 +
55517 + if (node_is_empty(left) || node_is_empty(right))
55518 + mergeable = 0;
55519 + else {
55520 + coord_t l, r;
55521 +
55522 + coord_init_last_unit(&l, left);
55523 + coord_init_first_unit(&r, right);
55524 + mergeable = are_items_mergeable(&l, &r);
55525 + }
55526 + nr_items =
55527 + node40_num_of_items_internal(left) +
55528 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55529 + data =
55530 + kmalloc(sizeof(struct shift_check) * nr_items,
55531 + reiser4_ctx_gfp_mask_get());
55532 + if (data != NULL) {
55533 + coord_t coord;
55534 + pos_in_node_t item_pos;
55535 +
55536 + coord_init_first_unit(&coord, left);
55537 + i = 0;
55538 +
55539 + for (item_pos = 0;
55540 + item_pos < node40_num_of_items_internal(left);
55541 + item_pos++) {
55542 +
55543 + coord_set_item_pos(&coord, item_pos);
55544 + ih = node40_ih_at_coord(&coord);
55545 +
55546 + data[i].key = ih->key;
55547 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55548 + switch (data[i].plugin_id) {
55549 + case CTAIL_ID:
55550 + case FORMATTING_ID:
55551 + data[i].u.bytes = coord_num_units(&coord);
55552 + break;
55553 + case EXTENT_POINTER_ID:
55554 + data[i].u.bytes =
55555 + reiser4_extent_size(&coord,
55556 + coord_num_units(&coord));
55557 + break;
55558 + case COMPOUND_DIR_ID:
55559 + data[i].u.entries = coord_num_units(&coord);
55560 + break;
55561 + default:
55562 + data[i].u.unused = NULL;
55563 + break;
55564 + }
55565 + i++;
55566 + }
55567 +
55568 + coord_init_first_unit(&coord, right);
55569 +
55570 + if (mergeable) {
55571 + assert("vs-1609", i != 0);
55572 +
55573 + ih = node40_ih_at_coord(&coord);
55574 +
55575 + assert("vs-1589",
55576 + data[i - 1].plugin_id ==
55577 + le16_to_cpu(get_unaligned(&ih->plugin_id)));
55578 + switch (data[i - 1].plugin_id) {
55579 + case CTAIL_ID:
55580 + case FORMATTING_ID:
55581 + data[i - 1].u.bytes += coord_num_units(&coord);
55582 + break;
55583 + case EXTENT_POINTER_ID:
55584 + data[i - 1].u.bytes +=
55585 + reiser4_extent_size(&coord,
55586 + coord_num_units(&coord));
55587 + break;
55588 + case COMPOUND_DIR_ID:
55589 + data[i - 1].u.entries +=
55590 + coord_num_units(&coord);
55591 + break;
55592 + default:
55593 + impossible("vs-1605", "wrong mergeable item");
55594 + break;
55595 + }
55596 + item_pos = 1;
55597 + } else
55598 + item_pos = 0;
55599 + for (; item_pos < node40_num_of_items_internal(right);
55600 + item_pos++) {
55601 +
55602 + assert("vs-1604", i < nr_items);
55603 + coord_set_item_pos(&coord, item_pos);
55604 + ih = node40_ih_at_coord(&coord);
55605 +
55606 + data[i].key = ih->key;
55607 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55608 + switch (data[i].plugin_id) {
55609 + case CTAIL_ID:
55610 + case FORMATTING_ID:
55611 + data[i].u.bytes = coord_num_units(&coord);
55612 + break;
55613 + case EXTENT_POINTER_ID:
55614 + data[i].u.bytes =
55615 + reiser4_extent_size(&coord,
55616 + coord_num_units(&coord));
55617 + break;
55618 + case COMPOUND_DIR_ID:
55619 + data[i].u.entries = coord_num_units(&coord);
55620 + break;
55621 + default:
55622 + data[i].u.unused = NULL;
55623 + break;
55624 + }
55625 + i++;
55626 + }
55627 + assert("vs-1606", i == nr_items);
55628 + }
55629 + return data;
55630 +}
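+
+/* Editor's note: shift_check_prepare() snapshots the keys, plugin ids and
+   sizes of every item in @left and @right before a shift; shift_check()
+   below walks both nodes again afterwards and asserts that the merged
+   sequence of items is unchanged */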
55631 +
55632 +void shift_check(void *vp, const znode * left, const znode * right)
55633 +{
55634 + pos_in_node_t i, nr_items;
55635 + coord_t coord;
55636 + __u64 last_bytes;
55637 + int mergeable;
55638 + item_header40 *ih;
55639 + pos_in_node_t item_pos;
55640 + struct shift_check *data;
55641 +
55642 + data = (struct shift_check *)vp;
55643 +
55644 + if (data == NULL)
55645 + return;
55646 +
55647 + if (node_is_empty(left) || node_is_empty(right))
55648 + mergeable = 0;
55649 + else {
55650 + coord_t l, r;
55651 +
55652 + coord_init_last_unit(&l, left);
55653 + coord_init_first_unit(&r, right);
55654 + mergeable = are_items_mergeable(&l, &r);
55655 + }
55656 +
55657 + nr_items =
55658 + node40_num_of_items_internal(left) +
55659 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55660 +
55661 + i = 0;
55662 + last_bytes = 0;
55663 +
55664 + coord_init_first_unit(&coord, left);
55665 +
55666 + for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
55667 + item_pos++) {
55668 +
55669 + coord_set_item_pos(&coord, item_pos);
55670 + ih = node40_ih_at_coord(&coord);
55671 +
55672 + assert("vs-1611", i == item_pos);
55673 + assert("vs-1590", keyeq(&ih->key, &data[i].key));
55674 + assert("vs-1591",
55675 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55676 + if ((i < (node40_num_of_items_internal(left) - 1))
55677 + || !mergeable) {
55678 + switch (data[i].plugin_id) {
55679 + case CTAIL_ID:
55680 + case FORMATTING_ID:
55681 + assert("vs-1592",
55682 + data[i].u.bytes ==
55683 + coord_num_units(&coord));
55684 + break;
55685 + case EXTENT_POINTER_ID:
55686 + assert("vs-1593",
55687 + data[i].u.bytes ==
55688 + reiser4_extent_size(&coord,
55689 + coord_num_units
55690 + (&coord)));
55691 + break;
55692 + case COMPOUND_DIR_ID:
55693 + assert("vs-1594",
55694 + data[i].u.entries ==
55695 + coord_num_units(&coord));
55696 + break;
55697 + default:
55698 + break;
55699 + }
55700 + }
55701 + if (item_pos == (node40_num_of_items_internal(left) - 1)
55702 + && mergeable) {
55703 + switch (data[i].plugin_id) {
55704 + case CTAIL_ID:
55705 + case FORMATTING_ID:
55706 + last_bytes = coord_num_units(&coord);
55707 + break;
55708 + case EXTENT_POINTER_ID:
55709 + last_bytes =
55710 + reiser4_extent_size(&coord,
55711 + coord_num_units(&coord));
55712 + break;
55713 + case COMPOUND_DIR_ID:
55714 + last_bytes = coord_num_units(&coord);
55715 + break;
55716 + default:
55717 + impossible("vs-1595", "wrong mergeable item");
55718 + break;
55719 + }
55720 + }
55721 + i++;
55722 + }
55723 +
55724 + coord_init_first_unit(&coord, right);
55725 + if (mergeable) {
55726 + ih = node40_ih_at_coord(&coord);
55727 +
55728 + assert("vs-1589",
55729 + data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
55730 + assert("vs-1608", last_bytes != 0);
55731 + switch (data[i - 1].plugin_id) {
55732 + case CTAIL_ID:
55733 + case FORMATTING_ID:
55734 + assert("vs-1596",
55735 + data[i - 1].u.bytes ==
55736 + last_bytes + coord_num_units(&coord));
55737 + break;
55738 +
55739 + case EXTENT_POINTER_ID:
55740 + assert("vs-1597",
55741 + data[i - 1].u.bytes ==
55742 + last_bytes + reiser4_extent_size(&coord,
55743 + coord_num_units
55744 + (&coord)));
55745 + break;
55746 +
55747 + case COMPOUND_DIR_ID:
55748 + assert("vs-1598",
55749 + data[i - 1].u.bytes ==
55750 + last_bytes + coord_num_units(&coord));
55751 + break;
55752 + default:
55753 + impossible("vs-1599", "wrong mergeable item");
55754 + break;
55755 + }
55756 + item_pos = 1;
55757 + } else
55758 + item_pos = 0;
55759 +
55760 + for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
55761 +
55762 + coord_set_item_pos(&coord, item_pos);
55763 + ih = node40_ih_at_coord(&coord);
55764 +
55765 + assert("vs-1612", keyeq(&ih->key, &data[i].key));
55766 + assert("vs-1613",
55767 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55768 + switch (data[i].plugin_id) {
55769 + case CTAIL_ID:
55770 + case FORMATTING_ID:
55771 + assert("vs-1600",
55772 + data[i].u.bytes == coord_num_units(&coord));
55773 + break;
55774 + case EXTENT_POINTER_ID:
55775 + assert("vs-1601",
55776 + data[i].u.bytes ==
55777 + reiser4_extent_size(&coord,
55778 + coord_num_units
55779 + (&coord)));
55780 + break;
55781 + case COMPOUND_DIR_ID:
55782 + assert("vs-1602",
55783 + data[i].u.entries == coord_num_units(&coord));
55784 + break;
55785 + default:
55786 + break;
55787 + }
55788 + i++;
55789 + }
55790 +
55791 + assert("vs-1603", i == nr_items);
55792 + kfree(data);
55793 +}
55794 +
55795 +#endif
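Taken together, the two routines above form a debug-only bracket around a node shift: shift_check_prepare() snapshots the key, plugin id and size of every item in the two nodes, and shift_check() re-walks both nodes afterwards, asserting that the combined item population is unchanged before freeing the snapshot. A minimal usage sketch, assuming a stand-in do_shift() for the real shifting logic (the wrapper below is illustrative, not a function from this patch):

	static void shift_with_check(znode *left, znode *right)
	{
	#if REISER4_DEBUG
		/* snapshot the combined item population of both nodes */
		void *check = shift_check_prepare(left, right);
	#endif
		/* do_shift() is a hypothetical stand-in for the shifting code */
		do_shift(left, right);
	#if REISER4_DEBUG
		/* assert nothing was lost or duplicated; this also frees
		   @check. shift_check() tolerates NULL, since the snapshot
		   allocation in shift_check_prepare() may fail. */
		shift_check(check, left, right);
	#endif
	}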
55796 +
55797 +/* plugin->u.node.shift
55798 + look for description of this method in plugin/node/node.h */
55799 +int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be
55800 + deleted from the tree if this is set to 1 */
55801 + int including_stop_coord, carry_plugin_info * info)
55802 +{
55803 + struct shift_params shift;
55804 + int result;
55805 + znode *left, *right;
55806 + znode *source;
55807 + int target_empty;
55808 +
55809 + assert("nikita-2161", coord_check(from));
55810 +
55811 + memset(&shift, 0, sizeof(shift));
55812 + shift.pend = pend;
55813 + shift.wish_stop = *from;
55814 + shift.target = to;
55815 +
55816 + assert("nikita-1473", znode_is_write_locked(from->node));
55817 + assert("nikita-1474", znode_is_write_locked(to));
55818 +
55819 + source = from->node;
55820 +
55821 + /* set @shift.wish_stop to rightmost/leftmost unit among units we want
55822 + shifted */
55823 + if (pend == SHIFT_LEFT) {
55824 + result = coord_set_to_left(&shift.wish_stop);
55825 + left = to;
55826 + right = from->node;
55827 + } else {
55828 + result = coord_set_to_right(&shift.wish_stop);
55829 + left = from->node;
55830 + right = to;
55831 + }
55832 +
55833 + if (result) {
55834 + /* move insertion coord even if there is nothing to move */
55835 + if (including_stop_coord) {
55836 + /* move insertion coord (@from) */
55837 + if (pend == SHIFT_LEFT) {
55838 + /* after last item in target node */
55839 + coord_init_after_last_item(from, to);
55840 + } else {
55841 + /* before first item in target node */
55842 + coord_init_before_first_item(from, to);
55843 + }
55844 + }
55845 +
55846 + if (delete_child && node_is_empty(shift.wish_stop.node))
55847 + result =
55848 + prepare_removal_node40(shift.wish_stop.node, info);
55849 + else
55850 + result = 0;
55851 + /* there is nothing to shift */
55852 + assert("nikita-2078", coord_check(from));
55853 + return result;
55854 + }
55855 +
55856 + target_empty = node_is_empty(to);
55857 +
55858 +	/* when the first node plugin with item body compression is
55859 +	   implemented, this must be changed to call the node-specific plugin */
55860 +
55861 + /* shift->stop_coord is updated to last unit which really will be
55862 + shifted */
55863 + estimate_shift(&shift, get_current_context());
55864 + if (!shift.shift_bytes) {
55865 + /* we could not shift anything */
55866 + assert("nikita-2079", coord_check(from));
55867 + return 0;
55868 + }
55869 +
55870 + copy(&shift);
55871 +
55872 + /* result value of this is important. It is used by adjust_coord below */
55873 + result = delete_copied(&shift);
55874 +
55875 + assert("vs-1610", result >= 0);
55876 + assert("vs-1471",
55877 + ((reiser4_context *) current->journal_info)->magic ==
55878 + context_magic);
55879 +
55880 +	/* an item which has been moved from one node to another might want to
55881 +	   do something on that event. This can be done by the item's shift_hook
55882 +	   method, which will now be called for every moved item */
55883 + call_shift_hooks(&shift);
55884 +
55885 + assert("vs-1472",
55886 + ((reiser4_context *) current->journal_info)->magic ==
55887 + context_magic);
55888 +
55889 + update_taps(&shift);
55890 +
55891 + assert("vs-1473",
55892 + ((reiser4_context *) current->journal_info)->magic ==
55893 + context_magic);
55894 +
55895 + /* adjust @from pointer in accordance with @including_stop_coord flag
55896 + and amount of data which was really shifted */
55897 + adjust_coord(from, &shift, result, including_stop_coord);
55898 +
55899 + if (target_empty)
55900 + /*
55901 + * items were shifted into empty node. Update delimiting key.
55902 + */
55903 + result = prepare_for_update(NULL, left, info);
55904 +
55905 + /* add update operation to @info, which is the list of operations to
55906 + be performed on a higher level */
55907 + result = prepare_for_update(left, right, info);
55908 + if (!result && node_is_empty(source) && delete_child) {
55909 +		/* all contents of @from->node have been moved to @to, and
55910 +		   @from->node has to be removed from the tree; on the higher
55911 +		   level we will be removing the pointer to node @from->node */
55912 + result = prepare_removal_node40(source, info);
55913 + }
55914 + assert("nikita-2080", coord_check(from));
55915 + return result ? result : (int)shift.shift_bytes;
55916 +}
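The return contract matters to callers: a negative value is an error code, anything else is the number of bytes actually shifted (possibly zero). A hedged sketch of a caller, in which everything except shift_node40() and SHIFT_LEFT is illustrative:

	static int shift_all_left(coord_t *from, znode *target,
				  carry_plugin_info *info)
	{
		int moved;

		moved = shift_node40(from, target, SHIFT_LEFT,
				     1 /* delete @from->node if emptied */,
				     0 /* don't move the stop coord itself */,
				     info);
		if (moved < 0)
			return moved;	/* shift failed */
		/* @moved bytes now live in @target and @from was adjusted */
		return 0;
	}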
55917 +
55918 +/* plugin->u.node.fast_insert()
55919 + look for description of this method in plugin/node/node.h */
55920 +int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55921 +{
55922 + return 1;
55923 +}
55924 +
55925 +/* plugin->u.node.fast_paste()
55926 + look for description of this method in plugin/node/node.h */
55927 +int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55928 +{
55929 + return 1;
55930 +}
55931 +
55932 +/* plugin->u.node.fast_cut()
55933 + look for description of this method in plugin/node/node.h */
55934 +int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55935 +{
55936 + return 1;
55937 +}
55938 +
55939 +/* plugin->u.node.modify - not defined */
55940 +
55941 +/* plugin->u.node.max_item_size */
55942 +int max_item_size_node40(void)
55943 +{
55944 + return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
55945 + sizeof(item_header40);
55946 +}
55947 +
55948 +/* plugin->u.node.set_item_plugin */
55949 +int set_item_plugin_node40(coord_t *coord, item_id id)
55950 +{
55951 + item_header40 *ih;
55952 +
55953 + ih = node40_ih_at_coord(coord);
55954 + put_unaligned(cpu_to_le16(id), &ih->plugin_id);
55955 + coord->iplugid = id;
55956 + return 0;
55957 +}
55958 +
55959 +/*
55960 + Local variables:
55961 + c-indentation-style: "K&R"
55962 + mode-name: "LC"
55963 + c-basic-offset: 8
55964 + tab-width: 8
55965 + fill-column: 120
55966 + scroll-step: 1
55967 + End:
55968 +*/
55969 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/node/node40.h linux-2.6.33/fs/reiser4/plugin/node/node40.h
55970 --- linux-2.6.33.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 01:00:00.000000000 +0100
55971 +++ linux-2.6.33/fs/reiser4/plugin/node/node40.h 2010-03-04 19:33:22.000000000 +0100
55972 @@ -0,0 +1,125 @@
55973 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55974 +
55975 +#if !defined( __REISER4_NODE40_H__ )
55976 +#define __REISER4_NODE40_H__
55977 +
55978 +#include "../../forward.h"
55979 +#include "../../dformat.h"
55980 +#include "node.h"
55981 +
55982 +#include <linux/types.h>
55983 +
55984 +/* format of node header for 40 node layouts. Keep bloat out of this struct. */
55985 +typedef struct node40_header {
55986 + /* identifier of node plugin. Must be located at the very beginning
55987 + of a node. */
55988 + common_node_header common_header; /* this is 16 bits */
55989 +	/* number of items. Should be the first element in the node header,
55990 +	   because we haven't yet finally decided whether it should go into
55991 +	   common_header.
55992 +	*/
55993 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
55994 + * node format at compile time, and it is this one, accesses to these fields do not go
55995 + * through a function-pointer dereference (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
55996 + d16 nr_items;
55997 + /* free space in node measured in bytes */
55998 + d16 free_space;
55999 + /* offset to start of free space in node */
56000 + d16 free_space_start;
56001 + /* for reiser4_fsck. When information about what is a free
56002 + block is corrupted, and we try to recover everything even
56003 + if marked as freed, then old versions of data may
56004 + duplicate newer versions, and this field allows us to
56005 + restore the newer version. Also useful for when users
56006 + who don't have the new trashcan installed on their linux distro
56007 + delete the wrong files and send us desperate emails
56008 + offering $25 for them back. */
56009 +
56010 +	/* magic field we need to tell formatted nodes apart. NIKITA-FIXME-HANS: improve this comment */
56011 + d32 magic;
56012 + /* flushstamp is made of mk_id and write_counter. mk_id is an
56013 + id generated randomly at mkreiserfs time. So we can just
56014 + skip all nodes with different mk_id. write_counter is d64
56015 + incrementing counter of writes on disk. It is used for
56016 + choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
56017 +
56018 + d32 mkfs_id;
56019 + d64 flush_id;
56020 + /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
56021 + and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
56022 + d16 flags;
56023 +
56024 + /* 1 is leaf level, 2 is twig level, root is the numerically
56025 + largest level */
56026 + d8 level;
56027 +
56028 + d8 pad;
56029 +} PACKED node40_header;
56030 +
56031 +/* item headers are not standard across all node layouts, pass
56032 + pos_in_node to functions instead */
56033 +typedef struct item_header40 {
56034 + /* key of item */
56035 + /* 0 */ reiser4_key key;
56036 + /* offset from start of a node measured in 8-byte chunks */
56037 + /* 24 */ d16 offset;
56038 + /* 26 */ d16 flags;
56039 + /* 28 */ d16 plugin_id;
56040 +} PACKED item_header40;
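These two structures make the bound computed by max_item_size_node40() earlier in the patch concrete. Both are PACKED, so, under the assumption of a 4 KiB block size, the arithmetic works out as follows (reiser4_key is 24 bytes, per the byte offsets annotated above):

	sizeof(node40_header) = 2+2+2+2+4+4+8+2+1+1 = 28 bytes
	sizeof(item_header40) = 24+2+2+2            = 30 bytes
	max item size         = 4096 - 28 - 30      = 4038 bytes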
56041 +
56042 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
56043 +size_t free_space_node40(znode * node);
56044 +node_search_result lookup_node40(znode * node, const reiser4_key * key,
56045 + lookup_bias bias, coord_t * coord);
56046 +int num_of_items_node40(const znode * node);
56047 +char *item_by_coord_node40(const coord_t * coord);
56048 +int length_by_coord_node40(const coord_t * coord);
56049 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
56050 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
56051 +size_t estimate_node40(znode * node);
56052 +int check_node40(const znode * node, __u32 flags, const char **error);
56053 +int parse_node40(znode * node);
56054 +int init_node40(znode * node);
56055 +#ifdef GUESS_EXISTS
56056 +int guess_node40(const znode * node);
56057 +#endif
56058 +void change_item_size_node40(coord_t * coord, int by);
56059 +int create_item_node40(coord_t * target, const reiser4_key * key,
56060 + reiser4_item_data * data, carry_plugin_info * info);
56061 +void update_item_key_node40(coord_t * target, const reiser4_key * key,
56062 + carry_plugin_info * info);
56063 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);
56064 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
56065 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
56066 + /* if @from->node becomes
56067 + empty - it will be deleted from
56068 + the tree if this is set to 1
56069 + */
56070 + int delete_child, int including_stop_coord,
56071 + carry_plugin_info * info);
56072 +
56073 +int fast_insert_node40(const coord_t * coord);
56074 +int fast_paste_node40(const coord_t * coord);
56075 +int fast_cut_node40(const coord_t * coord);
56076 +int max_item_size_node40(void);
56077 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
56078 +int set_item_plugin_node40(coord_t * coord, item_id id);
56079 +int shrink_item_node40(coord_t * coord, int delta);
56080 +
56081 +#if REISER4_DEBUG
56082 +void *shift_check_prepare(const znode *left, const znode *right);
56083 +void shift_check(void *vp, const znode *left, const znode *right);
56084 +#endif
56085 +
56086 +/* __REISER4_NODE40_H__ */
56087 +#endif
56088 +/*
56089 + Local variables:
56090 + c-indentation-style: "K&R"
56091 + mode-name: "LC"
56092 + c-basic-offset: 8
56093 + tab-width: 8
56094 + fill-column: 120
56095 + scroll-step: 1
56096 + End:
56097 +*/
56098 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/node/node.c linux-2.6.33/fs/reiser4/plugin/node/node.c
56099 --- linux-2.6.33.orig/fs/reiser4/plugin/node/node.c 1970-01-01 01:00:00.000000000 +0100
56100 +++ linux-2.6.33/fs/reiser4/plugin/node/node.c 2010-03-04 19:33:22.000000000 +0100
56101 @@ -0,0 +1,131 @@
56102 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
56103 +
56104 +/* Node plugin interface.
56105 +
56106 + Description: The tree provides the abstraction of flows, which it
56107 + internally fragments into items which it stores in nodes.
56108 +
56109 + A key_atom is a piece of data bound to a single key.
56110 +
56111 + For reasonable space efficiency to be achieved it is often
56112 + necessary to store key_atoms in the nodes in the form of items, where
56113 + an item is a sequence of key_atoms of the same or similar type. It is
56114 + more space-efficient, because the item can implement (very)
56115 + efficient compression of key_atom bodies using internal knowledge
56116 + about their semantics, and it can often avoid having a key for each
56117 + key_atom. Each type of item has specific operations implemented by its
56118 + item handler (see balance.c).
56119 +
56120 + Rationale: the rest of the code (specifically balancing routines)
56121 + accesses leaf level nodes through this interface. This way we can
56122 + implement various block layouts and even combine various layouts
56123 + within the same tree. Balancing/allocating algorithms should not
56124 + care about peculiarities of splitting/merging specific item types,
56125 + but rather should leave that to the item's item handler.
56126 +
56127 + Items, including those that provide the abstraction of flows, have
56128 + the property that if you move them in part or in whole to another
56129 + node, the balancing code invokes their is_left_mergeable()
56130 + item_operation to determine if they are mergeable with their new
56131 + neighbor in the node you have moved them to. For some items the
56132 + is_left_mergeable() function always returns zero (never mergeable).
56133 +
56134 + When moving the bodies of items from one node to another:
56135 +
56136 + if a partial item is shifted to another node the balancing code invokes
56137 + an item handler method to handle the item splitting.
56138 +
56139 + if the balancing code needs to merge with an item in the node it
56140 + is shifting to, it will invoke an item handler method to handle
56141 + the item merging.
56142 +
56143 + if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy()
56144 + adjusting the item headers after the move is done using the node handler.
56145 +*/
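The is_left_mergeable() contract described above reduces, on the balancing side, to one decision per arriving item. A hedged sketch: are_items_mergeable() appears elsewhere in this patch, while grow_left_neighbor() and copy_whole_item() are hypothetical stand-ins for the item-handler calls:

	static void place_moved_item(coord_t *left_neighbor, coord_t *moved)
	{
		if (are_items_mergeable(left_neighbor, moved))
			/* extend the neighbor's body; no new item header */
			grow_left_neighbor(left_neighbor, moved);
		else
			/* keep the item separate: new header, body moved whole */
			copy_whole_item(left_neighbor, moved);
	}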
56146 +
56147 +#include "../../forward.h"
56148 +#include "../../debug.h"
56149 +#include "../../key.h"
56150 +#include "../../coord.h"
56151 +#include "../plugin_header.h"
56152 +#include "../item/item.h"
56153 +#include "node.h"
56154 +#include "../plugin.h"
56155 +#include "../../znode.h"
56156 +#include "../../tree.h"
56157 +#include "../../super.h"
56158 +#include "../../reiser4.h"
56159 +
56160 +/**
56161 + * leftmost_key_in_node - get the smallest key in node
56162 + * @node: node to look in
56163 + * @key: store result here
56164 + *
56165 + * Stores the leftmost key of @node in @key.
56166 + */
56167 +reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
56168 +{
56169 + assert("nikita-1634", node != NULL);
56170 + assert("nikita-1635", key != NULL);
56171 +
56172 + if (!node_is_empty(node)) {
56173 + coord_t first_item;
56174 +
56175 + coord_init_first_unit(&first_item, (znode *) node);
56176 + item_key_by_coord(&first_item, key);
56177 + } else
56178 + *key = *reiser4_max_key();
56179 + return key;
56180 +}
56181 +
56182 +node_plugin node_plugins[LAST_NODE_ID] = {
56183 + [NODE40_ID] = {
56184 + .h = {
56185 + .type_id = REISER4_NODE_PLUGIN_TYPE,
56186 + .id = NODE40_ID,
56187 + .pops = NULL,
56188 + .label = "unified",
56189 + .desc = "unified node layout",
56190 + .linkage = {NULL, NULL}
56191 + },
56192 + .item_overhead = item_overhead_node40,
56193 + .free_space = free_space_node40,
56194 + .lookup = lookup_node40,
56195 + .num_of_items = num_of_items_node40,
56196 + .item_by_coord = item_by_coord_node40,
56197 + .length_by_coord = length_by_coord_node40,
56198 + .plugin_by_coord = plugin_by_coord_node40,
56199 + .key_at = key_at_node40,
56200 + .estimate = estimate_node40,
56201 + .check = check_node40,
56202 + .parse = parse_node40,
56203 + .init = init_node40,
56204 +#ifdef GUESS_EXISTS
56205 + .guess = guess_node40,
56206 +#endif
56207 + .change_item_size = change_item_size_node40,
56208 + .create_item = create_item_node40,
56209 + .update_item_key = update_item_key_node40,
56210 + .cut_and_kill = kill_node40,
56211 + .cut = cut_node40,
56212 + .shift = shift_node40,
56213 + .shrink_item = shrink_item_node40,
56214 + .fast_insert = fast_insert_node40,
56215 + .fast_paste = fast_paste_node40,
56216 + .fast_cut = fast_cut_node40,
56217 + .max_item_size = max_item_size_node40,
56218 + .prepare_removal = prepare_removal_node40,
56219 + .set_item_plugin = set_item_plugin_node40
56220 + }
56221 +};
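Callers reach the node40 implementation only through this table, which is what lets other layouts be added without touching the balancing code. A minimal dispatch sketch (the wrapper is illustrative; the table and field names are from this patch):

	static int items_in_node40(const znode *node)
	{
		node_plugin *nplug = &node_plugins[NODE40_ID];

		/* resolves to num_of_items_node40() for this layout */
		return nplug->num_of_items(node);
	}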
56222 +
56223 +/*
56224 + Local variables:
56225 + c-indentation-style: "K&R"
56226 + mode-name: "LC"
56227 + c-basic-offset: 8
56228 + tab-width: 8
56229 + fill-column: 120
56230 + scroll-step: 1
56231 + End:
56232 +*/
56233 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/node/node.h linux-2.6.33/fs/reiser4/plugin/node/node.h
56234 --- linux-2.6.33.orig/fs/reiser4/plugin/node/node.h 1970-01-01 01:00:00.000000000 +0100
56235 +++ linux-2.6.33/fs/reiser4/plugin/node/node.h 2010-03-04 19:33:22.000000000 +0100
56236 @@ -0,0 +1,272 @@
56237 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
56238 +
56239 +/* We need a definition of the default node layout here. */
56240 +
56241 +/* Generally speaking, it is best to have free space in the middle of the
56242 + node so that two sets of things can grow towards it, and to have the
56243 + item bodies on the left so that the last one of them grows into free
56244 + space. We optimize for the case where we append new items to the end
56245 + of the node, or grow the last item, because it hurts nothing to so
56246 + of the node, or grow the last item, because this optimization costs
56247 + nothing and massive insertions in increasing key order are a common
56248 + special case (and one of the cases in which a real user is more likely
56249 + to notice the delay).
56250 + formatted leaf default layout: (leaf1)
56251 +
56252 + |node header:item bodies:free space:key + pluginid + item offset|
56253 +
56254 + We grow towards the middle, optimizing layout for the case where we
56255 + append new items to the end of the node. The node header is fixed
56256 + length. Keys, and item offsets plus pluginids for the items
56257 + corresponding to them are in increasing key order, and are fixed
56258 + length. Item offsets are relative to start of node (16 bits creating
56259 + a node size limit of 64k, 12 bits might be a better choice....). Item
56260 + bodies are in decreasing key order. Item bodies have a variable size.
56261 + There is a one to one to one mapping of keys to item offsets to item
56262 + bodies. Item offsets consist of pointers to the zeroth byte of the
56263 + item body. Item length equals the start of the next item minus the
56264 + start of this item, except the zeroth item whose length equals the end
56265 + of the node minus the start of that item (plus a byte). In other
56266 + words, the item length is not recorded anywhere, and it does not need
56267 + to be since it is computable.
56268 +
56269 + Leaf variable length items and keys layout : (lvar)
56270 +
56271 + |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
56272 +
56273 + We grow towards the middle, optimizing layout for the case where we
56274 + append new items to the end of the node. The node header is fixed
56275 + length. Keys and item offsets for the items corresponding to them are
56276 + in increasing key order, and keys are variable length. Item offsets
56277 + are relative to start of node (16 bits). Item bodies are in
56278 + decreasing key order. Item bodies have a variable size. There is a
56279 + one to one to one mapping of keys to item offsets to item bodies.
56280 + Item offsets consist of pointers to the zeroth byte of the item body.
56281 + Item length equals the start of the next item's key minus the start of
56282 + this item, except the zeroth item whose length equals the end of the
56283 + node minus the start of that item (plus a byte).
56284 +
56285 + leaf compressed keys layout: (lcomp)
56286 +
56287 + |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
56288 +
56289 + We grow towards the middle, optimizing layout for the case where we
56290 + append new items to the end of the node. The node header is fixed
56291 + length. Keys and item offsets for the items corresponding to them are
56292 + in increasing key order, and keys are variable length. The "key
56293 + inherit" field indicates how much of the key prefix is identical to
56294 + the previous key (stem compression as described in "Managing
56295 + Gigabytes" is used). key_inherit is a one byte integer. The
56296 + intra-node searches performed through this layout are linear searches,
56297 + and this is theorized to not hurt performance much due to the high
56298 + cost of processor stalls on modern CPUs, and the small number of keys
56299 + in a single node. Item offsets are relative to start of node (16
56300 + bits). Item bodies are in decreasing key order. Item bodies have a
56301 + variable size. There is a one to one to one mapping of keys to item
56302 + offsets to item bodies. Item offsets consist of pointers to the
56303 + zeroth byte of the item body. Item length equals the start of the
56304 + next item minus the start of this item, except the zeroth item whose
56305 + length equals the end of the node minus the start of that item (plus a
56306 + byte). In other words, item length and key length is not recorded
56307 + anywhere, and it does not need to be since it is computable.
56308 +
56309 + internal node default layout: (idef1)
56310 +
56311 + just like leaf1 except that item bodies are either blocknrs of
56312 + children or extents, and moving them may require updating parent
56313 + pointers in the nodes that they point to.
56314 +*/
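The "length is computable" claim common to these layouts can be stated in two lines. A hedged transcription of the rule: since the prose leaves the physical order of bodies slightly ambiguous, next_start here is a hypothetical value meaning "start of the physically following body, or the end of the node for the physically last body":

	/* sketch only: item length recovered from consecutive offsets */
	static unsigned item_length(unsigned this_start, unsigned next_start)
	{
		return next_start - this_start;
	}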
56315 +
56316 +/* There is an inherent 3-way tradeoff between optimization, exchanging
56317 + disks between different architectures, and code complexity. This
56318 + layout is optimal and simple but not exchangeable. Someone else can
56319 + write the code for exchanging disks and make it complex. It would not
56320 + be that hard. Using a node size other than PAGE_SIZE might be
56321 + suboptimal.
56322 +*/
56323 +
56324 +#if !defined( __REISER4_NODE_H__ )
56325 +#define __REISER4_NODE_H__
56326 +
56327 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
56328 +
56329 +#include "../../dformat.h"
56330 +#include "../plugin_header.h"
56331 +
56332 +#include <linux/types.h>
56333 +
56334 +typedef enum {
56335 + NS_FOUND = 0,
56336 + NS_NOT_FOUND = -ENOENT
56337 +} node_search_result;
56338 +
56339 +/* Maximal possible space overhead for creation of new item in a node */
56340 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
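Given that reiser4_key occupies 24 bytes (see the byte offsets annotated in item_header40 in node40.h above), this bound works out, under that assumption, to:

	REISER4_NODE_MAX_OVERHEAD = 24 + 32 = 56 bytes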
56341 +
56342 +typedef enum {
56343 + REISER4_NODE_DKEYS = (1 << 0),
56344 + REISER4_NODE_TREE_STABLE = (1 << 1)
56345 +} reiser4_node_check_flag;
56346 +
56347 +/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on the stack */
56348 +struct cut_list {
56349 + coord_t *from;
56350 + coord_t *to;
56351 + const reiser4_key *from_key;
56352 + const reiser4_key *to_key;
56353 + reiser4_key *smallest_removed;
56354 + carry_plugin_info *info;
56355 + __u32 flags;
56356 + struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
56357 + lock_handle *left;
56358 + lock_handle *right;
56359 +};
56360 +
56361 +struct carry_cut_data;
56362 +struct carry_kill_data;
56363 +
56364 +/* The responsibility of the node plugin is to store and give access
56365 + to the sequence of items within the node. */
56366 +typedef struct node_plugin {
56367 + /* generic plugin fields */
56368 + plugin_header h;
56369 +
56370 + /* calculates the amount of space that will be required to store an
56371 + item which is in addition to the space consumed by the item body.
56372 + (the space consumed by the item body can be gotten by calling
56373 + item->estimate) */
56374 + size_t(*item_overhead) (const znode * node, flow_t * f);
56375 +
56376 + /* returns free space by looking into node (i.e., without using
56377 + znode->free_space). */
56378 + size_t(*free_space) (znode * node);
56379 + /* search within the node for the one item which might
56380 + contain the key, invoking item->search_within to search within
56381 + that item to see if it is in there */
56382 + node_search_result(*lookup) (znode * node, const reiser4_key * key,
56383 + lookup_bias bias, coord_t * coord);
56384 + /* number of items in node */
56385 + int (*num_of_items) (const znode * node);
56386 +
56387 + /* store information about item in @coord in @data */
56388 + /* break into several node ops, don't add any more uses of this before doing so */
56389 + /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
56390 + char *(*item_by_coord) (const coord_t * coord);
56391 + int (*length_by_coord) (const coord_t * coord);
56392 + item_plugin *(*plugin_by_coord) (const coord_t * coord);
56393 +
56394 + /* store item key in @key */
56395 + reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
56396 +	/* conservatively estimate the size of a unit that can fit
56397 +	   into the node. This estimation should be performed without
56398 +	   actually looking into the node's content (free space is saved
56399 +	   in the znode). */
56400 + size_t(*estimate) (znode * node);
56401 +
56402 + /* performs every consistency check the node plugin author could
56403 + imagine. Optional. */
56404 + int (*check) (const znode * node, __u32 flags, const char **error);
56405 +
56406 + /* Called when node is read into memory and node plugin is
56407 + already detected. This should read some data into znode (like free
56408 + space counter) and, optionally, check data consistency.
56409 + */
56410 + int (*parse) (znode * node);
56411 + /* This method is called on a new node to initialise plugin specific
56412 + data (header, etc.) */
56413 + int (*init) (znode * node);
56414 + /* Check whether @node content conforms to this plugin format.
56415 + Probably only useful after support for old V3.x formats is added.
56416 + Uncomment after 4.0 only.
56417 + */
56418 + /* int ( *guess )( const znode *node ); */
56419 +#if REISER4_DEBUG
56420 + void (*print) (const char *prefix, const znode * node, __u32 flags);
56421 +#endif
56422 +	/* change size of @item by @by bytes. @item->node has enough free
56423 +	   space. When @by > 0, free space is appended to the end of the item.
56424 +	   When @by < 0, the item is truncated; it is assumed that the last
56425 +	   @by bytes of the item are freed already */
56426 + void (*change_item_size) (coord_t * item, int by);
56427 +
56428 + /* create new item @length bytes long in coord @target */
56429 + int (*create_item) (coord_t * target, const reiser4_key * key,
56430 + reiser4_item_data * data, carry_plugin_info * info);
56431 +
56432 + /* update key of item. */
56433 + void (*update_item_key) (coord_t * target, const reiser4_key * key,
56434 + carry_plugin_info * info);
56435 +
56436 + int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
56437 + int (*cut) (struct carry_cut_data *, carry_plugin_info *);
56438 +
56439 + /*
56440 + * shrink item pointed to by @coord by @delta bytes.
56441 + */
56442 + int (*shrink_item) (coord_t * coord, int delta);
56443 +
56444 + /* copy as much as possible but not more than up to @stop from
56445 + @stop->node to @target. If (pend == append) then data from beginning of
56446 + @stop->node are copied to the end of @target. If (pend == prepend) then
56447 + data from the end of @stop->node are copied to the beginning of
56448 + @target. Copied data are removed from @stop->node. Information
56449 +	   about what to do on the upper level is stored in @info */
56450 + int (*shift) (coord_t * stop, znode * target, shift_direction pend,
56451 + int delete_node, int including_insert_coord,
56452 + carry_plugin_info * info);
56453 +	/* return true if this node allows skipping carry() in some situations
56454 +	   (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
56455 +	   emulation doesn't.
56456 +
56457 +	   This will speed up insertions that don't require updates to the
56458 +	   parent, by bypassing initialisation of carry() structures. It's
56459 +	   believed that the majority of insertions will fit there.
56460 +
56461 +	*/
56462 + int (*fast_insert) (const coord_t * coord);
56463 + int (*fast_paste) (const coord_t * coord);
56464 + int (*fast_cut) (const coord_t * coord);
56465 + /* this limits max size of item which can be inserted into a node and
56466 + number of bytes item in a node may be appended with */
56467 + int (*max_item_size) (void);
56468 + int (*prepare_removal) (znode * empty, carry_plugin_info * info);
56469 +	/* change plugin id of items which are in a node already. Currently it
56470 +	 * is used in tail conversion for regular files */
56471 + int (*set_item_plugin) (coord_t * coord, item_id);
56472 +} node_plugin;
56473 +
56474 +typedef enum {
56475 + /* standard unified node layout used for both leaf and internal
56476 + nodes */
56477 + NODE40_ID,
56478 + LAST_NODE_ID
56479 +} reiser4_node_id;
56480 +
56481 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
56482 +#if REISER4_DEBUG
56483 +extern void print_node_content(const char *prefix, const znode * node,
56484 + __u32 flags);
56485 +#endif
56486 +
56487 +extern void indent_znode(const znode * node);
56488 +
56489 +typedef struct common_node_header {
56490 + /*
56491 + * identifier of node plugin. Must be located at the very beginning of
56492 + * a node.
56493 + */
56494 + __le16 plugin_id;
56495 +} common_node_header;
56496 +
56497 +/* __REISER4_NODE_H__ */
56498 +#endif
56499 +/*
56500 + * Local variables:
56501 + * c-indentation-style: "K&R"
56502 + * mode-name: "LC"
56503 + * c-basic-offset: 8
56504 + * tab-width: 8
56505 + * fill-column: 79
56506 + * scroll-step: 1
56507 + * End:
56508 + */
56509 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/object.c linux-2.6.33/fs/reiser4/plugin/object.c
56510 --- linux-2.6.33.orig/fs/reiser4/plugin/object.c 1970-01-01 01:00:00.000000000 +0100
56511 +++ linux-2.6.33/fs/reiser4/plugin/object.c 2010-03-04 19:33:22.000000000 +0100
56512 @@ -0,0 +1,531 @@
56513 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
56514 + * reiser4/README */
56515 +
56516 +/*
56517 + * Examples of object plugins: file, directory, symlink, special file.
56518 + *
56519 + * Plugins associated with inode:
56520 + *
56521 + * The plugin of an inode is the plugin referenced by the plugin-id field
56522 + * of the on-disk stat-data. How we store this plugin in the in-core inode
56523 + * is not important. Currently pointers are used; another variant is to
56524 + * store offsets and do an array lookup on each access.
56525 + *
56526 + * Now, each inode has one selected plugin: an object plugin that
56527 + * determines what type of file this object is: directory, regular, etc.
56528 + *
56529 + * This main plugin can use other plugins that are thus subordinated to
56530 + * it. Directory instance of object plugin uses hash; regular file
56531 + * instance uses tail policy plugin.
56532 + *
56533 + * Object plugin is either taken from id in stat-data or guessed from
56534 + * i_mode bits. Once it is established we ask it to install its
56535 + * subordinate plugins, by looking again in stat-data or inheriting them
56536 + * from parent.
56537 + *
56538 + * How new inode is initialized during ->read_inode():
56539 + * 1 read stat-data and initialize inode fields: i_size, i_mode,
56540 + * i_generation, capabilities etc.
56541 + * 2 read plugin id from stat data or try to guess plugin id
56542 + * from inode->i_mode bits if plugin id is missing.
56543 + * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
56544 + *
56545 + * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
56546 + * if stat data does contain i_size, etc., due to it being an unusual plugin?
56547 + *
56548 + * 4 Call ->activate() method of object's plugin. Plugin is either read
56549 + *   from stat-data or guessed from mode bits
56550 + * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized
56551 + * plugins from parent.
56552 + *
56553 + * An easy induction proves that after the last step all plugins of the
56554 + * inode are initialized.
56555 + *
56556 + * When creating new object:
56557 + * 1 obtain object plugin id (see next period)
56558 + * NIKITA-FIXME-HANS: period?
56559 + * 2 ->install() this plugin
56560 + * 3 ->inherit() the rest from the parent
56561 + *
56562 + * We need some examples of creating an object with default and non-default
56563 + * plugin ids. Nikita, please create them.
56564 + */
56565 +
56566 +#include "../inode.h"
56567 +
56568 +static int _bugop(void)
56569 +{
56570 + BUG_ON(1);
56571 + return 0;
56572 +}
56573 +
56574 +#define bugop ((void *)_bugop)
56575 +
56576 +static int _dummyop(void)
56577 +{
56578 + return 0;
56579 +}
56580 +
56581 +#define dummyop ((void *)_dummyop)
56582 +
56583 +static int change_file(struct inode *inode,
56584 + reiser4_plugin * plugin,
56585 + pset_member memb)
56586 +{
56587 + /* cannot change object plugin of already existing object */
56588 + if (memb == PSET_FILE)
56589 + return RETERR(-EINVAL);
56590 +
56591 + /* Change PSET_CREATE */
56592 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
56593 +}
56594 +
56595 +static reiser4_plugin_ops file_plugin_ops = {
56596 + .change = change_file
56597 +};
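The .change hook wired up here is what makes the object plugin of an existing file immutable while still allowing the other plugin-set members to change. A hedged sketch of the two outcomes (the wrapper is illustrative; change_file(), PSET_FILE and PSET_CREATE are from this patch):

	static int demo_change(struct inode *inode, reiser4_plugin *plug)
	{
		int err;

		/* rejected: the object plugin of an existing object is fixed */
		err = change_file(inode, plug, PSET_FILE);	/* -EINVAL */
		if (err != -EINVAL)
			return err;
		/* allowed: other members, e.g. the create-plugin member */
		return change_file(inode, plug, PSET_CREATE);
	}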
56598 +
56599 +static struct inode_operations null_i_ops = {.create = NULL};
56600 +static struct file_operations null_f_ops = {.owner = NULL};
56601 +static struct address_space_operations null_a_ops = {.writepage = NULL};
56602 +
56603 +/* VFS methods for regular files */
56604 +static struct inode_operations regular_file_i_ops = {
56605 + .permission = reiser4_permission_common,
56606 + .setattr = reiser4_setattr,
56607 + .getattr = reiser4_getattr_common
56608 +};
56609 +static struct file_operations regular_file_f_ops = {
56610 + .llseek = generic_file_llseek,
56611 + .read = reiser4_read_careful,
56612 + .write = reiser4_write_careful,
56613 + .aio_read = generic_file_aio_read,
56614 + .ioctl = reiser4_ioctl_careful,
56615 + .mmap = reiser4_mmap_careful,
56616 + .open = reiser4_open_careful,
56617 + .release = reiser4_release_careful,
56618 + .fsync = reiser4_sync_file_common,
56619 + .splice_read = generic_file_splice_read,
56620 + .splice_write = generic_file_splice_write
56621 +};
56622 +static struct address_space_operations regular_file_a_ops = {
56623 + .writepage = reiser4_writepage,
56624 + .readpage = reiser4_readpage,
56625 + .sync_page = block_sync_page,
56626 + .writepages = reiser4_writepages,
56627 + .set_page_dirty = reiser4_set_page_dirty,
56628 + .readpages = reiser4_readpages,
56629 + .write_begin = reiser4_write_begin_careful,
56630 + .write_end = reiser4_write_end_careful,
56631 + .bmap = reiser4_bmap_careful,
56632 + .invalidatepage = reiser4_invalidatepage,
56633 + .releasepage = reiser4_releasepage
56634 +};
56635 +
56636 +/* VFS methods for symlink files */
56637 +static struct inode_operations symlink_file_i_ops = {
56638 + .readlink = generic_readlink,
56639 + .follow_link = reiser4_follow_link_common,
56640 + .permission = reiser4_permission_common,
56641 + .setattr = reiser4_setattr_common,
56642 + .getattr = reiser4_getattr_common
56643 +};
56644 +
56645 +/* VFS methods for special files */
56646 +static struct inode_operations special_file_i_ops = {
56647 + .permission = reiser4_permission_common,
56648 + .setattr = reiser4_setattr_common,
56649 + .getattr = reiser4_getattr_common
56650 +};
56651 +
56652 +/* VFS methods for directories */
56653 +static struct inode_operations directory_i_ops = {
56654 + .create = reiser4_create_common,
56655 + .lookup = reiser4_lookup_common,
56656 + .link = reiser4_link_common,
56657 + .unlink = reiser4_unlink_common,
56658 + .symlink = reiser4_symlink_common,
56659 + .mkdir = reiser4_mkdir_common,
56660 + .rmdir = reiser4_unlink_common,
56661 + .mknod = reiser4_mknod_common,
56662 + .rename = reiser4_rename_common,
56663 + .permission = reiser4_permission_common,
56664 + .setattr = reiser4_setattr_common,
56665 + .getattr = reiser4_getattr_common
56666 +};
56667 +static struct file_operations directory_f_ops = {
56668 + .llseek = reiser4_llseek_dir_common,
56669 + .read = generic_read_dir,
56670 + .readdir = reiser4_readdir_common,
56671 + .release = reiser4_release_dir_common,
56672 + .fsync = reiser4_sync_common
56673 +};
56674 +static struct address_space_operations directory_a_ops = {
56675 + .writepage = bugop,
56676 + .sync_page = bugop,
56677 + .writepages = dummyop,
56678 + .set_page_dirty = bugop,
56679 + .readpages = bugop,
56680 + .write_begin = bugop,
56681 + .write_end = bugop,
56682 + .bmap = bugop,
56683 + .invalidatepage = bugop,
56684 + .releasepage = bugop
56685 +};
56686 +
56687 +/*
56688 + * Definitions of object plugins.
56689 + */
56690 +
56691 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
56692 + [UNIX_FILE_PLUGIN_ID] = {
56693 + .h = {
56694 + .type_id = REISER4_FILE_PLUGIN_TYPE,
56695 + .id = UNIX_FILE_PLUGIN_ID,
56696 + .groups = (1 << REISER4_REGULAR_FILE),
56697 + .pops = &file_plugin_ops,
56698 + .label = "reg",
56699 + .desc = "regular file",
56700 + .linkage = {NULL, NULL},
56701 + },
56702 + /*
56703 + * invariant vfs ops
56704 + */
56705 + .inode_ops = &regular_file_i_ops,
56706 + .file_ops = &regular_file_f_ops,
56707 + .as_ops = &regular_file_a_ops,
56708 + /*
56709 + * private i_ops
56710 + */
56711 + .setattr = setattr_unix_file,
56712 + .open = open_unix_file,
56713 + .read = read_unix_file,
56714 + .write = write_unix_file,
56715 + .ioctl = ioctl_unix_file,
56716 + .mmap = mmap_unix_file,
56717 + .release = release_unix_file,
56718 + /*
56719 + * private f_ops
56720 + */
56721 + .readpage = readpage_unix_file,
56722 + .readpages = readpages_unix_file,
56723 + .writepages = writepages_unix_file,
56724 + .write_begin = write_begin_unix_file,
56725 + .write_end = write_end_unix_file,
56726 + /*
56727 + * private a_ops
56728 + */
56729 + .bmap = bmap_unix_file,
56730 + /*
56731 + * other private methods
56732 + */
56733 + .write_sd_by_inode = write_sd_by_inode_common,
56734 + .flow_by_inode = flow_by_inode_unix_file,
56735 + .key_by_inode = key_by_inode_and_offset_common,
56736 + .set_plug_in_inode = set_plug_in_inode_common,
56737 + .adjust_to_parent = adjust_to_parent_common,
56738 + .create_object = reiser4_create_object_common,
56739 + .delete_object = delete_object_unix_file,
56740 + .add_link = reiser4_add_link_common,
56741 + .rem_link = reiser4_rem_link_common,
56742 + .owns_item = owns_item_unix_file,
56743 + .can_add_link = can_add_link_common,
56744 + .detach = dummyop,
56745 + .bind = dummyop,
56746 + .safelink = safelink_common,
56747 + .estimate = {
56748 + .create = estimate_create_common,
56749 + .update = estimate_update_common,
56750 + .unlink = estimate_unlink_common
56751 + },
56752 + .init_inode_data = init_inode_data_unix_file,
56753 + .cut_tree_worker = cut_tree_worker_common,
56754 + .wire = {
56755 + .write = wire_write_common,
56756 + .read = wire_read_common,
56757 + .get = wire_get_common,
56758 + .size = wire_size_common,
56759 + .done = wire_done_common
56760 + }
56761 + },
56762 + [DIRECTORY_FILE_PLUGIN_ID] = {
56763 + .h = {
56764 + .type_id = REISER4_FILE_PLUGIN_TYPE,
56765 + .id = DIRECTORY_FILE_PLUGIN_ID,
56766 + .groups = (1 << REISER4_DIRECTORY_FILE),
56767 + .pops = &file_plugin_ops,
56768 + .label = "dir",
56769 + .desc = "directory",
56770 + .linkage = {NULL, NULL}
56771 + },
56772 + .inode_ops = &null_i_ops,
56773 + .file_ops = &null_f_ops,
56774 + .as_ops = &null_a_ops,
56775 +
56776 + .write_sd_by_inode = write_sd_by_inode_common,
56777 + .flow_by_inode = bugop,
56778 + .key_by_inode = bugop,
56779 + .set_plug_in_inode = set_plug_in_inode_common,
56780 + .adjust_to_parent = adjust_to_parent_common_dir,
56781 + .create_object = reiser4_create_object_common,
56782 + .delete_object = reiser4_delete_dir_common,
56783 + .add_link = reiser4_add_link_common,
56784 + .rem_link = rem_link_common_dir,
56785 + .owns_item = owns_item_common_dir,
56786 + .can_add_link = can_add_link_common,
56787 + .can_rem_link = can_rem_link_common_dir,
56788 + .detach = reiser4_detach_common_dir,
56789 + .bind = reiser4_bind_common_dir,
56790 + .safelink = safelink_common,
56791 + .estimate = {
56792 + .create = estimate_create_common_dir,
56793 + .update = estimate_update_common,
56794 + .unlink = estimate_unlink_common_dir
56795 + },
56796 + .wire = {
56797 + .write = wire_write_common,
56798 + .read = wire_read_common,
56799 + .get = wire_get_common,
56800 + .size = wire_size_common,
56801 + .done = wire_done_common
56802 + },
56803 + .init_inode_data = init_inode_ordering,
56804 + .cut_tree_worker = cut_tree_worker_common,
56805 + },
56806 + [SYMLINK_FILE_PLUGIN_ID] = {
56807 + .h = {
56808 + .type_id = REISER4_FILE_PLUGIN_TYPE,
56809 + .id = SYMLINK_FILE_PLUGIN_ID,
56810 + .groups = (1 << REISER4_SYMLINK_FILE),
56811 + .pops = &file_plugin_ops,
56812 + .label = "symlink",
56813 + .desc = "symbolic link",
56814 + .linkage = {NULL,NULL}
56815 + },
56816 + .inode_ops = &symlink_file_i_ops,
56817 + /* inode->i_fop of symlink is initialized
56818 + by NULL in setup_inode_ops */
56819 + .file_ops = &null_f_ops,
56820 + .as_ops = &null_a_ops,
56821 +
56822 + .write_sd_by_inode = write_sd_by_inode_common,
56823 + .set_plug_in_inode = set_plug_in_inode_common,
56824 + .adjust_to_parent = adjust_to_parent_common,
56825 + .create_object = reiser4_create_symlink,
56826 + .delete_object = reiser4_delete_object_common,
56827 + .add_link = reiser4_add_link_common,
56828 + .rem_link = reiser4_rem_link_common,
56829 + .can_add_link = can_add_link_common,
56830 + .detach = dummyop,
56831 + .bind = dummyop,
56832 + .safelink = safelink_common,
56833 + .estimate = {
56834 + .create = estimate_create_common,
56835 + .update = estimate_update_common,
56836 + .unlink = estimate_unlink_common
56837 + },
56838 + .init_inode_data = init_inode_ordering,
56839 + .cut_tree_worker = cut_tree_worker_common,
56840 + .destroy_inode = destroy_inode_symlink,
56841 + .wire = {
56842 + .write = wire_write_common,
56843 + .read = wire_read_common,
56844 + .get = wire_get_common,
56845 + .size = wire_size_common,
56846 + .done = wire_done_common
56847 + }
56848 + },
56849 + [SPECIAL_FILE_PLUGIN_ID] = {
56850 + .h = {
56851 + .type_id = REISER4_FILE_PLUGIN_TYPE,
56852 + .id = SPECIAL_FILE_PLUGIN_ID,
56853 + .groups = (1 << REISER4_SPECIAL_FILE),
56854 + .pops = &file_plugin_ops,
56855 + .label = "special",
56856 + .desc =
56857 + "special: fifo, device or socket",
56858 + .linkage = {NULL, NULL}
56859 + },
56860 + .inode_ops = &special_file_i_ops,
56861 + /* file_ops of special files (sockets, block, char, fifo) are
56862 + initialized by init_special_inode. */
56863 + .file_ops = &null_f_ops,
56864 + .as_ops = &null_a_ops,
56865 +
56866 + .write_sd_by_inode = write_sd_by_inode_common,
56867 + .set_plug_in_inode = set_plug_in_inode_common,
56868 + .adjust_to_parent = adjust_to_parent_common,
56869 + .create_object = reiser4_create_object_common,
56870 + .delete_object = reiser4_delete_object_common,
56871 + .add_link = reiser4_add_link_common,
56872 + .rem_link = reiser4_rem_link_common,
56873 + .owns_item = owns_item_common,
56874 + .can_add_link = can_add_link_common,
56875 + .detach = dummyop,
56876 + .bind = dummyop,
56877 + .safelink = safelink_common,
56878 + .estimate = {
56879 + .create = estimate_create_common,
56880 + .update = estimate_update_common,
56881 + .unlink = estimate_unlink_common
56882 + },
56883 + .init_inode_data = init_inode_ordering,
56884 + .cut_tree_worker = cut_tree_worker_common,
56885 + .wire = {
56886 + .write = wire_write_common,
56887 + .read = wire_read_common,
56888 + .get = wire_get_common,
56889 + .size = wire_size_common,
56890 + .done = wire_done_common
56891 + }
56892 + },
56893 + [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
56894 + .h = {
56895 + .type_id = REISER4_FILE_PLUGIN_TYPE,
56896 + .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
56897 + .groups = (1 << REISER4_REGULAR_FILE),
56898 + .pops = &file_plugin_ops,
56899 + .label = "cryptcompress",
56900 + .desc = "cryptcompress file",
56901 + .linkage = {NULL, NULL}
56902 + },
56903 + .inode_ops = &regular_file_i_ops,
56904 + .file_ops = &regular_file_f_ops,
56905 + .as_ops = &regular_file_a_ops,
56906 +
56907 + .setattr = setattr_cryptcompress,
56908 + .open = open_cryptcompress,
56909 + .read = read_cryptcompress,
56910 + .write = write_cryptcompress,
56911 + .ioctl = ioctl_cryptcompress,
56912 + .mmap = mmap_cryptcompress,
56913 + .release = release_cryptcompress,
56914 +
56915 + .readpage = readpage_cryptcompress,
56916 + .readpages = readpages_cryptcompress,
56917 + .writepages = writepages_cryptcompress,
56918 + .write_begin = write_begin_cryptcompress,
56919 + .write_end = write_end_cryptcompress,
56920 +
56921 + .bmap = bmap_cryptcompress,
56922 +
56923 + .write_sd_by_inode = write_sd_by_inode_common,
56924 + .flow_by_inode = flow_by_inode_cryptcompress,
56925 + .key_by_inode = key_by_inode_cryptcompress,
56926 + .set_plug_in_inode = set_plug_in_inode_common,
56927 + .adjust_to_parent = adjust_to_parent_cryptcompress,
56928 + .create_object = create_object_cryptcompress,
56929 + .delete_object = delete_object_cryptcompress,
56930 + .add_link = reiser4_add_link_common,
56931 + .rem_link = reiser4_rem_link_common,
56932 + .owns_item = owns_item_common,
56933 + .can_add_link = can_add_link_common,
56934 + .detach = dummyop,
56935 + .bind = dummyop,
56936 + .safelink = safelink_common,
56937 + .estimate = {
56938 + .create = estimate_create_common,
56939 + .update = estimate_update_common,
56940 + .unlink = estimate_unlink_common
56941 + },
56942 + .init_inode_data = init_inode_data_cryptcompress,
56943 + .cut_tree_worker = cut_tree_worker_cryptcompress,
56944 + .destroy_inode = destroy_inode_cryptcompress,
56945 + .wire = {
56946 + .write = wire_write_common,
56947 + .read = wire_read_common,
56948 + .get = wire_get_common,
56949 + .size = wire_size_common,
56950 + .done = wire_done_common
56951 + }
56952 + }
56953 +};
56954 +
56955 +static int change_dir(struct inode *inode,
56956 + reiser4_plugin * plugin,
56957 + pset_member memb)
56958 +{
56959 + /* cannot change dir plugin of already existing object */
56960 + return RETERR(-EINVAL);
56961 +}
56962 +
56963 +static reiser4_plugin_ops dir_plugin_ops = {
56964 + .change = change_dir
56965 +};
56966 +
56967 +/*
56968 + * definition of directory plugins
56969 + */
56970 +
56971 +dir_plugin dir_plugins[LAST_DIR_ID] = {
56972 + /* standard hashed directory plugin */
56973 + [HASHED_DIR_PLUGIN_ID] = {
56974 + .h = {
56975 + .type_id = REISER4_DIR_PLUGIN_TYPE,
56976 + .id = HASHED_DIR_PLUGIN_ID,
56977 + .pops = &dir_plugin_ops,
56978 + .label = "dir",
56979 + .desc = "hashed directory",
56980 + .linkage = {NULL, NULL}
56981 + },
56982 + .inode_ops = &directory_i_ops,
56983 + .file_ops = &directory_f_ops,
56984 + .as_ops = &directory_a_ops,
56985 +
56986 + .get_parent = get_parent_common,
56987 + .is_name_acceptable = is_name_acceptable_common,
56988 + .build_entry_key = build_entry_key_hashed,
56989 + .build_readdir_key = build_readdir_key_common,
56990 + .add_entry = reiser4_add_entry_common,
56991 + .rem_entry = reiser4_rem_entry_common,
56992 + .init = reiser4_dir_init_common,
56993 + .done = reiser4_dir_done_common,
56994 + .attach = reiser4_attach_common,
56995 + .detach = reiser4_detach_common,
56996 + .estimate = {
56997 + .add_entry = estimate_add_entry_common,
56998 + .rem_entry = estimate_rem_entry_common,
56999 + .unlink = dir_estimate_unlink_common
57000 + }
57001 + },
57002 + /* hashed directory for which seekdir/telldir are guaranteed to
57003 + * work. Brain-damage. */
57004 + [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
57005 + .h = {
57006 + .type_id = REISER4_DIR_PLUGIN_TYPE,
57007 + .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
57008 + .pops = &dir_plugin_ops,
57009 + .label = "dir32",
57010 + .desc = "directory hashed with 31 bit hash",
57011 + .linkage = {NULL, NULL}
57012 + },
57013 + .inode_ops = &directory_i_ops,
57014 + .file_ops = &directory_f_ops,
57015 + .as_ops = &directory_a_ops,
57016 +
57017 + .get_parent = get_parent_common,
57018 + .is_name_acceptable = is_name_acceptable_common,
57019 + .build_entry_key = build_entry_key_seekable,
57020 + .build_readdir_key = build_readdir_key_common,
57021 + .add_entry = reiser4_add_entry_common,
57022 + .rem_entry = reiser4_rem_entry_common,
57023 + .init = reiser4_dir_init_common,
57024 + .done = reiser4_dir_done_common,
57025 + .attach = reiser4_attach_common,
57026 + .detach = reiser4_detach_common,
57027 + .estimate = {
57028 + .add_entry = estimate_add_entry_common,
57029 + .rem_entry = estimate_rem_entry_common,
57030 + .unlink = dir_estimate_unlink_common
57031 + }
57032 + }
57033 +};
57034 +
57035 +/* Make Linus happy.
57036 + Local variables:
57037 + c-indentation-style: "K&R"
57038 + mode-name: "LC"
57039 + c-basic-offset: 8
57040 + tab-width: 8
57041 + fill-column: 120
57042 + End:
57043 +*/
57044 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/object.h linux-2.6.33/fs/reiser4/plugin/object.h
57045 --- linux-2.6.33.orig/fs/reiser4/plugin/object.h 1970-01-01 01:00:00.000000000 +0100
57046 +++ linux-2.6.33/fs/reiser4/plugin/object.h 2010-03-04 19:33:22.000000000 +0100
57047 @@ -0,0 +1,117 @@
57048 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
57049 + * reiser4/README */
57050 +
57051 +/* Declaration of object plugin functions. */
57052 +
57053 +#if !defined(__FS_REISER4_PLUGIN_OBJECT_H__)
57054 +#define __FS_REISER4_PLUGIN_OBJECT_H__
57055 +
57056 +#include "../type_safe_hash.h"
57057 +
57058 +/* common implementations of inode operations */
57059 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
57060 + int mode, struct nameidata *);
57061 +struct dentry *reiser4_lookup_common(struct inode *parent,
57062 + struct dentry *dentry,
57063 + struct nameidata *nameidata);
57064 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
57065 + struct dentry *newname);
57066 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
57067 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
57068 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
57069 + const char *linkname);
57070 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
57071 + int mode, dev_t rdev);
57072 +int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
57073 + struct inode *new_dir, struct dentry *new_name);
57074 +void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
57075 +int reiser4_permission_common(struct inode *, int mask);
57076 +int reiser4_setattr_common(struct dentry *, struct iattr *);
57077 +int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
57078 + struct kstat *);
57079 +
57080 +/* common implementations of file operations */
57081 +loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
57082 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
57083 +int reiser4_release_dir_common(struct inode *, struct file *);
57084 +int reiser4_sync_common(struct file *, struct dentry *, int datasync);
57085 +
57086 +
57087 +/* file plugin operations: common implementations */
57088 +int write_sd_by_inode_common(struct inode *);
57089 +int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
57090 +int set_plug_in_inode_common(struct inode *object, struct inode *parent,
57091 + reiser4_object_create_data *);
57092 +int adjust_to_parent_common(struct inode *object, struct inode *parent,
57093 + struct inode *root);
57094 +int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
57095 + struct inode *root);
57096 +int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
57097 + struct inode *root);
57098 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
57099 + reiser4_object_create_data *);
57100 +int reiser4_delete_object_common(struct inode *);
57101 +int reiser4_delete_dir_common(struct inode *);
57102 +int reiser4_add_link_common(struct inode *object, struct inode *parent);
57103 +int reiser4_rem_link_common(struct inode *object, struct inode *parent);
57104 +int rem_link_common_dir(struct inode *object, struct inode *parent);
57105 +int owns_item_common(const struct inode *, const coord_t *);
57106 +int owns_item_common_dir(const struct inode *, const coord_t *);
57107 +int can_add_link_common(const struct inode *);
57108 +int can_rem_link_common_dir(const struct inode *);
57109 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
57110 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
57111 +int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
57112 +reiser4_block_nr estimate_create_common(const struct inode *);
57113 +reiser4_block_nr estimate_create_common_dir(const struct inode *);
57114 +reiser4_block_nr estimate_update_common(const struct inode *);
57115 +reiser4_block_nr estimate_unlink_common(const struct inode *,
57116 + const struct inode *);
57117 +reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
57118 + const struct inode *);
57119 +char *wire_write_common(struct inode *, char *start);
57120 +char *wire_read_common(char *addr, reiser4_object_on_wire *);
57121 +struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
57122 +int wire_size_common(struct inode *);
57123 +void wire_done_common(reiser4_object_on_wire *);
57124 +
57125 +/* dir plugin operations: common implementations */
57126 +struct dentry *get_parent_common(struct inode *child);
57127 +int is_name_acceptable_common(const struct inode *, const char *name, int len);
57128 +void build_entry_key_common(const struct inode *,
57129 + const struct qstr *qname, reiser4_key *);
57130 +int build_readdir_key_common(struct file *dir, reiser4_key *);
57131 +int reiser4_add_entry_common(struct inode *object, struct dentry *where,
57132 + reiser4_object_create_data * , reiser4_dir_entry_desc *);
57133 +int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
57134 + reiser4_dir_entry_desc *);
57135 +int reiser4_dir_init_common(struct inode *object, struct inode *parent,
57136 + reiser4_object_create_data *);
57137 +int reiser4_dir_done_common(struct inode *);
57138 +int reiser4_attach_common(struct inode *child, struct inode *parent);
57139 +int reiser4_detach_common(struct inode *object, struct inode *parent);
57140 +reiser4_block_nr estimate_add_entry_common(const struct inode *);
57141 +reiser4_block_nr estimate_rem_entry_common(const struct inode *);
57142 +reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
57143 + const struct inode *);
57144 +
57145 +/* these are essential parts of the common implementations; they exist to
57146 +   make customized implementations easier */
57147 +int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
57148 +
57149 +/* merely useful functions */
57150 +int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle * ,
57151 + const reiser4_key * , int silent);
57152 +
57153 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
57154 +#endif
57155 +
57156 +/* Make Linus happy.
57157 + Local variables:
57158 + c-indentation-style: "K&R"
57159 + mode-name: "LC"
57160 + c-basic-offset: 8
57161 + tab-width: 8
57162 + fill-column: 120
57163 + End:
57164 +*/
57165 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/plugin.c linux-2.6.33/fs/reiser4/plugin/plugin.c
57166 --- linux-2.6.33.orig/fs/reiser4/plugin/plugin.c 1970-01-01 01:00:00.000000000 +0100
57167 +++ linux-2.6.33/fs/reiser4/plugin/plugin.c 2010-03-04 19:33:22.000000000 +0100
57168 @@ -0,0 +1,560 @@
57169 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
57170 + * reiser4/README */
57171 +
57172 +/* Basic plugin infrastructure, lookup etc. */
57173 +
57174 +/* PLUGINS:
57175 +
57176 + Plugins are internal Reiser4 "modules" or "objects" used to increase
57177 + extensibility and allow external users to easily adapt reiser4 to
57178 + their needs.
57179 +
57180 + Plugins are classified into several disjoint "types". Plugins
57181 + belonging to a particular plugin type are termed "instances" of
57182 + that type. The existing types are listed by enum reiser4_plugin_type
57183 + (see plugin/plugin_header.h).
57184 +
57185 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
57186 +
57187 + An object (file) plugin determines how a given file-system object
57188 + serves standard VFS requests for read, write, seek, mmap etc.
57189 + Instances of file plugins are: regular file, directory, symlink.
57190 + Another example of a file plugin is the audit plugin, which optionally
57191 + records accesses to the underlying object and forwards requests to it.
57192 +
57193 + Hash plugins compute hashes used by reiser4 to store and locate
57194 + files within directories. Instances of hash plugin type are: r5,
57195 + tea, rupasov.
57196 +
57197 + Tail plugins (or, more precisely, tail policy plugins) determine
57198 + when the last part of a file should be stored in a formatted item.
57199 +
57200 + Scope and lookup:
57201 +
57202 + Each plugin type and each plugin has a label such that the pair
57203 + (type_label, plugin_label) is unique. This pair is a globally
57204 + persistent and user-visible plugin identifier. Internally the kernel
57205 + maintains plugins and plugin types in arrays, using an index into
57206 + those arrays as plugin and plugin type identifiers. The file system,
57207 + in turn, also maintains a persistent "dictionary" mapping plugin
57208 + labels to the numerical identifiers stored in file-system objects.
57209 + That is, we store the offset into the plugin array for that plugin
57210 + type as the plugin id in the stat data of the file-system object.
57211 +
57212 + Internal kernel plugin type identifier (index in plugins[] array) is
57213 + of type reiser4_plugin_type. Set of available plugin types is
57214 + currently static, but dynamic loading doesn't seem to pose
57215 + insurmountable problems.
57216 +
57217 + Within each type plugins are addressed by the identifiers of type
57218 + reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]).
57219 + Such identifiers are only required to be unique within one type,
57220 + not globally.
57221 +
57222 + Thus, plugin in memory is uniquely identified by the pair (type_id,
57223 + id).
57224 +
57225 + Usage:
57226 +
57227 + There exists only one instance of each plugin, but this single
57228 + instance can be associated with many entities (file-system objects,
57229 + items, nodes, transactions, file descriptors etc.). The entity to
57230 + which a plugin of a given type is attached is termed (due to the lack
57231 + of imagination) the "subject" of this plugin type and, by abuse of
57232 + terminology, the subject of the particular instance of this type to
57233 + which it is currently attached. For example, an inode is a subject of
57234 + the object plugin type. An inode representing a directory is a subject
57235 + of the directory plugin, of the hash plugin type and of some particular
57236 + instance of hash plugin. An inode representing a regular file is a
57237 + subject of the "regular file" plugin, tail-policy plugin type etc.
57238 +
57239 + With each subject the plugin possibly stores some state. For example,
57240 + the state of a directory plugin (an instance of the object plugin type)
57241 + is a pointer to a hash plugin (assuming directories always use hashing).
57242 +
57243 + Interface:
57244 +
57245 + In addition to a scalar identifier, each plugin type and each plugin
57246 + proper has a "label" (a short string) and a "description" (a longer
57247 + descriptive string). Labels and descriptions of plugin types are
57248 + hard-coded into the plugins[] array, declared and defined in
57249 + plugin.c. The label and description of a plugin are stored in the
57250 + .label and .desc fields of reiser4_plugin_header respectively. It is
57251 + possible to locate a plugin by its pair of labels.
57252 +
57253 + Features (not implemented):
57254 +
57255 + . user-level plugin manipulations:
57256 + + reiser4("filename/..file_plugin<='audit'");
57257 + + write(open("filename/..file_plugin"), "audit", 8);
57258 +
57259 + . user level utilities lsplug and chplug to manipulate plugins.
57260 + These utilities are not of primary priority; possibly they will
57261 + not work on v4.0.
57262 +
57263 + NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount
57264 + option, do you agree? I don't think that specifying it at mount time,
57265 + and then changing it with each mount, is a good model for usage.
57266 +
57267 + . mount option "plug" to set-up plugins of root-directory.
57268 + "plug=foo:bar" will set "bar" as default plugin of type "foo".
57269 +
57270 + Limitations:
57271 +
57272 + . each plugin type has to provide at least one builtin
57273 + plugin. This is technical limitation and it can be lifted in the
57274 + future.
57275 +
57276 + TODO:
57277 +
57278 + New plugin types/plugins:
57279 + Things we should be able to separately choose to inherit:
57280 +
57281 + security plugins
57282 +
57283 + stat data
57284 +
57285 + file bodies
57286 +
57287 + file plugins
57288 +
57289 + dir plugins
57290 +
57291 + . perm:acl
57292 +
57293 + . audi---audit plugin intercepting and possibly logging all
57294 + accesses to an object. Requires putting stub functions in
57295 + file_operations instead of generic_file_*.
57296 +
57297 +NIKITA-FIXME-HANS: why make overflows a plugin?
57298 + . over---handle hash overflows
57299 +
57300 + . sqnt---handle different access patterns and instrument read-ahead
57301 +
57302 +NIKITA-FIXME-HANS: describe the line below in more detail.
57303 +
57304 + . hier---handle inheritance of plugins along file-system hierarchy
57305 +
57306 + Different kinds of inheritance: on creation vs. on access.
57307 + Compatible/incompatible plugins.
57308 + Inheritance for multi-linked files.
57309 + Layered plugins.
57310 + Notion of plugin context is abandoned.
57311 +
57312 +Each file is associated
57313 + with one plugin, and dependent plugins (hash, etc.) are stored as part
57314 + of the main plugin's state. Now, if we have plugins used for regular
57315 + files but not for directories, how would such plugins be inherited?
57316 + . always store them with directories also
57317 +
57318 +NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing
57319 +the line below which is also useful.
57320 +
57321 + . use inheritance hierarchy, independent of file-system namespace
57322 +*/
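+
+/* Example (an illustrative sketch, not a statement of the final API): the
+ * in-memory pair (type_id, id) described above resolves to a plugin via
+ * plugin_by_unsafe_id(), defined later in this file. Since plugin ids are
+ * array indices, the call reduces to two bounds checks plus an array access:
+ *
+ *	reiser4_plugin *plug;
+ *
+ *	plug = plugin_by_unsafe_id(REISER4_HASH_PLUGIN_TYPE, R5_HASH_ID);
+ *	if (plug != NULL)
+ *		printk("found plugin: %s\n", plug->h.label);
+ *
+ * R5_HASH_ID comes from the reiser4_hash_id enum in plugin.h; h.label is the
+ * short label kept in the plugin header.
+ */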
57323 +
57324 +#include "../debug.h"
57325 +#include "../dformat.h"
57326 +#include "plugin_header.h"
57327 +#include "item/static_stat.h"
57328 +#include "node/node.h"
57329 +#include "security/perm.h"
57330 +#include "space/space_allocator.h"
57331 +#include "disk_format/disk_format.h"
57332 +#include "plugin.h"
57333 +#include "../reiser4.h"
57334 +#include "../jnode.h"
57335 +#include "../inode.h"
57336 +
57337 +#include <linux/fs.h> /* for struct super_block */
57338 +
57339 +/*
57340 + * init_plugins - initialize plugin sub-system.
57341 + * Just call this once on reiser4 startup.
57342 + *
57343 + * Initializes the plugin sub-system as part of reiser4 module
57344 + * initialization. For each plugin of each type the init method is called,
57345 + * and each plugin is put into the list of plugins of its type.
57346 + */
57347 +int init_plugins(void)
57348 +{
57349 + reiser4_plugin_type type_id;
57350 +
57351 + for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
57352 + struct reiser4_plugin_type_data *ptype;
57353 + int i;
57354 +
57355 + ptype = &plugins[type_id];
57356 + assert("nikita-3508", ptype->label != NULL);
57357 + assert("nikita-3509", ptype->type_id == type_id);
57358 +
57359 + INIT_LIST_HEAD(&ptype->plugins_list);
57360 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term
57361 + * builtin. */
57362 + for (i = 0; i < ptype->builtin_num; ++i) {
57363 + reiser4_plugin *plugin;
57364 +
57365 + plugin = plugin_at(ptype, i);
57366 +
57367 + if (plugin->h.label == NULL)
57368 + /* uninitialized slot encountered */
57369 + continue;
57370 + assert("nikita-3445", plugin->h.type_id == type_id);
57371 + plugin->h.id = i;
57372 + if (plugin->h.pops != NULL &&
57373 + plugin->h.pops->init != NULL) {
57374 + int result;
57375 +
57376 + result = plugin->h.pops->init(plugin);
57377 + if (result != 0)
57378 + return result;
57379 + }
57380 + INIT_LIST_HEAD(&plugin->h.linkage);
57381 + list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
57382 + }
57383 + }
57384 + return 0;
57385 +}
57386 +
57387 +/* true if plugin type id is valid */
57388 +int is_plugin_type_valid(reiser4_plugin_type type)
57389 +{
57390 + /* "type" is unsigned, so no comparison with 0 is
57391 + necessary */
57392 + return (type < REISER4_PLUGIN_TYPES);
57393 +}
57394 +
57395 +/* true if plugin id is valid */
57396 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
57397 +{
57398 + assert("nikita-1653", is_plugin_type_valid(type));
57399 + return id < plugins[type].builtin_num;
57400 +}
57401 +
57402 +/* return plugin by its @type and @id.
57403 +
57404 + Both arguments are checked for validity: this is supposed to be called
57405 + from user-level.
57406 +
57407 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
57408 +user space, and passed to the filesystem by use of method files? Your
57409 +comment really confused me on the first reading....
57410 +
57411 +*/
57412 +reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
57413 + * unchecked */,
57414 + reiser4_plugin_id id /* plugin id,
57415 + * unchecked */)
57416 +{
57417 + if (is_plugin_type_valid(type)) {
57418 + if (is_plugin_id_valid(type, id))
57419 + return plugin_at(&plugins[type], id);
57420 + else
57421 + /* id out of bounds */
57422 + warning("nikita-2913",
57423 + "Invalid plugin id: [%i:%i]", type, id);
57424 + } else
57425 + /* type_id out of bounds */
57426 + warning("nikita-2914", "Invalid type_id: %i", type);
57427 + return NULL;
57428 +}
57429 +
57430 +/**
57431 + * save_plugin_id - store plugin id in disk format
57432 + * @plugin: plugin to convert
57433 + * @area: where to store result
57434 + *
57435 + * Puts id of @plugin in little endian format to address @area.
57436 + */
57437 +int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
57438 + d16 * area/* where to store result */)
57439 +{
57440 + assert("nikita-1261", plugin != NULL);
57441 + assert("nikita-1262", area != NULL);
57442 +
57443 + put_unaligned(cpu_to_le16(plugin->h.id), area);
57444 + return 0;
57445 +}
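+
+/* Illustration: reading the id back from disk format is the mirror of
+ * save_plugin_id() above; a sketch (assuming d16 is a little-endian
+ * 16-bit on-disk type):
+ *
+ *	reiser4_plugin_id id = le16_to_cpu(get_unaligned(area));
+ *
+ * combined with plugin_by_unsafe_id() this suffices to resolve a plugin
+ * recorded in stat-data.
+ */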
57446 +
57447 +/* list of all plugins of given type */
57448 +struct list_head *get_plugin_list(reiser4_plugin_type type)
57449 +{
57450 + assert("nikita-1056", is_plugin_type_valid(type));
57451 + return &plugins[type].plugins_list;
57452 +}
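+
+/* Illustration: init_plugins() links every initialized plugin into its
+ * type's plugins_list through plugin->h.linkage, so the list can be walked
+ * with the usual list helpers (sketch only):
+ *
+ *	struct list_head *pos;
+ *
+ *	list_for_each(pos, get_plugin_list(REISER4_HASH_PLUGIN_TYPE)) {
+ *		plugin_header *h;
+ *
+ *		h = list_entry(pos, plugin_header, linkage);
+ *		printk("hash plugin: %s\n", h->label);
+ *	}
+ */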
57453 +
57454 +static void update_pset_mask(reiser4_inode * info, pset_member memb)
57455 +{
57456 + struct dentry *rootdir;
57457 + reiser4_inode *root;
57458 +
57459 + assert("edward-1443", memb != PSET_FILE);
57460 +
57461 + rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
57462 + if (rootdir != NULL) {
57463 + root = reiser4_inode_data(rootdir->d_inode);
57464 + /*
57465 + * set the member's bit in plugin_mask if the plugin differs from
57466 + * the root's default one, or if we are changing a plugin of the
57467 + * root directory itself; otherwise clear the bit
57467 + */
57468 + if (aset_get(info->pset, memb) !=
57469 + aset_get(root->pset, memb) ||
57470 + info == root)
57471 + info->plugin_mask |= (1 << memb);
57472 + else
57473 + info->plugin_mask &= ~(1 << memb);
57474 + }
57475 +}
57476 +
57477 +/* Get specified plugin set member from parent,
57478 + or from fs-defaults (if no parent is given) and
57479 + install the result into the pset of @self */
57480 +int grab_plugin_pset(struct inode *self,
57481 + struct inode *ancestor,
57482 + pset_member memb)
57483 +{
57484 + reiser4_plugin *plug;
57485 + reiser4_inode *info;
57486 + int result = 0;
57487 +
57488 + /* Do not grab if initialised already. */
57489 + info = reiser4_inode_data(self);
57490 + if (aset_get(info->pset, memb) != NULL)
57491 + return 0;
57492 + if (ancestor) {
57493 + reiser4_inode *parent;
57494 +
57495 + parent = reiser4_inode_data(ancestor);
57496 + plug = aset_get(parent->hset, memb) ? :
57497 + aset_get(parent->pset, memb);
57498 + } else
57499 + plug = get_default_plugin(memb);
57500 +
57501 + result = set_plugin(&info->pset, memb, plug);
57502 + if (result == 0) {
57503 + if (!ancestor || self->i_sb->s_root->d_inode != self)
57504 + update_pset_mask(info, memb);
57505 + }
57506 + return result;
57507 +}
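+
+/* Illustration: a typical caller initializes a new object's plugin set
+ * member by inheriting from the parent directory (sketch):
+ *
+ *	result = grab_plugin_pset(object, parent, PSET_HASH);
+ *
+ * With a NULL ancestor the member falls back to the file-system default
+ * returned by get_default_plugin().
+ */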
57508 +
57509 +/* Take missing pset members from root inode */
57510 +int finish_pset(struct inode *inode)
57511 +{
57512 + reiser4_plugin *plug;
57513 + reiser4_inode *root;
57514 + reiser4_inode *info;
57515 + pset_member memb;
57516 + int result = 0;
57517 +
57518 + root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
57519 + info = reiser4_inode_data(inode);
57520 +
57521 + assert("edward-1455", root != NULL);
57522 + assert("edward-1456", info != NULL);
57523 +
57524 + /* file and directory plugins are already initialized. */
57525 + for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
57526 +
57527 + /* Do not grab if initialised already. */
57528 + if (aset_get(info->pset, memb) != NULL)
57529 + continue;
57530 +
57531 + plug = aset_get(root->pset, memb);
57532 + result = set_plugin(&info->pset, memb, plug);
57533 + if (result != 0)
57534 + break;
57535 + }
57536 + if (result != 0) {
57537 + warning("nikita-3447",
57538 + "Cannot set up plugins for %lli",
57539 + (unsigned long long)
57540 + get_inode_oid(inode));
57541 + }
57542 + return result;
57543 +}
57544 +
57545 +int force_plugin_pset(struct inode *self, pset_member memb,
57546 + reiser4_plugin * plug)
57547 +{
57548 + reiser4_inode *info;
57549 + int result = 0;
57550 +
57551 + if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
57552 + /* Changing pset in the root object. */
57553 + return RETERR(-EINVAL);
57554 + }
57555 +
57556 + info = reiser4_inode_data(self);
57557 + if (plug->h.pops != NULL && plug->h.pops->change != NULL)
57558 + result = plug->h.pops->change(self, plug, memb);
57559 + else
57560 + result = aset_set_unsafe(&info->pset, memb, plug);
57561 + if (result == 0) {
57562 + __u16 oldmask = info->plugin_mask;
57563 +
57564 + update_pset_mask(info, memb);
57565 + if (oldmask != info->plugin_mask)
57566 + reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
57567 + }
57568 + return result;
57569 +}
57570 +
57571 +struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
57572 + /* C99 designated initializers */
57573 + [REISER4_FILE_PLUGIN_TYPE] = {
57574 + .type_id = REISER4_FILE_PLUGIN_TYPE,
57575 + .label = "file",
57576 + .desc = "Object plugins",
57577 + .builtin_num = sizeof_array(file_plugins),
57578 + .builtin = file_plugins,
57579 + .plugins_list = {NULL, NULL},
57580 + .size = sizeof(file_plugin)
57581 + },
57582 + [REISER4_DIR_PLUGIN_TYPE] = {
57583 + .type_id = REISER4_DIR_PLUGIN_TYPE,
57584 + .label = "dir",
57585 + .desc = "Directory plugins",
57586 + .builtin_num = sizeof_array(dir_plugins),
57587 + .builtin = dir_plugins,
57588 + .plugins_list = {NULL, NULL},
57589 + .size = sizeof(dir_plugin)
57590 + },
57591 + [REISER4_HASH_PLUGIN_TYPE] = {
57592 + .type_id = REISER4_HASH_PLUGIN_TYPE,
57593 + .label = "hash",
57594 + .desc = "Directory hashes",
57595 + .builtin_num = sizeof_array(hash_plugins),
57596 + .builtin = hash_plugins,
57597 + .plugins_list = {NULL, NULL},
57598 + .size = sizeof(hash_plugin)
57599 + },
57600 + [REISER4_FIBRATION_PLUGIN_TYPE] = {
57601 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
57603 + .label = "fibration",
57604 + .desc = "Directory fibrations",
57605 + .builtin_num = sizeof_array(fibration_plugins),
57606 + .builtin = fibration_plugins,
57607 + .plugins_list = {NULL, NULL},
57608 + .size = sizeof(fibration_plugin)
57609 + },
57610 + [REISER4_CIPHER_PLUGIN_TYPE] = {
57611 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
57612 + .label = "cipher",
57613 + .desc = "Cipher plugins",
57614 + .builtin_num = sizeof_array(cipher_plugins),
57615 + .builtin = cipher_plugins,
57616 + .plugins_list = {NULL, NULL},
57617 + .size = sizeof(cipher_plugin)
57618 + },
57619 + [REISER4_DIGEST_PLUGIN_TYPE] = {
57620 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
57621 + .label = "digest",
57622 + .desc = "Digest plugins",
57623 + .builtin_num = sizeof_array(digest_plugins),
57624 + .builtin = digest_plugins,
57625 + .plugins_list = {NULL, NULL},
57626 + .size = sizeof(digest_plugin)
57627 + },
57628 + [REISER4_COMPRESSION_PLUGIN_TYPE] = {
57629 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
57630 + .label = "compression",
57631 + .desc = "Compression plugins",
57632 + .builtin_num = sizeof_array(compression_plugins),
57633 + .builtin = compression_plugins,
57634 + .plugins_list = {NULL, NULL},
57635 + .size = sizeof(compression_plugin)
57636 + },
57637 + [REISER4_FORMATTING_PLUGIN_TYPE] = {
57638 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
57639 + .label = "formatting",
57640 + .desc = "Tail inlining policies",
57641 + .builtin_num = sizeof_array(formatting_plugins),
57642 + .builtin = formatting_plugins,
57643 + .plugins_list = {NULL, NULL},
57644 + .size = sizeof(formatting_plugin)
57645 + },
57646 + [REISER4_PERM_PLUGIN_TYPE] = {
57647 + .type_id = REISER4_PERM_PLUGIN_TYPE,
57648 + .label = "perm",
57649 + .desc = "Permission checks",
57650 + .builtin_num = sizeof_array(perm_plugins),
57651 + .builtin = perm_plugins,
57652 + .plugins_list = {NULL, NULL},
57653 + .size = sizeof(perm_plugin)
57654 + },
57655 + [REISER4_ITEM_PLUGIN_TYPE] = {
57656 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
57657 + .label = "item",
57658 + .desc = "Item handlers",
57659 + .builtin_num = sizeof_array(item_plugins),
57660 + .builtin = item_plugins,
57661 + .plugins_list = {NULL, NULL},
57662 + .size = sizeof(item_plugin)
57663 + },
57664 + [REISER4_NODE_PLUGIN_TYPE] = {
57665 + .type_id = REISER4_NODE_PLUGIN_TYPE,
57666 + .label = "node",
57667 + .desc = "node layout handlers",
57668 + .builtin_num = sizeof_array(node_plugins),
57669 + .builtin = node_plugins,
57670 + .plugins_list = {NULL, NULL},
57671 + .size = sizeof(node_plugin)
57672 + },
57673 + [REISER4_SD_EXT_PLUGIN_TYPE] = {
57674 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
57675 + .label = "sd_ext",
57676 + .desc = "Parts of stat-data",
57677 + .builtin_num = sizeof_array(sd_ext_plugins),
57678 + .builtin = sd_ext_plugins,
57679 + .plugins_list = {NULL, NULL},
57680 + .size = sizeof(sd_ext_plugin)
57681 + },
57682 + [REISER4_FORMAT_PLUGIN_TYPE] = {
57683 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
57684 + .label = "disk_layout",
57685 + .desc = "defines filesystem on disk layout",
57686 + .builtin_num = sizeof_array(format_plugins),
57687 + .builtin = format_plugins,
57688 + .plugins_list = {NULL, NULL},
57689 + .size = sizeof(disk_format_plugin)
57690 + },
57691 + [REISER4_JNODE_PLUGIN_TYPE] = {
57692 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
57693 + .label = "jnode",
57694 + .desc = "defines kind of jnode",
57695 + .builtin_num = sizeof_array(jnode_plugins),
57696 + .builtin = jnode_plugins,
57697 + .plugins_list = {NULL, NULL},
57698 + .size = sizeof(jnode_plugin)
57699 + },
57700 + [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
57701 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
57702 + .label = "compression_mode",
57703 + .desc = "Defines compression mode",
57704 + .builtin_num = sizeof_array(compression_mode_plugins),
57705 + .builtin = compression_mode_plugins,
57706 + .plugins_list = {NULL, NULL},
57707 + .size = sizeof(compression_mode_plugin)
57708 + },
57709 + [REISER4_CLUSTER_PLUGIN_TYPE] = {
57710 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
57711 + .label = "cluster",
57712 + .desc = "Defines cluster size",
57713 + .builtin_num = sizeof_array(cluster_plugins),
57714 + .builtin = cluster_plugins,
57715 + .plugins_list = {NULL, NULL},
57716 + .size = sizeof(cluster_plugin)
57717 + }
57718 +};
57719 +
57720 +/*
57721 + * Local variables:
57722 + * c-indentation-style: "K&R"
57723 + * mode-name: "LC"
57724 + * c-basic-offset: 8
57725 + * tab-width: 8
57726 + * fill-column: 120
57727 + * End:
57728 + */
57729 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/plugin.h linux-2.6.33/fs/reiser4/plugin/plugin.h
57730 --- linux-2.6.33.orig/fs/reiser4/plugin/plugin.h 1970-01-01 01:00:00.000000000 +0100
57731 +++ linux-2.6.33/fs/reiser4/plugin/plugin.h 2010-03-04 19:33:22.000000000 +0100
57732 @@ -0,0 +1,942 @@
57733 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
57734 + * reiser4/README */
57735 +
57736 +/* Basic plugin data-types.
57737 + see fs/reiser4/plugin/plugin.c for details */
57738 +
57739 +#if !defined(__FS_REISER4_PLUGIN_TYPES_H__)
57740 +#define __FS_REISER4_PLUGIN_TYPES_H__
57741 +
57742 +#include "../forward.h"
57743 +#include "../debug.h"
57744 +#include "../dformat.h"
57745 +#include "../key.h"
57746 +#include "compress/compress.h"
57747 +#include "crypto/cipher.h"
57748 +#include "plugin_header.h"
57749 +#include "item/static_stat.h"
57750 +#include "item/internal.h"
57751 +#include "item/sde.h"
57752 +#include "item/cde.h"
57753 +#include "item/item.h"
57754 +#include "node/node.h"
57755 +#include "node/node40.h"
57756 +#include "security/perm.h"
57757 +#include "fibration.h"
57758 +
57759 +#include "space/bitmap.h"
57760 +#include "space/space_allocator.h"
57761 +
57762 +#include "disk_format/disk_format40.h"
57763 +#include "disk_format/disk_format.h"
57764 +
57765 +#include <linux/fs.h> /* for struct super_block, address_space */
57766 +#include <linux/mm.h> /* for struct page */
57767 +#include <linux/buffer_head.h> /* for struct buffer_head */
57768 +#include <linux/dcache.h> /* for struct dentry */
57769 +#include <linux/types.h>
57770 +#include <linux/crypto.h>
57771 +
57772 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;
57773 +
57774 +/*
57775 + * File plugin. Defines the set of methods that file plugins implement, some
57776 + * of which are optional.
57777 + *
57778 + * A file plugin offers to the caller an interface for IO ( writing to and/or
57779 + * reading from) to what the caller sees as one sequence of bytes. An IO to it
57780 + * may affect more than one physical sequence of bytes, or no physical sequence
57781 + * of bytes, it may affect sequences of bytes offered by other file plugins to
57782 + * the semantic layer, and the file plugin may invoke other plugins and
57783 + * delegate work to them, but its interface is structured for offering the
57784 + * caller the ability to read and/or write what the caller sees as being a
57785 + * single sequence of bytes.
57786 + *
57787 + * The file plugin must present a sequence of bytes to the caller, but it
57788 + * does not necessarily have to store a sequence of bytes, nor does it
57789 + * necessarily have to support efficient tree traversal to any offset in the
57790 + * sequence of bytes (tail and extent items, whose keys contain offsets, do
57791 + * however provide efficient non-sequential lookup of any offset).
57792 + *
57793 + * Directory plugins provide methods for selecting file plugins by resolving a
57794 + * name for them.
57795 + *
57796 + * The functionality other filesystems call an attribute, and rigidly tie
57797 + * together, we decompose into orthogonal selectable features of files. Using
57798 + * the terminology we will define next, an attribute is a perhaps constrained,
57799 + * perhaps static length, file whose parent has a uni-count-intra-link to it,
57800 + * which might be grandparent-major-packed, and whose parent has a deletion
57801 + * method that deletes it.
57802 + *
57803 + * File plugins can implement constraints.
57804 + *
57805 + * Files can be of variable length (e.g. regular unix files), or of static
57806 + * length (e.g. static sized attributes).
57807 + *
57808 + * An object may have many sequences of bytes, and many file plugins, but, it
57809 + * has exactly one objectid. It is usually desirable that an object has a
57810 + * deletion method which deletes every item with that objectid. Items cannot
57811 + * in general be found by just their objectids. This means that an object must
57812 + * have either a method built into its deletion plugin method for knowing what
57813 + * items need to be deleted, or links stored with the object that provide the
57814 + * plugin with a method for finding those items. Deleting a file within an
57815 + * object may or may not have the effect of deleting the entire object,
57816 + * depending on the file plugin's deletion method.
57817 + *
57818 + * LINK TAXONOMY:
57819 + *
57820 + * Many objects have a reference count, and when the reference count reaches 0
57821 + * the object's deletion method is invoked. Some links embody a reference
57822 + * count increase ("countlinks"), and others do not ("nocountlinks").
57823 + *
57824 + * Some links are bi-directional links ("bilinks"), and some are
57825 + * uni-directional("unilinks").
57826 + *
57827 + * Some links are between parts of the same object ("intralinks"), and some are
57828 + * between different objects ("interlinks").
57829 + *
57830 + * PACKING TAXONOMY:
57831 + *
57832 + * Some items of an object are stored with a major packing locality based on
57833 + * their object's objectid (e.g. unix directory items in plan A), and these are
57834 + * called "self-major-packed".
57835 + *
57836 + * Some items of an object are stored with a major packing locality based on
57837 + * their semantic parent object's objectid (e.g. unix file bodies in plan A),
57838 + * and these are called "parent-major-packed".
57839 + *
57840 + * Some items of an object are stored with a major packing locality based on
57841 + * their semantic grandparent, and these are called "grandparent-major-packed".
57842 + * Now carefully notice that we run into trouble with key length if we have to
57843 + * store a 8 byte major+minor grandparent based packing locality, an 8 byte
57844 + * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
57845 + * a 24 byte key. One of these fields must be sacrificed if an item is to be
57846 + * grandparent-major-packed, and which to sacrifice is left to the item author
57847 + * choosing to make the item grandparent-major-packed. You cannot make tail
57848 + * items and extent items grandparent-major-packed, though you could make them
57849 + * self-major-packed (usually they are parent-major-packed).
57850 + *
57851 + * In the case of ACLs (which are composed of fixed length ACEs which consist
57852 + * of {subject-type, subject, and permission bitmask} triples), it makes sense
57853 + * to not have an offset field in the ACE item key, and to allow duplicate keys
57854 + * for ACEs. Thus, the set of ACES for a given file is found by looking for a
57855 + * key consisting of the objectid of the grandparent (thus grouping all ACLs in
57856 + * a directory together), the minor packing locality of ACE, the objectid of
57857 + * the file, and 0.
57858 + *
57859 + * IO involves moving data from one location to another, which means that two
57860 + * locations must be specified, source and destination.
57861 + *
57862 + * This source and destination can be in the filesystem, or they can be a
57863 + * pointer in the user process address space plus a byte count.
57864 + *
57865 + * If both source and destination are in the filesystem, then at least one of
57866 + * them must be representable as a pure stream of bytes (which we call a flow,
57867 + * and define as a struct containing a key, a data pointer, and a length).
57868 + * This may mean converting one of them into a flow. We provide a generic
57869 + * cast_into_flow() method, which will work for any plugin supporting
57870 + * read_flow(), though it is inefficiently implemented in that it temporarily
57871 + * stores the flow in a buffer (Question: what to do with huge flows that
57872 + * cannot fit into memory? Answer: we must not convert them all at once.)
57873 + *
57874 + * Performing a write requires resolving the write request into a flow defining
57875 + * the source, and a method that performs the write, and a key that defines
57876 + * where in the tree the write is to go.
57877 + *
57878 + * Performing a read requires resolving the read request into a flow defining
57879 + * the target, and a method that performs the read, and a key that defines
57880 + * where in the tree the read is to come from.
57881 + *
57882 + * There will exist file plugins which have no pluginid stored on the disk for
57883 + * them, and which are only invoked by other plugins.
57884 + */
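+
+/* Illustration: a flow, as used in the IO discussion above, is just a
+ * (key, data pointer, length) triple. A sketch of the idea (the real flow_t
+ * is defined elsewhere in reiser4 and may differ in detail):
+ *
+ *	struct flow_sketch {
+ *		reiser4_key key;
+ *		char *data;
+ *		loff_t length;
+ *	};
+ *
+ * The key says where in the tree the bytes live, data points at the source
+ * or target buffer, and length is the byte count. Reads and writes are
+ * resolved into such a flow plus a method that moves the bytes.
+ */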
57885 +
57886 +/* This should be incremented with each new contributed
57887 + pair (plugin type, plugin id).
57888 + NOTE: Make sure there is a release of reiser4progs
57889 + with the corresponding version number */
57890 +#define PLUGIN_LIBRARY_VERSION 0
57891 +
57892 + /* enumeration of fields within plugin_set */
57893 +typedef enum {
57894 + PSET_FILE,
57895 + PSET_DIR, /* PSET_FILE and PSET_DIR should be first
57896 + * elements: inode.c:read_inode() depends on
57897 + * this. */
57898 + PSET_PERM,
57899 + PSET_FORMATTING,
57900 + PSET_HASH,
57901 + PSET_FIBRATION,
57902 + PSET_SD,
57903 + PSET_DIR_ITEM,
57904 + PSET_CIPHER,
57905 + PSET_DIGEST,
57906 + PSET_COMPRESSION,
57907 + PSET_COMPRESSION_MODE,
57908 + PSET_CLUSTER,
57909 + PSET_CREATE,
57910 + PSET_LAST
57911 +} pset_member;
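+
+/* Illustration: pset members double as bit positions in the per-inode
+ * plugin_mask (see update_pset_mask() in plugin.c), so testing whether a
+ * member was set explicitly rather than inherited looks like:
+ *
+ *	if (reiser4_inode_data(inode)->plugin_mask & (1 << PSET_HASH))
+ *		... the hash plugin was set explicitly on this inode ...
+ */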
57912 +
57913 +/* builtin file-plugins */
57914 +typedef enum {
57915 + /* regular file */
57916 + UNIX_FILE_PLUGIN_ID,
57917 + /* directory */
57918 + DIRECTORY_FILE_PLUGIN_ID,
57919 + /* symlink */
57920 + SYMLINK_FILE_PLUGIN_ID,
57921 + /* for objects completely handled by the VFS: fifos, devices,
57922 + sockets */
57923 + SPECIAL_FILE_PLUGIN_ID,
57924 + /* regular cryptcompress file */
57925 + CRYPTCOMPRESS_FILE_PLUGIN_ID,
57926 + /* number of file plugins. Used as size of arrays to hold
57927 + file plugins. */
57928 + LAST_FILE_PLUGIN_ID
57929 +} reiser4_file_id;
57930 +
57931 +typedef struct file_plugin {
57932 +
57933 + /* generic fields */
57934 + plugin_header h;
57935 +
57936 + /* VFS methods.
57937 + * Must be invariant with respect to plugin conversion.
57938 + * It can be achieved by using "common" methods, which
57939 + * are the same for all plugins that participate in
57940 + * conversion, or by using "generic" or "careful" methods,
57941 + * which provide automatic redirection to proper private
57942 + * plugin methods ("careful" are the same as "generic",
57943 + * but with protection of pset and other disk structures
57944 + * from being rebuilt during conversion).
57945 + */
57946 + struct inode_operations * inode_ops;
57947 + struct file_operations * file_ops;
57948 + struct address_space_operations * as_ops;
57949 + /**
57950 + * Private methods. These are optional. If used they will allow you
57951 + * to minimize the amount of code needed to implement a deviation
57952 + * from some other method that also uses them.
57953 + */
57954 + /*
57955 + * private inode_ops
57956 + */
57957 + int (*setattr)(struct dentry *, struct iattr *);
57958 + /*
57959 + * private file_ops
57960 + */
57961 + /* do whatever is necessary when the object is opened */
57962 + int (*open) (struct inode *inode, struct file *file);
57963 + ssize_t (*read) (struct file *, char __user *buf, size_t read_amount,
57964 + loff_t *off);
57965 + /* write as many bytes as possible of the nominated @write_amount
57966 + * before plugin scheduling occurs. Save the scheduling state
57967 + * in @cont */
57968 + ssize_t (*write) (struct file *, const char __user *buf,
57969 + size_t write_amount, loff_t * off,
57970 + struct dispatch_context * cont);
57971 + int (*ioctl) (struct inode *inode, struct file *filp,
57972 + unsigned int cmd, unsigned long arg);
57973 + int (*mmap) (struct file *, struct vm_area_struct *);
57974 + int (*release) (struct inode *, struct file *);
57975 + /*
57976 + * private a_ops
57977 + */
57978 + int (*readpage) (struct file *file, struct page *page);
57979 + int (*readpages)(struct file *file, struct address_space *mapping,
57980 + struct list_head *pages, unsigned nr_pages);
57981 + int (*writepages)(struct address_space *mapping,
57982 + struct writeback_control *wbc);
57983 + int (*write_begin)(struct file *file, struct page *page,
57984 + unsigned from, unsigned to);
57985 + int (*write_end)(struct file *file, struct page *page,
57986 + unsigned from, unsigned to);
57987 + sector_t (*bmap) (struct address_space * mapping, sector_t lblock);
57988 + /* other private methods */
57989 + /* save inode cached stat-data onto disk. It was called
57990 + reiserfs_update_sd() in 3.x */
57991 + int (*write_sd_by_inode) (struct inode *);
57992 + /*
57993 + * Construct flow into @flow according to user-supplied data.
57994 + *
57995 + * This is used by read/write methods to construct a flow to
57996 + * write/read. ->flow_by_inode() is plugin method, rather than single
57997 + * global implementation, because key in a flow used by plugin may
57998 + * depend on data in a @buf.
57999 + *
58000 + * NIKITA-FIXME-HANS: please create statistics on what functions are
58001 + * dereferenced how often for the mongo benchmark. You can supervise
58002 + * Elena doing this for you if that helps. Email me the list of the
58003 + * top 10, with their counts, and an estimate of the total number of
58004 + * CPU cycles spent dereferencing as a percentage of CPU cycles spent
58005 + * processing (non-idle processing). If the total percent is, say,
58006 + * less than 1%, it will make our coding discussions much easier, and
58007 + * keep me from questioning whether functions like the below are too
58008 + * frequently called to be dereferenced. If the total percent is more
58009 + * than 1%, perhaps private methods should be listed in a "required"
58010 + * comment at the top of each plugin (with stern language about how if
58011 + * the comment is missing it will not be accepted by the maintainer),
58012 + * and implemented using macros not dereferenced functions. How about
58013 + * replacing this whole private methods part of the struct with a
58014 + * thorough documentation of what the standard helper functions are for
58015 + * use in constructing plugins? I think users have been asking for
58016 + * that, though not in so many words.
58017 + */
58018 + int (*flow_by_inode) (struct inode *, const char __user *buf,
58019 + int user, loff_t size,
58020 + loff_t off, rw_op op, flow_t *);
58021 + /*
58022 + * Return the key used to retrieve an offset of a file. It is used by
58023 + * default implementation of ->flow_by_inode() method
58024 + * (common_build_flow()) and, among other things, to get to the extent
58025 + * from jnode of unformatted node.
58026 + */
58027 + int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
58028 +
58029 + /* NIKITA-FIXME-HANS: this comment is not as clear to others as you
58030 + * think.... */
58031 + /*
58032 + * set the plugin for a file. Called during file creation in creat()
58033 + * but not reiser4() unless an inode already exists for the file.
58034 + */
58035 + int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
58036 + reiser4_object_create_data *);
58037 +
58038 + /* NIKITA-FIXME-HANS: comment and name seem to say different things,
58039 + * are you setting up the object itself also or just adjusting the
58040 + * parent?.... */
58041 + /* set up plugins for new @object created in @parent. @root is root
58042 + directory. */
58043 + int (*adjust_to_parent) (struct inode *object, struct inode *parent,
58044 + struct inode *root);
58045 + /*
58046 + * this does whatever is necessary when the object is created. For
58047 + * instance, for unix files the stat data is inserted. It is supposed to
58048 + * be called by the create method of struct inode_operations.
58049 + */
58050 + int (*create_object) (struct inode *object, struct inode *parent,
58051 + reiser4_object_create_data *);
58052 + /*
58053 + * this method should check REISER4_NO_SD and set REISER4_NO_SD on
58054 + * success. Deletion of an object usually includes removal of items
58055 + * building file body (for directories this is removal of "." and "..")
58056 + * and removal of stat-data item.
58057 + */
58058 + int (*delete_object) (struct inode *);
58059 +
58060 + /* add link from @parent to @object */
58061 + int (*add_link) (struct inode *object, struct inode *parent);
58062 +
58063 + /* remove link from @parent to @object */
58064 + int (*rem_link) (struct inode *object, struct inode *parent);
58065 +
58066 + /*
58067 + * return true if item addressed by @coord belongs to @inode. This is
58068 + * used by read/write to properly slice flow into items in presence of
58069 + * multiple key assignment policies, because items of a file are not
58070 + * necessarily contiguous in a key space, for example, in a plan-b.
58071 + */
58072 + int (*owns_item) (const struct inode *, const coord_t *);
58073 +
58074 + /* checks whether yet another hard link to this object can be
58075 + added */
58076 + int (*can_add_link) (const struct inode *);
58077 +
58078 + /* checks whether hard links to this object can be removed */
58079 + int (*can_rem_link) (const struct inode *);
58080 +
58081 + /* currently not empty only for DIRECTORY_FILE_PLUGIN_ID. It calls
58082 + the detach method of the directory plugin to remove ".." */
58083 + int (*detach) (struct inode *child, struct inode *parent);
58084 +
58085 + /* called when @child has just been looked up in @parent. It is not
58086 + empty only for DIRECTORY_FILE_PLUGIN_ID, where it calls the attach
58087 + method of the directory plugin */
58088 + int (*bind) (struct inode *child, struct inode *parent);
58089 +
58090 + /* process safe-link during mount */
58091 + int (*safelink) (struct inode *object, reiser4_safe_link_t link,
58092 + __u64 value);
58093 +
58094 + /* The couple of estimate methods for all file operations */
58095 + struct {
58096 + reiser4_block_nr(*create) (const struct inode *);
58097 + reiser4_block_nr(*update) (const struct inode *);
58098 + reiser4_block_nr(*unlink) (const struct inode *,
58099 + const struct inode *);
58100 + } estimate;
58101 +
58102 + /*
58103 + * reiser4 specific part of inode has a union of structures which are
58104 + * specific to a plugin. This method is called when inode is read
58105 + * (read_inode) and when a file is created (common_create_child) so that
58106 + * the file plugin can initialize its inode data
58107 + */
58108 + void (*init_inode_data) (struct inode *, reiser4_object_create_data * ,
58109 + int);
58110 +
58111 + /*
58112 + * This method performs progressive deletion of items and whole nodes
58113 + * from right to left.
58114 + *
58115 + * @tap: the point deletion process begins from,
58116 + * @from_key: the beginning of the deleted key range,
58117 + * @to_key: the end of the deleted key range,
58118 + * @smallest_removed: the smallest removed key,
58119 + *
58120 + * @return: 0 if success, error code otherwise, -E_REPEAT means that
58121 + * long cut_tree operation was interrupted for allowing atom commit .
58122 + */
58123 + int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
58124 + const reiser4_key * to_key,
58125 + reiser4_key * smallest_removed, struct inode *,
58126 + int, int *);
58127 +
58128 + /* called from ->destroy_inode() */
58129 + void (*destroy_inode) (struct inode *);
58130 +
58131 + /*
58132 + * methods to serialize object identity. This is used, for example, by
58133 + * reiser4_{en,de}code_fh().
58134 + */
58135 + struct {
58136 + /* store object's identity at @area */
58137 + char *(*write) (struct inode *inode, char *area);
58138 + /* parse object from wire to the @obj */
58139 + char *(*read) (char *area, reiser4_object_on_wire * obj);
58140 + /* given object identity in @obj, find or create its dentry */
58141 + struct dentry *(*get) (struct super_block *s,
58142 + reiser4_object_on_wire * obj);
58143 + /* how many bytes ->wire.write() consumes */
58144 + int (*size) (struct inode *inode);
58145 + /* finish with object identity */
58146 + void (*done) (reiser4_object_on_wire * obj);
58147 + } wire;
58148 +} file_plugin;
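+
+/* Illustration: the estimate methods above let callers reserve space before
+ * starting an operation; a hypothetical unlink path might do (sketch, with
+ * fplug standing in for the file plugin of @object):
+ *
+ *	reiser4_block_nr need;
+ *
+ *	need = fplug->estimate.unlink(object, parent);
+ *	... reserve "need" blocks in the current transaction, then proceed ...
+ */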
58149 +
58150 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
58151 +
58152 +struct reiser4_object_on_wire {
58153 + file_plugin *plugin;
58154 + union {
58155 + struct {
58156 + obj_key_id key_id;
58157 + } std;
58158 + void *generic;
58159 + } u;
58160 +};
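+
+/* Illustration: the wire methods of file_plugin together with
+ * reiser4_object_on_wire implement file-handle style export, as used by
+ * reiser4_{en,de}code_fh(). The life cycle is roughly (sketch, with fplug
+ * standing in for the exporting file's plugin):
+ *
+ *	int len = fplug->wire.size(inode);        (reserve len bytes)
+ *	area = fplug->wire.write(inode, area);    (serialize identity)
+ *	...
+ *	area = fplug->wire.read(area, &obj);      (parse identity)
+ *	dentry = fplug->wire.get(sb, &obj);       (find or create dentry)
+ *	fplug->wire.done(&obj);                   (release resources)
+ */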
58161 +
58162 +/* builtin dir-plugins */
58163 +typedef enum {
58164 + HASHED_DIR_PLUGIN_ID,
58165 + SEEKABLE_HASHED_DIR_PLUGIN_ID,
58166 + LAST_DIR_ID
58167 +} reiser4_dir_id;
58168 +
58169 +typedef struct dir_plugin {
58170 + /* generic fields */
58171 + plugin_header h;
58172 +
58173 + struct inode_operations * inode_ops;
58174 + struct file_operations * file_ops;
58175 + struct address_space_operations * as_ops;
58176 +
58177 + /*
58178 + * private methods: These are optional. If used they will allow you to
58179 + * minimize the amount of code needed to implement a deviation from
58180 + * some other method that uses them. You could logically argue that
58181 + * they should be a separate type of plugin.
58182 + */
58183 +
58184 + struct dentry *(*get_parent) (struct inode *childdir);
58185 +
58186 + /*
58187 + * check whether "name" is an acceptable name to be inserted into this
58188 + * object. Optionally implemented by directory-like objects. Can check
58189 + * for maximal length, reserved symbols, etc.
58190 + */
58191 + int (*is_name_acceptable) (const struct inode *inode, const char *name,
58192 + int len);
58193 +
58194 + void (*build_entry_key) (const struct inode *dir /* directory where
58195 + * entry is (or will
58196 + * be) in.*/ ,
58197 + const struct qstr *name /* name of file
58198 + * referenced by this
58199 + * entry */ ,
58200 + reiser4_key * result /* resulting key of
58201 + * directory entry */ );
58202 + int (*build_readdir_key) (struct file *dir, reiser4_key * result);
58203 + int (*add_entry) (struct inode *object, struct dentry *where,
58204 + reiser4_object_create_data * data,
58205 + reiser4_dir_entry_desc * entry);
58206 + int (*rem_entry) (struct inode *object, struct dentry *where,
58207 + reiser4_dir_entry_desc * entry);
58208 +
58209 + /*
58210 + * initialize directory structure for newly created object. For normal
58211 + * unix directories, insert dot and dotdot.
58212 + */
58213 + int (*init) (struct inode *object, struct inode *parent,
58214 + reiser4_object_create_data * data);
58215 +
58216 + /* destroy directory */
58217 + int (*done) (struct inode *child);
58218 +
58219 + /* called when @subdir was just looked up in the @dir */
58220 + int (*attach) (struct inode *subdir, struct inode *dir);
58221 + int (*detach) (struct inode *subdir, struct inode *dir);
58222 +
58223 + struct {
58224 + reiser4_block_nr(*add_entry) (const struct inode *);
58225 + reiser4_block_nr(*rem_entry) (const struct inode *);
58226 + reiser4_block_nr(*unlink) (const struct inode *,
58227 + const struct inode *);
58228 + } estimate;
58229 +} dir_plugin;
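+
+/* Illustration: a lookup path uses the directory plugin to turn a name into
+ * a directory-entry key before searching the tree (sketch, with dplug
+ * standing in for the directory's plugin):
+ *
+ *	reiser4_key entry_key;
+ *
+ *	dplug->build_entry_key(dir, &dentry->d_name, &entry_key);
+ *	... search the tree for entry_key, then load the child object ...
+ *
+ * build_entry_key() typically applies the directory's hash and fibration
+ * plugins to the name.
+ */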
58230 +
58231 +extern dir_plugin dir_plugins[LAST_DIR_ID];
58232 +
58233 +typedef struct formatting_plugin {
58234 + /* generic fields */
58235 + plugin_header h;
58236 + /* returns non-zero iff file's tail has to be stored
58237 + in a direct item. */
58238 + int (*have_tail) (const struct inode *inode, loff_t size);
58239 +} formatting_plugin;
58240 +
58241 +typedef struct hash_plugin {
58242 + /* generic fields */
58243 + plugin_header h;
58244 + /* computes hash of the given name */
58245 + __u64(*hash) (const unsigned char *name, int len);
58246 +} hash_plugin;
58247 +
58248 +typedef struct cipher_plugin {
58249 + /* generic fields */
58250 + plugin_header h;
58251 + struct crypto_blkcipher * (*alloc) (void);
58252 + void (*free) (struct crypto_blkcipher *tfm);
58253 + /* Offset translator. For each offset this returns (k * offset), where
58254 + k (k >= 1) is an expansion factor of the cipher algorithm.
58255 + For all symmetric algorithms k == 1. For asymmetric algorithms (which
58256 + inflate data) offset translation guarantees that all of a disk cluster's
58257 + units will have keys smaller than the next cluster's.
58258 + */
58259 + loff_t(*scale) (struct inode *inode, size_t blocksize, loff_t src);
58260 + /* Cipher algorithms can accept data only in chunks of cipher block
58261 + size. This method is to align any flow up to cipher block size when
58262 + we pass it to cipher algorithm. To align means to append padding of
58263 + special format specific to the cipher algorithm */
58264 + int (*align_stream) (__u8 *tail, int clust_size, int blocksize);
58265 + /* low-level key manager (check, install, etc..) */
58266 + int (*setkey) (struct crypto_tfm *tfm, const __u8 *key,
58267 + unsigned int keylen);
58268 + /* main text processing procedures */
58269 + void (*encrypt) (__u32 *expkey, __u8 *dst, const __u8 *src);
58270 + void (*decrypt) (__u32 *expkey, __u8 *dst, const __u8 *src);
58271 +} cipher_plugin;
58272 +
58273 +typedef struct digest_plugin {
58274 + /* generic fields */
58275 + plugin_header h;
58276 + /* fingerprint size in bytes */
58277 + int fipsize;
58278 + struct crypto_hash * (*alloc) (void);
58279 + void (*free) (struct crypto_hash *tfm);
58280 +} digest_plugin;
58281 +
58282 +typedef struct compression_plugin {
58283 + /* generic fields */
58284 + plugin_header h;
58285 + int (*init) (void);
58286 + /* the maximum number of bytes by which the size of the "compressed"
58287 + * data can exceed the size of the uncompressed data. */
58288 + int (*overrun) (unsigned src_len);
58289 + coa_t(*alloc) (tfm_action act);
58290 + void (*free) (coa_t coa, tfm_action act);
58291 + /* minimal size of the flow we still try to compress */
58292 + int (*min_size_deflate) (void);
58293 + __u32(*checksum) (char *data, __u32 length);
58294 + /* main transform procedures */
58295 + void (*compress) (coa_t coa, __u8 *src_first, size_t src_len,
58296 + __u8 *dst_first, size_t *dst_len);
58297 + void (*decompress) (coa_t coa, __u8 *src_first, size_t src_len,
58298 + __u8 *dst_first, size_t *dst_len);
58299 +} compression_plugin;
58300 +
58301 +typedef struct compression_mode_plugin {
58302 + /* generic fields */
58303 + plugin_header h;
58304 + /* this is called when estimating compressibility
58305 + of a logical cluster by its content */
58306 + int (*should_deflate) (struct inode *inode, cloff_t index);
58307 + /* this is called when results of compression should be saved */
58308 + int (*accept_hook) (struct inode *inode, cloff_t index);
58309 + /* this is called when results of compression should be discarded */
58310 + int (*discard_hook) (struct inode *inode, cloff_t index);
58311 +} compression_mode_plugin;
58312 +
58313 +typedef struct cluster_plugin {
58314 + /* generic fields */
58315 + plugin_header h;
58316 + int shift;
58317 +} cluster_plugin;
58318 +
58319 +typedef struct sd_ext_plugin {
58320 + /* generic fields */
58321 + plugin_header h;
58322 + int (*present) (struct inode *inode, char **area, int *len);
58323 + int (*absent) (struct inode *inode);
58324 + int (*save_len) (struct inode *inode);
58325 + int (*save) (struct inode *inode, char **area);
58326 + /* alignment requirement for this stat-data part */
58327 + int alignment;
58328 +} sd_ext_plugin;
58329 +
58330 +/* this plugin contains methods to allocate objectid for newly created files,
58331 + to deallocate objectid when file gets removed, to report number of used and
58332 + free objectids */
58333 +typedef struct oid_allocator_plugin {
58334 + /* generic fields */
58335 + plugin_header h;
58336 + int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
58337 + __u64 oids);
58338 + /* used to report statfs->f_files */
58339 + __u64(*oids_used) (reiser4_oid_allocator * map);
58340 + /* get next oid to use */
58341 + __u64(*next_oid) (reiser4_oid_allocator * map);
58342 + /* used to report statfs->f_ffree */
58343 + __u64(*oids_free) (reiser4_oid_allocator * map);
58344 + /* allocate new objectid */
58345 + int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
58346 + /* release objectid */
58347 + int (*release_oid) (reiser4_oid_allocator * map, oid_t);
58348 + /* how many pages to reserve in transaction for allocation of new
58349 + objectid */
58350 + int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
58351 + /* how many pages to reserve in transaction for freeing of an
58352 + objectid */
58353 + int (*oid_reserve_release) (reiser4_oid_allocator * map);
58354 + void (*print_info) (const char *, reiser4_oid_allocator *);
58355 +} oid_allocator_plugin;
58356 +
58357 +/* disk layout plugin: this specifies super block, journal, bitmap (if there
58358 + are any) locations, etc */
58359 +typedef struct disk_format_plugin {
58360 + /* generic fields */
58361 + plugin_header h;
58362 + /* replay journal, initialize super_info_data, etc */
58363 + int (*init_format) (struct super_block *, void *data);
58364 +
58365 + /* key of root directory stat data */
58366 + const reiser4_key * (*root_dir_key) (const struct super_block *);
58367 +
58368 + int (*release) (struct super_block *);
58369 + jnode * (*log_super) (struct super_block *);
58370 + int (*check_open) (const struct inode *object);
58371 + int (*version_update) (struct super_block *);
58372 +} disk_format_plugin;
58373 +
58374 +struct jnode_plugin {
58375 + /* generic fields */
58376 + plugin_header h;
58377 + int (*init) (jnode * node);
58378 + int (*parse) (jnode * node);
58379 + struct address_space *(*mapping) (const jnode * node);
58380 + unsigned long (*index) (const jnode * node);
58381 + jnode * (*clone) (jnode * node);
58382 +};
58383 +
58384 +/* plugin instance. */
58385 +/* */
58386 +/* This is "wrapper" union for all types of plugins. Most of the code uses */
58387 +/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */
58388 +/* operates with pointers to reiser4_plugin. This union is only used in */
58389 +/* some generic code in plugin/plugin.c that operates on all */
58390 +/* plugins. Technically speaking purpose of this union is to add type */
58391 +/* safety to said generic code: each plugin type (file_plugin, for */
58392 +/* example), contains plugin_header as its first memeber. This first member */
58393 +/* is located at the same place in memory as .h member of */
58394 +/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and */
58395 +/* looks in the .h which is header of plugin type located in union. This */
58396 +/* allows to avoid type-casts. */
58397 +union reiser4_plugin {
58398 + /* generic fields */
58399 + plugin_header h;
58400 + /* file plugin */
58401 + file_plugin file;
58402 + /* directory plugin */
58403 + dir_plugin dir;
58404 + /* hash plugin, used by directory plugin */
58405 + hash_plugin hash;
58406 + /* fibration plugin used by directory plugin */
58407 + fibration_plugin fibration;
58408 + /* cipher transform plugin, used by file plugin */
58409 + cipher_plugin cipher;
58410 + /* digest transform plugin, used by file plugin */
58411 + digest_plugin digest;
58412 + /* compression transform plugin, used by file plugin */
58413 + compression_plugin compression;
58414 + /* tail plugin, used by file plugin */
58415 + formatting_plugin formatting;
58416 + /* permission plugin */
58417 + perm_plugin perm;
58418 + /* node plugin */
58419 + node_plugin node;
58420 + /* item plugin */
58421 + item_plugin item;
58422 + /* stat-data extension plugin */
58423 + sd_ext_plugin sd_ext;
58424 + /* disk layout plugin */
58425 + disk_format_plugin format;
58426 + /* object id allocator plugin */
58427 + oid_allocator_plugin oid_allocator;
58428 + /* plugin for different jnode types */
58429 + jnode_plugin jnode;
58430 + /* compression mode plugin, used by object plugin */
58431 + compression_mode_plugin compression_mode;
58432 + /* cluster plugin, used by object plugin */
58433 + cluster_plugin clust;
58434 + /* place-holder for new plugin types that can be registered
58435 + dynamically, and used by other dynamically loaded plugins. */
58436 + void *generic;
58437 +};
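+
+/* Illustration: generic code inspects the common .h header, and the union
+ * makes the subsequent per-type access explicit without casts (sketch):
+ *
+ *	reiser4_plugin *plug = plugin_by_unsafe_id(type, id);
+ *
+ *	if (plug != NULL && plug->h.type_id == REISER4_HASH_PLUGIN_TYPE) {
+ *		__u64 hash = plug->hash.hash(name, len);
+ *		...
+ *	}
+ */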
58438 +
58439 +struct reiser4_plugin_ops {
58440 + /* called when plugin is initialized */
58441 + int (*init) (reiser4_plugin * plugin);
58442 + /* called when plugin is unloaded */
58443 + int (*done) (reiser4_plugin * plugin);
58444 + /* load given plugin from disk */
58445 + int (*load) (struct inode *inode,
58446 + reiser4_plugin * plugin, char **area, int *len);
58447 + /* how much space is required to store this plugin's state
58448 + in stat-data */
58449 + int (*save_len) (struct inode *inode, reiser4_plugin * plugin);
58450 + /* save persistent plugin-data to disk */
58451 + int (*save) (struct inode *inode, reiser4_plugin * plugin,
58452 + char **area);
58453 + /* alignment requirement for on-disk state of this plugin
58454 + in number of bytes */
58455 + int alignment;
58456 + /* install itself into the given inode. This can return an error
58457 + (e.g., you cannot change the hash of a non-empty directory). */
58458 + int (*change) (struct inode *inode, reiser4_plugin * plugin,
58459 + pset_member memb);
58460 + /* inherit plugin settings from @parent when initializing @inode.
58461 + This can return an error. */
58462 + int (*inherit) (struct inode *inode, struct inode *parent,
58463 + reiser4_plugin * plugin);
58464 +};
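+/* Editor's sketch (hypothetical names, not from the patch): a plugin that
+ * keeps persistent per-inode state would supply load/save_len/save hooks
+ * through this table; plugins without such state leave .pops NULL, as the
+ * builtin tables later in this patch do: */
+#if 0
+static int example_init(reiser4_plugin *plugin)
+{
+	return 0;	/* nothing to set up */
+}
+static reiser4_plugin_ops example_pops = {
+	.init = example_init,
+	.done = NULL,
+	.load = NULL,
+	.save_len = NULL,
+	.save = NULL,
+	.alignment = 8
+};
+#endif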
58465 +
58466 +/* functions implemented in fs/reiser4/plugin/plugin.c */
58467 +
58468 +/* stores plugin reference in reiser4-specific part of inode */
58469 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
58470 +extern int init_plugins(void);
58471 +
58472 +/* builtin plugins */
58473 +
58474 +/* builtin hash-plugins */
58475 +
58476 +typedef enum {
58477 + RUPASOV_HASH_ID,
58478 + R5_HASH_ID,
58479 + TEA_HASH_ID,
58480 + FNV1_HASH_ID,
58481 + DEGENERATE_HASH_ID,
58482 + LAST_HASH_ID
58483 +} reiser4_hash_id;
58484 +
58485 +/* builtin cipher plugins */
58486 +
58487 +typedef enum {
58488 + NONE_CIPHER_ID,
58489 + LAST_CIPHER_ID
58490 +} reiser4_cipher_id;
58491 +
58492 +/* builtin digest plugins */
58493 +
58494 +typedef enum {
58495 + SHA256_32_DIGEST_ID,
58496 + LAST_DIGEST_ID
58497 +} reiser4_digest_id;
58498 +
58499 +/* builtin compression mode plugins */
58500 +typedef enum {
58501 + NONE_COMPRESSION_MODE_ID,
58502 + LATTD_COMPRESSION_MODE_ID,
58503 + ULTIM_COMPRESSION_MODE_ID,
58504 + FORCE_COMPRESSION_MODE_ID,
58505 + CONVX_COMPRESSION_MODE_ID,
58506 + LAST_COMPRESSION_MODE_ID
58507 +} reiser4_compression_mode_id;
58508 +
58509 +/* builtin cluster plugins */
58510 +typedef enum {
58511 + CLUSTER_64K_ID,
58512 + CLUSTER_32K_ID,
58513 + CLUSTER_16K_ID,
58514 + CLUSTER_8K_ID,
58515 + CLUSTER_4K_ID,
58516 + LAST_CLUSTER_ID
58517 +} reiser4_cluster_id;
58518 +
58519 +/* builtin tail-plugins */
58520 +
58521 +typedef enum {
58522 + NEVER_TAILS_FORMATTING_ID,
58523 + ALWAYS_TAILS_FORMATTING_ID,
58524 + SMALL_FILE_FORMATTING_ID,
58525 + LAST_TAIL_FORMATTING_ID
58526 +} reiser4_formatting_id;
58527 +
58528 +/* data type used to pack parameters that we pass to vfs object creation
58529 + function create_object() */
58530 +struct reiser4_object_create_data {
58531 + /* plugin to control created object */
58532 + reiser4_file_id id;
58533 + /* mode of regular file, directory or special file */
58534 +/* what happens if some other sort of perm plugin is in use? */
58535 + int mode;
58536 + /* rdev of special file */
58537 + dev_t rdev;
58538 + /* symlink target */
58539 + const char *name;
58540 + /* add here something for non-standard objects you invent, like
58541 + query for interpolation file etc. */
58542 +
58543 + struct reiser4_crypto_info *crypto;
58544 +
58545 + struct inode *parent;
58546 + struct dentry *dentry;
58547 +};
58548 +
58549 +/* description of directory entry being created/destroyed/sought for
58550 +
58551 +   It is passed down to the directory plugin and further to the
58552 +   directory item plugin methods. Creation of a new directory entry is
58553 +   done in several stages: first we search for an entry with the same
58554 +   name, then create a new one. reiser4_dir_entry_desc is used to store
58555 +   information collected at one stage of this process and required
58556 +   later: the key of the item that we want to insert/delete and a
58557 +   pointer to the object that will be bound by the new directory
58558 +   entry. Probably some more fields will be added here.
58559 +
58560 +*/
58561 +struct reiser4_dir_entry_desc {
58562 + /* key of directory entry */
58563 + reiser4_key key;
58564 + /* object bound by this entry. */
58565 + struct inode *obj;
58566 +};
58567 +
58568 +#define MAX_PLUGIN_TYPE_LABEL_LEN 32
58569 +#define MAX_PLUGIN_PLUG_LABEL_LEN 32
58570 +
58571 +#define PLUGIN_BY_ID(TYPE, ID, FIELD) \
58572 +static inline TYPE *TYPE ## _by_id(reiser4_plugin_id id) \
58573 +{ \
58574 + reiser4_plugin *plugin = plugin_by_id(ID, id); \
58575 + return plugin ? &plugin->FIELD : NULL; \
58576 +} \
58577 +static inline TYPE *TYPE ## _by_disk_id(reiser4_tree * tree, d16 *id) \
58578 +{ \
58579 + reiser4_plugin *plugin = plugin_by_disk_id(tree, ID, id); \
58580 + return plugin ? &plugin->FIELD : NULL; \
58581 +} \
58582 +static inline TYPE *TYPE ## _by_unsafe_id(reiser4_plugin_id id) \
58583 +{ \
58584 + reiser4_plugin *plugin = plugin_by_unsafe_id(ID, id); \
58585 + return plugin ? &plugin->FIELD : NULL; \
58586 +} \
58587 +static inline reiser4_plugin* TYPE ## _to_plugin(TYPE* plugin) \
58588 +{ \
58589 + return (reiser4_plugin *) plugin; \
58590 +} \
58591 +static inline reiser4_plugin_id TYPE ## _id(TYPE* plugin) \
58592 +{ \
58593 + return TYPE ## _to_plugin(plugin)->h.id; \
58594 +} \
58595 +typedef struct { int foo; } TYPE ## _plugin_dummy
58596 +
58597 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
58598 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
58599 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
58600 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
58601 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
58602 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
58603 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
58604 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
58605 +PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
58606 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
58607 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
58608 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
58609 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
58610 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
58611 +PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58612 + compression_mode);
58613 +PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
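+/* Editor's note: each PLUGIN_BY_ID() instantiation above generates typed
+ * lookup helpers; e.g. the hash_plugin line yields hash_plugin_by_id(),
+ * hash_plugin_by_disk_id(), hash_plugin_by_unsafe_id(),
+ * hash_plugin_to_plugin() and hash_plugin_id(), so callers can write: */
+#if 0
+	hash_plugin *hplug = hash_plugin_by_id(R5_HASH_ID);
+#endif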
58614 +
58615 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
58616 +
58617 +extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
58618 +
58619 +#define for_all_plugins(ptype, plugin) \
58620 +for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
58621 + get_plugin_list(ptype) != &plugin->h.linkage; \
58622 + plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
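+/* Editor's illustration (sketch only): for_all_plugins() walks the
+ * registration list of a given plugin type through the h.linkage member,
+ * e.g. to enumerate every registered hash plugin: */
+#if 0
+	reiser4_plugin *plugin;
+
+	for_all_plugins(REISER4_HASH_PLUGIN_TYPE, plugin)
+		printk(KERN_DEBUG "hash plugin: %s\n", plugin->h.label);
+#endif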
58623 +
58624 +
58625 +extern int grab_plugin_pset(struct inode *self, struct inode *ancestor,
58626 + pset_member memb);
58627 +extern int force_plugin_pset(struct inode *self, pset_member memb,
58628 + reiser4_plugin *plug);
58629 +extern int finish_pset(struct inode *inode);
58630 +
58631 +/* defined in fs/reiser4/plugin/object.c */
58632 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
58633 +/* defined in fs/reiser4/plugin/object.c */
58634 +extern dir_plugin dir_plugins[LAST_DIR_ID];
58635 +/* defined in fs/reiser4/plugin/item/static_stat.c */
58636 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
58637 +/* defined in fs/reiser4/plugin/hash.c */
58638 +extern hash_plugin hash_plugins[LAST_HASH_ID];
58639 +/* defined in fs/reiser4/plugin/fibration.c */
58640 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
58641 +/* defined in fs/reiser4/plugin/crypt.c */
58642 +extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
58643 +/* defined in fs/reiser4/plugin/digest.c */
58644 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
58645 +/* defined in fs/reiser4/plugin/compress/compress.c */
58646 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
58647 +/* defined in fs/reiser4/plugin/compress/compression_mode.c */
58648 +extern compression_mode_plugin
58649 +compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
58650 +/* defined in fs/reiser4/plugin/cluster.c */
58651 +extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
58652 +/* defined in fs/reiser4/plugin/tail.c */
58653 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
58654 +/* defined in fs/reiser4/plugin/security/security.c */
58655 +extern perm_plugin perm_plugins[LAST_PERM_ID];
58656 +/* defined in fs/reiser4/plugin/item/item.c */
58657 +extern item_plugin item_plugins[LAST_ITEM_ID];
58658 +/* defined in fs/reiser4/plugin/node/node.c */
58659 +extern node_plugin node_plugins[LAST_NODE_ID];
58660 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
58661 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
58662 +
58663 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
58664 +#endif
58665 +
58666 +/* Make Linus happy.
58667 + Local variables:
58668 + c-indentation-style: "K&R"
58669 + mode-name: "LC"
58670 + c-basic-offset: 8
58671 + tab-width: 8
58672 + fill-column: 120
58673 + End:
58674 +*/
58675 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.33/fs/reiser4/plugin/plugin_header.h
58676 --- linux-2.6.33.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 01:00:00.000000000 +0100
58677 +++ linux-2.6.33/fs/reiser4/plugin/plugin_header.h 2010-03-04 19:33:22.000000000 +0100
58678 @@ -0,0 +1,149 @@
58679 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58680 +
58681 +/* plugin header. Data structures required by all plugin types. */
58682 +
58683 +#if !defined(__PLUGIN_HEADER_H__)
58684 +#define __PLUGIN_HEADER_H__
58685 +
58686 +/* plugin data-types and constants */
58687 +
58688 +#include "../debug.h"
58689 +#include "../dformat.h"
58690 +
58691 +/* The list of Reiser4 interfaces */
58692 +typedef enum {
58693 + REISER4_FILE_PLUGIN_TYPE, /* manage VFS objects */
58694 + REISER4_DIR_PLUGIN_TYPE, /* manage directories */
58695 + REISER4_ITEM_PLUGIN_TYPE, /* manage items */
58696 + REISER4_NODE_PLUGIN_TYPE, /* manage formatted nodes */
58697 + REISER4_HASH_PLUGIN_TYPE, /* hash methods */
58698 + REISER4_FIBRATION_PLUGIN_TYPE, /* directory fibrations */
58699 + REISER4_FORMATTING_PLUGIN_TYPE, /* dispatching policy */
58700 + REISER4_PERM_PLUGIN_TYPE, /* stub (vacancy) */
58701 + REISER4_SD_EXT_PLUGIN_TYPE, /* manage stat-data extensions */
58702 + REISER4_FORMAT_PLUGIN_TYPE, /* disk format specifications */
58703 + REISER4_JNODE_PLUGIN_TYPE, /* manage in-memory headers */
58704 + REISER4_CIPHER_PLUGIN_TYPE, /* cipher transform methods */
58705 + REISER4_DIGEST_PLUGIN_TYPE, /* digest transform methods */
58706 + REISER4_COMPRESSION_PLUGIN_TYPE, /* compression methods */
58707 + REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* dispatching policies */
58708 + REISER4_CLUSTER_PLUGIN_TYPE, /* manage logical clusters */
58709 + REISER4_PLUGIN_TYPES
58710 +} reiser4_plugin_type;
58711 +
58712 +/* Supported plugin groups */
58713 +typedef enum {
58714 + REISER4_DIRECTORY_FILE,
58715 + REISER4_REGULAR_FILE,
58716 + REISER4_SYMLINK_FILE,
58717 + REISER4_SPECIAL_FILE,
58718 +} file_plugin_group;
58719 +
58720 +struct reiser4_plugin_ops;
58721 +/* generic plugin operations, supported by each
58722 + plugin type. */
58723 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
58724 +
58725 +/* the common part of all plugin instances. */
58726 +typedef struct plugin_header {
58727 + /* plugin type */
58728 + reiser4_plugin_type type_id;
58729 + /* id of this plugin */
58730 + reiser4_plugin_id id;
58731 + /* bitmask of groups the plugin belongs to. */
58732 + reiser4_plugin_groups groups;
58733 + /* plugin operations */
58734 + reiser4_plugin_ops *pops;
58735 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and
58736 + * defined. */
58737 + /* short label of this plugin */
58738 + const char *label;
58739 +	/* descriptive string */
58740 + const char *desc;
58741 + /* list linkage */
58742 + struct list_head linkage;
58743 +} plugin_header;
58744 +
58745 +#define plugin_of_group(plug, group) (plug->h.groups & (1 << group))
58746 +
58747 +/* PRIVATE INTERFACES */
58748 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in
58749 + * plugin_header? */
58750 +/* plugin type representation. */
58751 +struct reiser4_plugin_type_data {
58752 + /* internal plugin type identifier. Should coincide with
58753 + index of this item in plugins[] array. */
58754 + reiser4_plugin_type type_id;
58755 + /* short symbolic label of this plugin type. Should be no longer
58756 + than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
58757 + const char *label;
58758 + /* plugin type description longer than .label */
58759 + const char *desc;
58760 +
58761 +/* NIKITA-FIXME-HANS: define built-in */
58762 + /* number of built-in plugin instances of this type */
58763 + int builtin_num;
58764 + /* array of built-in plugins */
58765 + void *builtin;
58766 + struct list_head plugins_list;
58767 + size_t size;
58768 +};
58769 +
58770 +extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
58771 +
58772 +int is_plugin_type_valid(reiser4_plugin_type type);
58773 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
58774 +
58775 +static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data *ptype,
58776 + int i)
58777 +{
58778 + char *builtin;
58779 +
58780 + builtin = ptype->builtin;
58781 + return (reiser4_plugin *) (builtin + i * ptype->size);
58782 +}
58783 +
58784 +/* return plugin by its @type_id and @id */
58785 +static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
58786 + reiser4_plugin_id id)
58787 +{
58788 + assert("nikita-1651", is_plugin_type_valid(type));
58789 + assert("nikita-1652", is_plugin_id_valid(type, id));
58790 + return plugin_at(&plugins[type], id);
58791 +}
58792 +
58793 +extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
58794 + reiser4_plugin_id id);
58795 +
58796 +/**
58797 + * plugin_by_disk_id - get reiser4_plugin
58798 + * @type_id: plugin type id
58799 + * @plugin_id: plugin id in disk (little-endian) format
58800 + *
58801 + * Returns reiser4_plugin by plugin type id and plugin id.
58802 + */
58803 +static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
58804 + reiser4_plugin_type type_id,
58805 + __le16 *plugin_id)
58806 +{
58807 + /*
58808 + * what we should do properly is to maintain within each file-system a
58809 + * dictionary that maps on-disk plugin ids to "universal" ids. This
58810 +	 * dictionary would be resolved at mount time, so that this function
58811 + * will perform just one additional array lookup.
58812 + */
58813 + return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
58814 +}
58815 +
58816 +/* __PLUGIN_HEADER_H__ */
58817 +#endif
58818 +
58819 +/*
58820 + * Local variables:
58821 + * c-indentation-style: "K&R"
58822 + * mode-name: "LC"
58823 + * c-basic-offset: 8
58824 + * tab-width: 8
58825 + * fill-column: 79
58826 + * End:
58827 + */
58828 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.33/fs/reiser4/plugin/plugin_set.c
58829 --- linux-2.6.33.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 01:00:00.000000000 +0100
58830 +++ linux-2.6.33/fs/reiser4/plugin/plugin_set.c 2010-03-04 19:33:22.000000000 +0100
58831 @@ -0,0 +1,380 @@
58832 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58833 + * reiser4/README */
58834 +/* This file contains Reiser4 plugin set operations */
58835 +
58836 +/* plugin sets
58837 + *
58838 + * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
58839 + * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
58840 + * assigned (inherited, deduced from mode bits, etc.) at creation time. This
58841 + * set of plugins (so called pset) is described by structure plugin_set (see
58842 + * plugin/plugin_set.h), which contains pointers to all required plugins.
58843 + *
58844 + * Children can inherit some pset members from their parent; however, it is
58845 + * sometimes useful to specify members different from the parent's. Since an
58846 + * object's pset cannot be easily changed without fatal consequences, we use
58847 + * for this purpose another special plugin table (the so-called hset, or heir
58848 + * set), described by the same structure.
58849 + *
58850 + * An inode only stores pointers to its pset and hset. Different inodes with
58851 + * the same set of pset (hset) members point to the same pset (hset). This is
58852 + * achieved by storing psets and hsets in a global hash table. Races are
58853 + * avoided by the simple (and so far efficient) solution of never recycling
58854 + * psets, even when the last inode pointing to one is destroyed.
58855 + */
58856 +
58857 +#include "../debug.h"
58858 +#include "../super.h"
58859 +#include "plugin_set.h"
58860 +
58861 +#include <linux/slab.h>
58862 +#include <linux/stddef.h>
58863 +
58864 +/* slab for plugin sets */
58865 +static struct kmem_cache *plugin_set_slab;
58866 +
58867 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
58868 + [0 ... 7] = SPIN_LOCK_UNLOCKED
58869 +};
58870 +
58871 +/* hash table support */
58872 +
58873 +#define PS_TABLE_SIZE (32)
58874 +
58875 +static inline plugin_set *cast_to(const unsigned long *a)
58876 +{
58877 + return container_of(a, plugin_set, hashval);
58878 +}
58879 +
58880 +static inline int pseq(const unsigned long *a1, const unsigned long *a2)
58881 +{
58882 + plugin_set *set1;
58883 + plugin_set *set2;
58884 +
58885 + /* make sure fields are not missed in the code below */
58886 + cassert(sizeof *set1 ==
58887 + sizeof set1->hashval +
58888 + sizeof set1->link +
58889 + sizeof set1->file +
58890 + sizeof set1->dir +
58891 + sizeof set1->perm +
58892 + sizeof set1->formatting +
58893 + sizeof set1->hash +
58894 + sizeof set1->fibration +
58895 + sizeof set1->sd +
58896 + sizeof set1->dir_item +
58897 + sizeof set1->cipher +
58898 + sizeof set1->digest +
58899 + sizeof set1->compression +
58900 + sizeof set1->compression_mode +
58901 + sizeof set1->cluster +
58902 + sizeof set1->create);
58903 +
58904 + set1 = cast_to(a1);
58905 + set2 = cast_to(a2);
58906 + return
58907 + set1->hashval == set2->hashval &&
58908 + set1->file == set2->file &&
58909 + set1->dir == set2->dir &&
58910 + set1->perm == set2->perm &&
58911 + set1->formatting == set2->formatting &&
58912 + set1->hash == set2->hash &&
58913 + set1->fibration == set2->fibration &&
58914 + set1->sd == set2->sd &&
58915 + set1->dir_item == set2->dir_item &&
58916 + set1->cipher == set2->cipher &&
58917 + set1->digest == set2->digest &&
58918 + set1->compression == set2->compression &&
58919 + set1->compression_mode == set2->compression_mode &&
58920 + set1->cluster == set2->cluster &&
58921 + set1->create == set2->create;
58922 +}
58923 +
58924 +#define HASH_FIELD(hash, set, field) \
58925 +({ \
58926 + (hash) += (unsigned long)(set)->field >> 2; \
58927 +})
58928 +
58929 +static inline unsigned long calculate_hash(const plugin_set * set)
58930 +{
58931 + unsigned long result;
58932 +
58933 + result = 0;
58934 + HASH_FIELD(result, set, file);
58935 + HASH_FIELD(result, set, dir);
58936 + HASH_FIELD(result, set, perm);
58937 + HASH_FIELD(result, set, formatting);
58938 + HASH_FIELD(result, set, hash);
58939 + HASH_FIELD(result, set, fibration);
58940 + HASH_FIELD(result, set, sd);
58941 + HASH_FIELD(result, set, dir_item);
58942 + HASH_FIELD(result, set, cipher);
58943 + HASH_FIELD(result, set, digest);
58944 + HASH_FIELD(result, set, compression);
58945 + HASH_FIELD(result, set, compression_mode);
58946 + HASH_FIELD(result, set, cluster);
58947 + HASH_FIELD(result, set, create);
58948 + return result & (PS_TABLE_SIZE - 1);
58949 +}
58950 +
58951 +static inline unsigned long
58952 +pshash(ps_hash_table * table, const unsigned long *a)
58953 +{
58954 + return *a;
58955 +}
58956 +
58957 +/* The hash table definition */
58958 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
58959 +#define KFREE(ptr, size) kfree(ptr)
58960 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
58961 + pseq);
58962 +#undef KFREE
58963 +#undef KMALLOC
58964 +
58965 +static ps_hash_table ps_table;
58966 +static plugin_set empty_set = {
58967 + .hashval = 0,
58968 + .file = NULL,
58969 + .dir = NULL,
58970 + .perm = NULL,
58971 + .formatting = NULL,
58972 + .hash = NULL,
58973 + .fibration = NULL,
58974 + .sd = NULL,
58975 + .dir_item = NULL,
58976 + .cipher = NULL,
58977 + .digest = NULL,
58978 + .compression = NULL,
58979 + .compression_mode = NULL,
58980 + .cluster = NULL,
58981 + .create = NULL,
58982 + .link = {NULL}
58983 +};
58984 +
58985 +plugin_set *plugin_set_get_empty(void)
58986 +{
58987 + return &empty_set;
58988 +}
58989 +
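+/* Editor's note: psets are interned and deliberately never recycled (see the
+ * file comment above), so releasing a reference is a no-op. */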
58990 +void plugin_set_put(plugin_set * set)
58991 +{
58992 +}
58993 +
58994 +static inline unsigned long *pset_field(plugin_set * set, int offset)
58995 +{
58996 + return (unsigned long *)(((char *)set) + offset);
58997 +}
58998 +
58999 +static int plugin_set_field(plugin_set ** set, const unsigned long val,
59000 + const int offset)
59001 +{
59002 + unsigned long *spot;
59003 + spinlock_t *lock;
59004 + plugin_set replica;
59005 + plugin_set *twin;
59006 + plugin_set *psal;
59007 + plugin_set *orig;
59008 +
59009 + assert("nikita-2902", set != NULL);
59010 + assert("nikita-2904", *set != NULL);
59011 +
59012 + spot = pset_field(*set, offset);
59013 + if (unlikely(*spot == val))
59014 + return 0;
59015 +
59016 + replica = *(orig = *set);
59017 + *pset_field(&replica, offset) = val;
59018 + replica.hashval = calculate_hash(&replica);
59019 + rcu_read_lock();
59020 + twin = ps_hash_find(&ps_table, &replica.hashval);
59021 + if (unlikely(twin == NULL)) {
59022 + rcu_read_unlock();
59023 + psal = kmem_cache_alloc(plugin_set_slab,
59024 + reiser4_ctx_gfp_mask_get());
59025 + if (psal == NULL)
59026 + return RETERR(-ENOMEM);
59027 + *psal = replica;
59028 + lock = &plugin_set_lock[replica.hashval & 7];
59029 + spin_lock(lock);
59030 + twin = ps_hash_find(&ps_table, &replica.hashval);
59031 + if (likely(twin == NULL)) {
59032 + *set = psal;
59033 + ps_hash_insert_rcu(&ps_table, psal);
59034 + } else {
59035 + *set = twin;
59036 + kmem_cache_free(plugin_set_slab, psal);
59037 + }
59038 + spin_unlock(lock);
59039 + } else {
59040 + rcu_read_unlock();
59041 + *set = twin;
59042 + }
59043 + return 0;
59044 +}
59045 +
59046 +static struct {
59047 + int offset;
59048 + reiser4_plugin_groups groups;
59049 + reiser4_plugin_type type;
59050 +} pset_descr[PSET_LAST] = {
59051 + [PSET_FILE] = {
59052 + .offset = offsetof(plugin_set, file),
59053 + .type = REISER4_FILE_PLUGIN_TYPE,
59054 + .groups = 0
59055 + },
59056 + [PSET_DIR] = {
59057 + .offset = offsetof(plugin_set, dir),
59058 + .type = REISER4_DIR_PLUGIN_TYPE,
59059 + .groups = 0
59060 + },
59061 + [PSET_PERM] = {
59062 + .offset = offsetof(plugin_set, perm),
59063 + .type = REISER4_PERM_PLUGIN_TYPE,
59064 + .groups = 0
59065 + },
59066 + [PSET_FORMATTING] = {
59067 + .offset = offsetof(plugin_set, formatting),
59068 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
59069 + .groups = 0
59070 + },
59071 + [PSET_HASH] = {
59072 + .offset = offsetof(plugin_set, hash),
59073 + .type = REISER4_HASH_PLUGIN_TYPE,
59074 + .groups = 0
59075 + },
59076 + [PSET_FIBRATION] = {
59077 + .offset = offsetof(plugin_set, fibration),
59078 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
59079 + .groups = 0
59080 + },
59081 + [PSET_SD] = {
59082 + .offset = offsetof(plugin_set, sd),
59083 + .type = REISER4_ITEM_PLUGIN_TYPE,
59084 + .groups = (1 << STAT_DATA_ITEM_TYPE)
59085 + },
59086 + [PSET_DIR_ITEM] = {
59087 + .offset = offsetof(plugin_set, dir_item),
59088 + .type = REISER4_ITEM_PLUGIN_TYPE,
59089 + .groups = (1 << DIR_ENTRY_ITEM_TYPE)
59090 + },
59091 + [PSET_CIPHER] = {
59092 + .offset = offsetof(plugin_set, cipher),
59093 + .type = REISER4_CIPHER_PLUGIN_TYPE,
59094 + .groups = 0
59095 + },
59096 + [PSET_DIGEST] = {
59097 + .offset = offsetof(plugin_set, digest),
59098 + .type = REISER4_DIGEST_PLUGIN_TYPE,
59099 + .groups = 0
59100 + },
59101 + [PSET_COMPRESSION] = {
59102 + .offset = offsetof(plugin_set, compression),
59103 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
59104 + .groups = 0
59105 + },
59106 + [PSET_COMPRESSION_MODE] = {
59107 + .offset = offsetof(plugin_set, compression_mode),
59108 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
59109 + .groups = 0
59110 + },
59111 + [PSET_CLUSTER] = {
59112 + .offset = offsetof(plugin_set, cluster),
59113 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
59114 + .groups = 0
59115 + },
59116 + [PSET_CREATE] = {
59117 + .offset = offsetof(plugin_set, create),
59118 + .type = REISER4_FILE_PLUGIN_TYPE,
59119 + .groups = (1 << REISER4_REGULAR_FILE)
59120 + }
59121 +};
59122 +
59123 +#define DEFINE_PSET_OPS(PREFIX) \
59124 + reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
59125 +{ \
59126 +	if (memb >= PSET_LAST) \
59127 + return REISER4_PLUGIN_TYPES; \
59128 + return pset_descr[memb].type; \
59129 +} \
59130 + \
59131 +int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
59132 + reiser4_plugin * plugin) \
59133 +{ \
59134 + assert("nikita-3492", set != NULL); \
59135 + assert("nikita-3493", *set != NULL); \
59136 + assert("nikita-3494", plugin != NULL); \
59137 + assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
59138 + assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
59139 + \
59140 + if (pset_descr[memb].groups) \
59141 + if (!(pset_descr[memb].groups & plugin->h.groups)) \
59142 + return -EINVAL; \
59143 + \
59144 + return plugin_set_field(set, \
59145 + (unsigned long)plugin, pset_descr[memb].offset); \
59146 +} \
59147 + \
59148 +reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
59149 +{ \
59150 + assert("nikita-3497", set != NULL); \
59151 + assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
59152 + \
59153 + return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
59154 +}
59155 +
59156 +DEFINE_PSET_OPS(aset);
59157 +
59158 +int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin)
59159 +{
59160 + return plugin_set_field(set,
59161 + (unsigned long)plugin, pset_descr[memb].offset);
59162 +}
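+/* Editor's sketch (hypothetical caller, not from the patch): updating one
+ * pset member copies the current set, changes a single field and interns the
+ * result in the hash table; the caller simply sees its pset pointer swapped
+ * for the deduplicated twin: */
+#if 0
+	plugin_set *pset = plugin_set_get_empty();
+	reiser4_plugin *hplug = plugin_by_id(REISER4_HASH_PLUGIN_TYPE,
+					     R5_HASH_ID);
+
+	set_plugin(&pset, PSET_HASH, hplug);
+	/* pset now points to a shared plugin_set whose .hash == hplug */
+#endif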
59163 +
59164 +/**
59165 + * init_plugin_set - create plugin set cache and hash table
59166 + *
59167 + * Initializes slab cache of plugin_set-s and their hash table. It is part of
59168 + * reiser4 module initialization.
59169 + */
59170 +int init_plugin_set(void)
59171 +{
59172 + int result;
59173 +
59174 + result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
59175 + if (result == 0) {
59176 + plugin_set_slab = kmem_cache_create("plugin_set",
59177 + sizeof(plugin_set), 0,
59178 + SLAB_HWCACHE_ALIGN,
59179 + NULL);
59180 + if (plugin_set_slab == NULL)
59181 + result = RETERR(-ENOMEM);
59182 + }
59183 + return result;
59184 +}
59185 +
59186 +/**
59187 + * done_plugin_set - delete plugin_set cache and plugin_set hash table
59188 + *
59189 + * This is called on reiser4 module unloading or system shutdown.
59190 + */
59191 +void done_plugin_set(void)
59192 +{
59193 + plugin_set *cur, *next;
59194 +
59195 + for_all_in_htable(&ps_table, ps, cur, next) {
59196 + ps_hash_remove(&ps_table, cur);
59197 + kmem_cache_free(plugin_set_slab, cur);
59198 + }
59199 + destroy_reiser4_cache(&plugin_set_slab);
59200 + ps_hash_done(&ps_table);
59201 +}
59202 +
59203 +/*
59204 + * Local variables:
59205 + * c-indentation-style: "K&R"
59206 + * mode-name: "LC"
59207 + * c-basic-offset: 8
59208 + * tab-width: 8
59209 + * fill-column: 120
59210 + * End:
59211 + */
59212 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.33/fs/reiser4/plugin/plugin_set.h
59213 --- linux-2.6.33.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 01:00:00.000000000 +0100
59214 +++ linux-2.6.33/fs/reiser4/plugin/plugin_set.h 2010-03-04 19:33:22.000000000 +0100
59215 @@ -0,0 +1,78 @@
59216 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59217 + * reiser4/README */
59218 +
59219 +/* Reiser4 plugin set definition.
59220 + See fs/reiser4/plugin/plugin_set.c for details */
59221 +
59222 +#if !defined(__PLUGIN_SET_H__)
59223 +#define __PLUGIN_SET_H__
59224 +
59225 +#include "../type_safe_hash.h"
59226 +#include "plugin.h"
59227 +
59228 +#include <linux/rcupdate.h>
59229 +
59230 +struct plugin_set;
59231 +typedef struct plugin_set plugin_set;
59232 +
59233 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
59234 +
59235 +struct plugin_set {
59236 + unsigned long hashval;
59237 + /* plugin of file */
59238 + file_plugin *file;
59239 + /* plugin of dir */
59240 + dir_plugin *dir;
59241 + /* perm plugin for this file */
59242 + perm_plugin *perm;
59243 + /* tail policy plugin. Only meaningful for regular files */
59244 + formatting_plugin *formatting;
59245 + /* hash plugin. Only meaningful for directories. */
59246 + hash_plugin *hash;
59247 + /* fibration plugin. Only meaningful for directories. */
59248 + fibration_plugin *fibration;
59249 + /* plugin of stat-data */
59250 + item_plugin *sd;
59251 + /* plugin of items a directory is built of */
59252 + item_plugin *dir_item;
59253 + /* cipher plugin */
59254 + cipher_plugin *cipher;
59255 + /* digest plugin */
59256 + digest_plugin *digest;
59257 + /* compression plugin */
59258 + compression_plugin *compression;
59259 + /* compression mode plugin */
59260 + compression_mode_plugin *compression_mode;
59261 + /* cluster plugin */
59262 + cluster_plugin *cluster;
59263 + /* this specifies file plugin of regular children.
59264 + only meaningful for directories */
59265 + file_plugin *create;
59266 + ps_hash_link link;
59267 +};
59268 +
59269 +extern plugin_set *plugin_set_get_empty(void);
59270 +extern void plugin_set_put(plugin_set * set);
59271 +
59272 +extern int init_plugin_set(void);
59273 +extern void done_plugin_set(void);
59274 +
59275 +extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
59276 +extern int set_plugin(plugin_set ** set, pset_member memb,
59277 + reiser4_plugin * plugin);
59278 +extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
59279 + reiser4_plugin * plugin);
59280 +extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
59281 +
59282 +/* __PLUGIN_SET_H__ */
59283 +#endif
59284 +
59285 +/* Make Linus happy.
59286 + Local variables:
59287 + c-indentation-style: "K&R"
59288 + mode-name: "LC"
59289 + c-basic-offset: 8
59290 + tab-width: 8
59291 + fill-column: 120
59292 + End:
59293 +*/
59294 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/security/Makefile linux-2.6.33/fs/reiser4/plugin/security/Makefile
59295 --- linux-2.6.33.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 01:00:00.000000000 +0100
59296 +++ linux-2.6.33/fs/reiser4/plugin/security/Makefile 2010-03-04 19:33:22.000000000 +0100
59297 @@ -0,0 +1,4 @@
59298 +obj-$(CONFIG_REISER4_FS) += security_plugins.o
59299 +
59300 +security_plugins-objs := \
59301 + perm.o
59302 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/security/perm.c linux-2.6.33/fs/reiser4/plugin/security/perm.c
59303 --- linux-2.6.33.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 01:00:00.000000000 +0100
59304 +++ linux-2.6.33/fs/reiser4/plugin/security/perm.c 2010-03-04 19:33:22.000000000 +0100
59305 @@ -0,0 +1,33 @@
59306 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59307 +
59308 +/*
59309 + * This file contains implementation of permission plugins.
59310 + * See the comments in perm.h
59311 + */
59312 +
59313 +#include "../plugin.h"
59314 +#include "../plugin_header.h"
59315 +#include "../../debug.h"
59316 +
59317 +perm_plugin perm_plugins[LAST_PERM_ID] = {
59318 + [NULL_PERM_ID] = {
59319 + .h = {
59320 + .type_id = REISER4_PERM_PLUGIN_TYPE,
59321 + .id = NULL_PERM_ID,
59322 + .pops = NULL,
59323 + .label = "null",
59324 + .desc = "stub permission plugin",
59325 + .linkage = {NULL, NULL}
59326 + }
59327 + }
59328 +};
59329 +
59330 +/*
59331 + * Local variables:
59332 + * c-indentation-style: "K&R"
59333 + * mode-name: "LC"
59334 + * c-basic-offset: 8
59335 + * tab-width: 8
59336 + * fill-column: 79
59337 + * End:
59338 + */
59339 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/security/perm.h linux-2.6.33/fs/reiser4/plugin/security/perm.h
59340 --- linux-2.6.33.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 01:00:00.000000000 +0100
59341 +++ linux-2.6.33/fs/reiser4/plugin/security/perm.h 2010-03-04 19:33:22.000000000 +0100
59342 @@ -0,0 +1,38 @@
59343 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59344 +
59345 +/* Perm (short for "permissions") plugins common stuff. */
59346 +
59347 +#if !defined( __REISER4_PERM_H__ )
59348 +#define __REISER4_PERM_H__
59349 +
59350 +#include "../../forward.h"
59351 +#include "../plugin_header.h"
59352 +
59353 +#include <linux/types.h>
59354 +
59355 +/* Definition of permission plugin */
59356 +/* NIKITA-FIXME-HANS: define what this is targeted for.
59357 + It does not seem to be intended for use with sys_reiser4. Explain. */
59358 +
59359 +/* NOTE-EDWARD: This seems to be intended for the deprecated sys_reiser4.
59360 +   Consider it a temporary "seam" and a reserved pset member.
59361 +   If you have something useful to add, then rename this plugin and add it here */
59362 +typedef struct perm_plugin {
59363 + /* generic plugin fields */
59364 + plugin_header h;
59365 +} perm_plugin;
59366 +
59367 +typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
59368 +
59369 +/* __REISER4_PERM_H__ */
59370 +#endif
59371 +
59372 +/* Make Linus happy.
59373 + Local variables:
59374 + c-indentation-style: "K&R"
59375 + mode-name: "LC"
59376 + c-basic-offset: 8
59377 + tab-width: 8
59378 + fill-column: 120
59379 + End:
59380 +*/
59381 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.33/fs/reiser4/plugin/space/bitmap.c
59382 --- linux-2.6.33.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 01:00:00.000000000 +0100
59383 +++ linux-2.6.33/fs/reiser4/plugin/space/bitmap.c 2010-03-04 19:33:22.000000000 +0100
59384 @@ -0,0 +1,1585 @@
59385 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59386 +
59387 +#include "../../debug.h"
59388 +#include "../../dformat.h"
59389 +#include "../../txnmgr.h"
59390 +#include "../../jnode.h"
59391 +#include "../../block_alloc.h"
59392 +#include "../../tree.h"
59393 +#include "../../super.h"
59394 +#include "../plugin.h"
59395 +#include "space_allocator.h"
59396 +#include "bitmap.h"
59397 +
59398 +#include <linux/types.h>
59399 +#include <linux/fs.h> /* for struct super_block */
59400 +#include <linux/mutex.h>
59401 +#include <asm/div64.h>
59402 +
59403 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
59404 + * blocks
59405 +
59406 + A useful optimization of reiser4 bitmap handling would be dynamic bitmap
59407 + blocks loading/unloading which is different from v3.x where all bitmap
59408 + blocks are loaded at mount time.
59409 +
59410 + To implement bitmap blocks unloading we need to count bitmap block usage
59411 + and detect currently unused blocks allowing them to be unloaded. It is not
59412 + a simple task since we allow several threads to modify one bitmap block
59413 + simultaneously.
59414 +
59415 +   Briefly speaking, the following scheme is proposed: a counter is kept in a
59416 +   special variable associated with each bitmap block, counting block
59417 +   alloc/dealloc operations on that bitmap block. With the deferred block
59418 +   deallocation feature of reiser4, all those operations will be represented in
59419 +   atom dirty/deleted lists as jnodes for freshly allocated or deleted
59420 +   nodes.
59421 +
59422 +   So, we increment the usage counter for each new node allocated or deleted,
59423 +   and decrement it at atom commit, once for each node from the dirty/deleted
59424 +   atom's list. Of course, deletion of a freshly allocated node and node reuse
59425 +   from the atom's deleted list (if we do so) should decrement the bitmap usage
59426 +   counter as well.
59427 +
59428 +   This scheme seems workable, but such reference counting is
59429 +   not easy to debug. I think we should agree with Hans and not implement
59430 +   it in v4.0. The current code implements "on-demand" bitmap block loading only.
59431 +
59432 +   For simplicity, all bitmap nodes (both commit and working bitmap blocks) are
59433 +   either loaded into memory at fs mount time or each bitmap node is loaded on
59434 +   first access to it; the "dont_load_bitmap" mount option controls whether
59435 +   bitmap nodes should be loaded at mount time. Dynamic unloading of bitmap
59436 +   nodes is currently not supported. */
59437 +
59438 +#define CHECKSUM_SIZE 4
59439 +
59440 +#define BYTES_PER_LONG (sizeof(long))
59441 +
59442 +#if BITS_PER_LONG == 64
59443 +# define LONG_INT_SHIFT (6)
59444 +#else
59445 +# define LONG_INT_SHIFT (5)
59446 +#endif
59447 +
59448 +#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
59449 +
59450 +typedef unsigned long ulong_t;
59451 +
59452 +#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
59453 +#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
59454 +
59455 +/* Block allocation/deallocation are done through special bitmap objects which
59456 + are allocated in an array at fs mount. */
59457 +struct bitmap_node {
59458 + struct mutex mutex; /* long term lock object */
59459 +
59460 + jnode *wjnode; /* j-nodes for WORKING ... */
59461 + jnode *cjnode; /* ... and COMMIT bitmap blocks */
59462 +
59463 + bmap_off_t first_zero_bit; /* for skip_busy option implementation */
59464 +
59465 + atomic_t loaded; /* a flag which shows that bnode is loaded
59466 + * already */
59467 +};
59468 +
59469 +static inline char *bnode_working_data(struct bitmap_node *bnode)
59470 +{
59471 + char *data;
59472 +
59473 + data = jdata(bnode->wjnode);
59474 + assert("zam-429", data != NULL);
59475 +
59476 + return data + CHECKSUM_SIZE;
59477 +}
59478 +
59479 +static inline char *bnode_commit_data(const struct bitmap_node *bnode)
59480 +{
59481 + char *data;
59482 +
59483 + data = jdata(bnode->cjnode);
59484 + assert("zam-430", data != NULL);
59485 +
59486 + return data + CHECKSUM_SIZE;
59487 +}
59488 +
59489 +static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
59490 +{
59491 + char *data;
59492 +
59493 + data = jdata(bnode->cjnode);
59494 + assert("vpf-261", data != NULL);
59495 +
59496 + return le32_to_cpu(get_unaligned((d32 *)data));
59497 +}
59498 +
59499 +static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
59500 +{
59501 + char *data;
59502 +
59503 + data = jdata(bnode->cjnode);
59504 + assert("vpf-261", data != NULL);
59505 +
59506 + put_unaligned(cpu_to_le32(crc), (d32 *)data);
59507 +}
59508 +
59509 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
59510 + * written the code, does this added abstraction still have */
59511 +/* ANSWER(Zam): No, the abstraction is in the level above (the exact place is
59512 + * the reiser4_space_allocator structure) */
59513 +/* ZAM-FIXME-HANS: I don't understand your English in the comment above. */
59514 +/* FIXME-HANS(Zam): I don't understand questions like "might be a union
59515 + * someday?". What are they about? If there is a reason to have a union, it
59516 + * should be a union; if not, it should not be a union. "..might be someday"
59517 + * means no reason. */
59518 +struct bitmap_allocator_data {
59519 + /* an array for bitmap blocks direct access */
59520 + struct bitmap_node *bitmap;
59521 +};
59522 +
59523 +#define get_barray(super) \
59524 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
59525 +
59526 +#define get_bnode(super, i) (get_barray(super) + i)
59527 +
59528 +/* allocate and initialize jnode with JNODE_BITMAP type */
59529 +static jnode *bnew(void)
59530 +{
59531 + jnode *jal = jalloc();
59532 +
59533 + if (jal)
59534 + jnode_init(jal, current_tree, JNODE_BITMAP);
59535 +
59536 + return jal;
59537 +}
59538 +
59539 +/* this file contains:
59540 + - bitmap based implementation of space allocation plugin
59541 + - all the helper functions like set bit, find_first_zero_bit, etc */
59542 +
59543 +/* Audited by: green(2002.06.12) */
59544 +static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
59545 +{
59546 + ulong_t mask = 1UL << start_bit;
59547 + int i = start_bit;
59548 +
59549 + while ((word & mask) != 0) {
59550 + mask <<= 1;
59551 + if (++i >= BITS_PER_LONG)
59552 + break;
59553 + }
59554 +
59555 + return i;
59556 +}
59557 +
59558 +#include <linux/bitops.h>
59559 +
59560 +#if BITS_PER_LONG == 64
59561 +
59562 +#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
59563 +#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
59564 +
59565 +static inline void reiser4_set_bit(int nr, void *addr)
59566 +{
59567 + ext2_set_bit(nr + OFF(addr), BASE(addr));
59568 +}
59569 +
59570 +static inline void reiser4_clear_bit(int nr, void *addr)
59571 +{
59572 + ext2_clear_bit(nr + OFF(addr), BASE(addr));
59573 +}
59574 +
59575 +static inline int reiser4_test_bit(int nr, void *addr)
59576 +{
59577 + return ext2_test_bit(nr + OFF(addr), BASE(addr));
59578 +}
59579 +static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
59580 + int offset)
59581 +{
59582 + int off = OFF(addr);
59583 +
59584 + return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
59585 + offset + off) - off;
59586 +}
59587 +
59588 +#else
59589 +
59590 +#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
59591 +#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
59592 +#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
59593 +
59594 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
59595 +ext2_find_next_zero_bit(addr, maxoffset, offset)
59596 +#endif
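+/* Editor's note: on 64-bit hosts the ext2_*_bit helpers expect a long-aligned
+ * base address. OFF()/BASE() rebase an arbitrary byte pointer onto the
+ * enclosing long: e.g. for addr = base + 6, BASE(addr) rounds down to base
+ * and OFF(addr) = 6 * 8 = 48, so bit nr relative to addr becomes bit
+ * (nr + 48) relative to BASE(addr). */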
59597 +
59598 +/* Search for a set bit in the bit array [@start_offset, @max_offset); offsets
59599 + * are counted from @addr. Return the offset of the first set bit if one is
59600 + * found, @max_offset otherwise. */
59601 +static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59602 + bmap_off_t start_offset)
59603 +{
59604 + ulong_t *base = addr;
59605 +	/* start_offset is in bits, convert it to a word index within the bitmap. */
59606 +	int word_nr = start_offset >> LONG_INT_SHIFT;
59607 +	/* bit number within the word. */
59608 + int bit_nr = start_offset & LONG_INT_MASK;
59609 + int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
59610 +
59611 + assert("zam-387", max_offset != 0);
59612 +
59613 + /* Unaligned @start_offset case. */
59614 + if (bit_nr != 0) {
59615 + bmap_nr_t nr;
59616 +
59617 + nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
59618 +
59619 + if (nr < BITS_PER_LONG)
59620 + return (word_nr << LONG_INT_SHIFT) + nr;
59621 +
59622 + ++word_nr;
59623 + }
59624 +
59625 +	/* Fast scan through aligned words. */
59626 + while (word_nr <= max_word_nr) {
59627 + if (base[word_nr] != 0) {
59628 + return (word_nr << LONG_INT_SHIFT)
59629 + + find_next_zero_bit_in_word(~(base[word_nr]), 0);
59630 + }
59631 +
59632 + ++word_nr;
59633 + }
59634 +
59635 + return max_offset;
59636 +}
59637 +
59638 +#if BITS_PER_LONG == 64
59639 +
59640 +static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59641 + bmap_off_t start_offset)
59642 +{
59643 + bmap_off_t off = OFF(addr);
59644 +
59645 + return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
59646 + start_offset + off) - off;
59647 +}
59648 +
59649 +#else
59650 +#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
59651 + __reiser4_find_next_set_bit(addr, max_offset, start_offset)
59652 +#endif
59653 +
59654 +/* Search backward for a set bit in a single word, starting from @start_bit. */
59655 +static int find_last_set_bit_in_word(ulong_t word, int start_bit)
59656 +{
59657 + ulong_t bit_mask;
59658 + int nr = start_bit;
59659 +
59660 + assert("zam-965", start_bit < BITS_PER_LONG);
59661 + assert("zam-966", start_bit >= 0);
59662 +
59663 + bit_mask = (1UL << nr);
59664 +
59665 + while (bit_mask != 0) {
59666 + if (bit_mask & word)
59667 + return nr;
59668 + bit_mask >>= 1;
59669 + nr--;
59670 + }
59671 + return BITS_PER_LONG;
59672 +}
59673 +
59674 +/* Search bitmap for a set bit in backward direction from the end to the
59675 + * beginning of given region
59676 + *
59677 + * @result: result offset of the last set bit
59678 + * @addr: base memory address,
59679 + * @low_off: low end of the search region, edge bit included into the region,
59680 + * @high_off: high end of the search region, edge bit included into the region,
59681 + *
59682 + * @return: 0 - set bit was found, -1 otherwise.
59683 + */
59684 +static int
59685 +reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59686 + bmap_off_t high_off)
59687 +{
59688 + ulong_t *base = addr;
59689 + int last_word;
59690 + int first_word;
59691 + int last_bit;
59692 + int nr;
59693 +
59694 + assert("zam-962", high_off >= low_off);
59695 +
59696 + last_word = high_off >> LONG_INT_SHIFT;
59697 + last_bit = high_off & LONG_INT_MASK;
59698 + first_word = low_off >> LONG_INT_SHIFT;
59699 +
59700 + if (last_bit < BITS_PER_LONG) {
59701 + nr = find_last_set_bit_in_word(base[last_word], last_bit);
59702 + if (nr < BITS_PER_LONG) {
59703 + *result = (last_word << LONG_INT_SHIFT) + nr;
59704 + return 0;
59705 + }
59706 + --last_word;
59707 + }
59708 + while (last_word >= first_word) {
59709 + if (base[last_word] != 0x0) {
59710 + last_bit =
59711 + find_last_set_bit_in_word(base[last_word],
59712 + BITS_PER_LONG - 1);
59713 + assert("zam-972", last_bit < BITS_PER_LONG);
59714 + *result = (last_word << LONG_INT_SHIFT) + last_bit;
59715 + return 0;
59716 + }
59717 + --last_word;
59718 + }
59719 +
59720 + return -1; /* set bit not found */
59721 +}
59722 +
59723 +/* Search bitmap for a clear bit in backward direction from the end to the
59724 + * beginning of given region */
59725 +static int
59726 +reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59727 + bmap_off_t high_off)
59728 +{
59729 + ulong_t *base = addr;
59730 + int last_word;
59731 + int first_word;
59732 + int last_bit;
59733 + int nr;
59734 +
59735 + last_word = high_off >> LONG_INT_SHIFT;
59736 + last_bit = high_off & LONG_INT_MASK;
59737 + first_word = low_off >> LONG_INT_SHIFT;
59738 +
59739 + if (last_bit < BITS_PER_LONG) {
59740 + nr = find_last_set_bit_in_word(~base[last_word], last_bit);
59741 + if (nr < BITS_PER_LONG) {
59742 + *result = (last_word << LONG_INT_SHIFT) + nr;
59743 + return 0;
59744 + }
59745 + --last_word;
59746 + }
59747 + while (last_word >= first_word) {
59748 + if (base[last_word] != (ulong_t) (-1)) {
59749 + *result = (last_word << LONG_INT_SHIFT) +
59750 + find_last_set_bit_in_word(~base[last_word],
59751 + BITS_PER_LONG - 1);
59752 + return 0;
59753 + }
59754 + --last_word;
59755 + }
59756 +
59757 + return -1; /* zero bit not found */
59758 +}
59759 +
59760 +/* Audited by: green(2002.06.12) */
59761 +static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
59762 +{
59763 + int first_byte;
59764 + int last_byte;
59765 +
59766 + unsigned char first_byte_mask = 0xFF;
59767 + unsigned char last_byte_mask = 0xFF;
59768 +
59769 + assert("zam-410", start < end);
59770 +
59771 + first_byte = start >> 3;
59772 + last_byte = (end - 1) >> 3;
59773 +
59774 + if (last_byte > first_byte + 1)
59775 + memset(addr + first_byte + 1, 0,
59776 + (size_t) (last_byte - first_byte - 1));
59777 +
59778 + first_byte_mask >>= 8 - (start & 0x7);
59779 + last_byte_mask <<= ((end - 1) & 0x7) + 1;
59780 +
59781 + if (first_byte == last_byte) {
59782 + addr[first_byte] &= (first_byte_mask | last_byte_mask);
59783 + } else {
59784 + addr[first_byte] &= first_byte_mask;
59785 + addr[last_byte] &= last_byte_mask;
59786 + }
59787 +}
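+/* Editor's worked example for reiser4_clear_bits(): with start = 3, end = 10
+ * we get first_byte = 0, last_byte = 1, first_byte_mask = 0xFF >> 5 = 0x07
+ * and last_byte_mask = 0xFF << 2 = 0xFC, so addr[0] &= 0x07 clears bits 3..7
+ * and addr[1] &= 0xFC clears bits 0..1, i.e. bits 3..9 overall. */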
59788 +
59789 +/* Audited by: green(2002.06.12) */
59790 +/* ZAM-FIXME-HANS: comment this */
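+/* Editor's note, answering the FIXME above: set bits [start, end) in the
+ * bitmap at @addr. Whole bytes strictly between the first and last affected
+ * bytes are memset() to 0xFF; the two edge bytes are patched with masks
+ * built the same way as in reiser4_clear_bits() above. */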
59791 +static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
59792 +{
59793 + int first_byte;
59794 + int last_byte;
59795 +
59796 + unsigned char first_byte_mask = 0xFF;
59797 + unsigned char last_byte_mask = 0xFF;
59798 +
59799 + assert("zam-386", start < end);
59800 +
59801 + first_byte = start >> 3;
59802 + last_byte = (end - 1) >> 3;
59803 +
59804 + if (last_byte > first_byte + 1)
59805 + memset(addr + first_byte + 1, 0xFF,
59806 + (size_t) (last_byte - first_byte - 1));
59807 +
59808 + first_byte_mask <<= start & 0x7;
59809 + last_byte_mask >>= 7 - ((end - 1) & 0x7);
59810 +
59811 + if (first_byte == last_byte) {
59812 + addr[first_byte] |= (first_byte_mask & last_byte_mask);
59813 + } else {
59814 + addr[first_byte] |= first_byte_mask;
59815 + addr[last_byte] |= last_byte_mask;
59816 + }
59817 +}
59818 +
59819 +#define ADLER_BASE 65521
59820 +#define ADLER_NMAX 5552
59821 +
59822 +/* Calculates the adler32 checksum for the data pointed to by `data` of the
59823 + length `len`. This function was originally taken from zlib, version 1.1.3,
59824 + July 9th, 1998.
59825 +
59826 + Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
59827 +
59828 + This software is provided 'as-is', without any express or implied
59829 + warranty. In no event will the authors be held liable for any damages
59830 + arising from the use of this software.
59831 +
59832 + Permission is granted to anyone to use this software for any purpose,
59833 + including commercial applications, and to alter it and redistribute it
59834 + freely, subject to the following restrictions:
59835 +
59836 + 1. The origin of this software must not be misrepresented; you must not
59837 + claim that you wrote the original software. If you use this software
59838 + in a product, an acknowledgment in the product documentation would be
59839 + appreciated but is not required.
59840 + 2. Altered source versions must be plainly marked as such, and must not be
59841 + misrepresented as being the original software.
59842 + 3. This notice may not be removed or altered from any source distribution.
59843 +
59844 + Jean-loup Gailly Mark Adler
59845 + jloup@gzip.org madler@alumni.caltech.edu
59846 +
59847 + The above comment applies only to the reiser4_adler32 function.
59848 +*/
59849 +
59850 +__u32 reiser4_adler32(char *data, __u32 len)
59851 +{
59852 + unsigned char *t = data;
59853 + __u32 s1 = 1;
59854 + __u32 s2 = 0;
59855 + int k;
59856 +
59857 + while (len > 0) {
59858 + k = len < ADLER_NMAX ? len : ADLER_NMAX;
59859 + len -= k;
59860 +
59861 + while (k--) {
59862 + s1 += *t++;
59863 + s2 += s1;
59864 + }
59865 +
59866 + s1 %= ADLER_BASE;
59867 + s2 %= ADLER_BASE;
59868 + }
59869 + return (s2 << 16) | s1;
59870 +}
59871 +
59872 +#define sb_by_bnode(bnode) \
59873 + ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
59874 +
59875 +static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
59876 +{
59877 + return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
59878 +}
59879 +
59880 +static int
59881 +bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
59882 +{
59883 + if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
59884 + bmap_nr_t bmap;
59885 +
59886 + bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
59887 +
59888 + warning("vpf-263",
59889 + "Checksum for the bitmap block %llu is incorrect",
59890 + bmap);
59891 +
59892 + return RETERR(-EIO);
59893 + }
59894 +
59895 + return 0;
59896 +}
59897 +
59898 +#define REISER4_CHECK_BMAP_CRC (0)
59899 +
59900 +#if REISER4_CHECK_BMAP_CRC
59901 +static int bnode_check_crc(const struct bitmap_node *bnode)
59902 +{
59903 + return bnode_check_adler32(bnode,
59904 + bmap_size(sb_by_bnode(bnode)->s_blocksize));
59905 +}
59906 +
59907 +/* REISER4_CHECK_BMAP_CRC */
59908 +#else
59909 +
59910 +#define bnode_check_crc(bnode) (0)
59911 +
59912 +/* REISER4_CHECK_BMAP_CRC */
59913 +#endif
59914 +
59915 +/* Recalculates the adler32 checksum after a single byte change.
59916 +   adler - previous adler checksum
59917 +   old_data, data - old and new byte values.
59918 +   tail == (chunk - offset), where chunk is the length the checksum was
59919 +   calculated over and offset is the offset of the changed byte within it.
59920 +   This function can be used to optimise checksum calculation.
59921 +*/
59922 +
59923 +static __u32
59924 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
59925 + __u32 tail)
59926 +{
59927 + __u32 delta = data - old_data + 2 * ADLER_BASE;
59928 + __u32 s1 = adler & 0xffff;
59929 + __u32 s2 = (adler >> 16) & 0xffff;
59930 +
59931 + s1 = (delta + s1) % ADLER_BASE;
59932 + s2 = (delta * tail + s2) % ADLER_BASE;
59933 +
59934 + return (s2 << 16) | s1;
59935 +}
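+/* Editor's sketch (illustrative check, not from the patch): the incremental
+ * update must agree with full recomputation. For an 8-byte buffer whose
+ * byte at offset 2 changes from 'c' to 'X', tail = 8 - 2: */
+#if 0
+	char buf[9] = "abcdefgh";
+	__u32 full = reiser4_adler32(buf, 8);
+
+	buf[2] = 'X';
+	BUG_ON(adler32_recalc(full, 'c', 'X', 8 - 2) !=
+	       reiser4_adler32(buf, 8));
+#endif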
59936 +
59937 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
59938 +
59939 +/**
59940 + * get_nr_bmap - calculate number of bitmap blocks
59941 + * @super: super block with initialized blocksize and block count
59942 + *
59943 + * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
59944 + * to maintain free disk space. It assumes that each bitmap addresses the same
59945 + * number of blocks, which is calculated by the bmap_bit_count macro defined
59946 + * above. The number of blocks in the filesystem has to be initialized in the
59947 + * reiser4 private data of the super block already, so that it can be obtained
59948 + * via reiser4_block_count(). Unfortunately, the number of blocks addressed by
59949 + * a bitmap is not a power of 2, because 4 bytes are used for the checksum.
59950 + * Therefore, we have to use special functions to divide and modulo 64-bit
59951 + * filesystem block counters.
59952 + *
59953 + * Example: suppose the filesystem has 32768 blocks. Blocksize is 4096. Each
59954 + * bitmap block addresses (4096 - 4) * 8 = 32736 blocks. The number of bitmaps
59955 + * to address all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
59956 + */
59957 +static bmap_nr_t get_nr_bmap(const struct super_block *super)
59958 +{
59959 + u64 quotient;
59960 +
59961 + assert("zam-393", reiser4_block_count(super) != 0);
59962 +
59963 + quotient = reiser4_block_count(super) - 1;
59964 + do_div(quotient, bmap_bit_count(super->s_blocksize));
59965 + return quotient + 1;
59966 +}
59967 +
59968 +/**
59969 + * parse_blocknr - calculate bitmap number and offset in it by block number
59970 + * @block: pointer to block number to calculate location in bitmap of
59971 + * @bmap: pointer where to store bitmap block number
59972 + * @offset: pointer where to store offset within bitmap block
59973 + *
59974 + * Calculates location of bit which is responsible for allocation/freeing of
59975 + * block @*block. That location is represented by bitmap block number and offset
59976 + * within that bitmap block.
59977 + */
59978 +static void
59979 +parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
59980 + bmap_off_t *offset)
59981 +{
59982 + struct super_block *super = get_current_context()->super;
59983 + u64 quotient = *block;
59984 +
59985 + *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
59986 + *bmap = quotient;
59987 +
59988 + assert("zam-433", *bmap < get_nr_bmap(super));
59989 + assert("", *offset < bmap_bit_count(super->s_blocksize));
59990 +}
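+/* Editor's worked example for parse_blocknr(): with a 4096-byte blocksize
+ * each bitmap covers (4096 - 4) * 8 = 32736 blocks, so block 40000 parses
+ * to bmap = 1, offset = 40000 - 32736 = 7264. */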
59991 +
59992 +#if REISER4_DEBUG
59993 +/* Audited by: green(2002.06.12) */
59994 +static void
59995 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
59996 +{
59997 + struct super_block *sb = reiser4_get_current_sb();
59998 +
59999 + assert("zam-436", sb != NULL);
60000 +
60001 + assert("zam-455", start != NULL);
60002 + assert("zam-437", *start != 0);
60003 + assert("zam-541", !reiser4_blocknr_is_fake(start));
60004 + assert("zam-441", *start < reiser4_block_count(sb));
60005 +
60006 + if (len != NULL) {
60007 + assert("zam-438", *len != 0);
60008 + assert("zam-442", *start + *len <= reiser4_block_count(sb));
60009 + }
60010 +}
60011 +
60012 +static void check_bnode_loaded(const struct bitmap_node *bnode)
60013 +{
60014 + assert("zam-485", bnode != NULL);
60015 + assert("zam-483", jnode_page(bnode->wjnode) != NULL);
60016 + assert("zam-484", jnode_page(bnode->cjnode) != NULL);
60017 + assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
60018 + assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
60019 +}
60020 +
60021 +#else
60022 +
60023 +# define check_block_range(start, len) do { /* nothing */} while(0)
60024 +# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
60025 +
60026 +#endif
60027 +
60028 +/* modify bnode->first_zero_bit (if we free bits before); bnode should be
60029 + spin-locked */
60030 +static inline void
60031 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
60032 +{
60033 + if (offset < bnode->first_zero_bit)
60034 + bnode->first_zero_bit = offset;
60035 +}
60036 +
60037 +/* return a physical disk address for logical bitmap number @bmap */
60038 +/* FIXME-VS: this is somehow related to disk layout? */
60039 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
60040 + * per block allocation so that performance is not affected. Probably this
60041 + * whole file should be considered part of the disk layout plugin, and other
60042 + * disk layouts can use other defines and efficiency will not be significantly
60043 + * affected. */
60044 +
60045 +#define REISER4_FIRST_BITMAP_BLOCK \
60046 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
60047 +
60048 +/* Audited by: green(2002.06.12) */
60049 +static void
60050 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
60051 + reiser4_block_nr * bnr)
60052 +{
60053 +
60054 + assert("zam-390", bmap < get_nr_bmap(super));
60055 +
60056 +#ifdef CONFIG_REISER4_BADBLOCKS
60057 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
60058 + /* Check if the diskmap has this already, first. */
60059 + if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
60060 + return; /* Found it in diskmap */
60061 +#endif
60062 + /* FIXME_ZAM: before any discussion of disk layouts and disk format
60063 + plugins I implement a bitmap location scheme which is close to the
60064 + scheme used in reiser 3.6 */
60065 + if (bmap == 0) {
60066 + *bnr = REISER4_FIRST_BITMAP_BLOCK;
60067 + } else {
60068 + *bnr = bmap * bmap_bit_count(super->s_blocksize);
60069 + }
60070 +}
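+
+/* With this scheme bitmap block 0 sits at the fixed location
+ * REISER4_FIRST_BITMAP_BLOCK, while bitmap block N (N > 0) is placed at the
+ * first block of its own responsibility zone, i.e. at block
+ * N * bmap_bit_count(s_blocksize). E.g., assuming 32768 bits per bitmap
+ * block (illustrative value only): bmap 1 -> block 32768,
+ * bmap 2 -> block 65536. */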
60071 +
60072 +/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
60073 +/* Audited by: green(2002.06.12) */
60074 +static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
60075 +{
60076 + *bnr =
60077 + (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
60078 + REISER4_BITMAP_BLOCKS_STATUS_VALUE);
60079 +}
60080 +
60081 +/* bnode structure initialization */
60082 +static void
60083 +init_bnode(struct bitmap_node *bnode,
60084 + struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
60085 +{
60086 + memset(bnode, 0, sizeof(struct bitmap_node));
60087 +
60088 + mutex_init(&bnode->mutex);
60089 + atomic_set(&bnode->loaded, 0);
60090 +}
60091 +
60092 +static void release(jnode * node)
60093 +{
60094 + jrelse(node);
60095 + JF_SET(node, JNODE_HEARD_BANSHEE);
60096 + jput(node);
60097 +}
60098 +
60099 +/* This function is for internal bitmap.c use because it assumes that the jnode
60100 + is under the full control of this thread */
60101 +static void done_bnode(struct bitmap_node *bnode)
60102 +{
60103 + if (bnode) {
60104 + atomic_set(&bnode->loaded, 0);
60105 + if (bnode->wjnode != NULL)
60106 + release(bnode->wjnode);
60107 + if (bnode->cjnode != NULL)
60108 + release(bnode->cjnode);
60109 + bnode->wjnode = bnode->cjnode = NULL;
60110 + }
60111 +}
60112 +
60113 +/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/
60114 +static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
60115 + jnode **wjnode_ret)
60116 +{
60117 + struct super_block *super;
60118 + jnode *cjnode;
60119 + jnode *wjnode;
60120 + bmap_nr_t bmap;
60121 + int ret;
60122 +
60123 + super = reiser4_get_current_sb();
60124 +
60125 + *wjnode_ret = wjnode = bnew();
60126 + if (wjnode == NULL) {
60127 + *cjnode_ret = NULL;
60128 + return RETERR(-ENOMEM);
60129 + }
60130 +
60131 + *cjnode_ret = cjnode = bnew();
60132 + if (cjnode == NULL)
60133 + return RETERR(-ENOMEM);
60134 +
60135 + bmap = bnode - get_bnode(super, 0);
60136 +
60137 + get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
60138 + get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
60139 +
60140 + jref(cjnode);
60141 + jref(wjnode);
60142 +
60143 + /* load commit bitmap */
60144 + ret = jload_gfp(cjnode, GFP_NOFS, 1);
60145 +
60146 + if (ret)
60147 + goto error;
60148 +
60149 + /* allocate memory for working bitmap block. Note that for
60150 + * bitmaps jinit_new() doesn't actually modify node content,
60151 + * so parallel calls to this are ok. */
60152 + ret = jinit_new(wjnode, GFP_NOFS);
60153 +
60154 + if (ret != 0) {
60155 + jrelse(cjnode);
60156 + goto error;
60157 + }
60158 +
60159 + return 0;
60160 +
60161 + error:
60162 + jput(cjnode);
60163 + jput(wjnode);
60164 + *wjnode_ret = *cjnode_ret = NULL;
60165 + return ret;
60166 +
60167 +}
60168 +
60169 +/* Check the bnode data on read. */
60170 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
60171 +{
60172 + void *data;
60173 + int ret;
60174 +
60175 + /* Check CRC */
60176 + ret = bnode_check_adler32(bnode, blksize);
60177 +
60178 + if (ret) {
60179 + return ret;
60180 + }
60181 +
60182 + data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
60183 +
60184 + /* Check the very first bit -- it must be busy. */
60185 + if (!reiser4_test_bit(0, data)) {
60186 + warning("vpf-1362", "The allocator block %llu is not marked "
60187 + "as used.", (unsigned long long)bnode->cjnode->blocknr);
60188 +
60189 + return -EINVAL;
60190 + }
60191 +
60192 + return 0;
60193 +}
60194 +
60195 +/* load bitmap blocks "on-demand" */
60196 +static int load_and_lock_bnode(struct bitmap_node *bnode)
60197 +{
60198 + int ret;
60199 +
60200 + jnode *cjnode;
60201 + jnode *wjnode;
60202 +
60203 + assert("nikita-3040", reiser4_schedulable());
60204 +
60205 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
60206 + * need to be atomic, right? Just leave a comment that if bitmaps were
60207 + * unloadable, this would need to be atomic. */
60208 + if (atomic_read(&bnode->loaded)) {
60209 + /* bitmap is already loaded, nothing to do */
60210 + check_bnode_loaded(bnode);
60211 + mutex_lock(&bnode->mutex);
60212 + assert("nikita-2827", atomic_read(&bnode->loaded));
60213 + return 0;
60214 + }
60215 +
60216 + ret = prepare_bnode(bnode, &cjnode, &wjnode);
60217 + if (ret == 0) {
60218 + mutex_lock(&bnode->mutex);
60219 +
60220 + if (!atomic_read(&bnode->loaded)) {
60221 + assert("nikita-2822", cjnode != NULL);
60222 + assert("nikita-2823", wjnode != NULL);
60223 + assert("nikita-2824", jnode_is_loaded(cjnode));
60224 + assert("nikita-2825", jnode_is_loaded(wjnode));
60225 +
60226 + bnode->wjnode = wjnode;
60227 + bnode->cjnode = cjnode;
60228 +
60229 + ret = check_struct_bnode(bnode, current_blocksize);
60230 + if (!ret) {
60231 + cjnode = wjnode = NULL;
60232 + atomic_set(&bnode->loaded, 1);
60233 + /* working bitmap is initialized by on-disk
60234 + * commit bitmap. This should be performed
60235 + * under mutex. */
60236 + memcpy(bnode_working_data(bnode),
60237 + bnode_commit_data(bnode),
60238 + bmap_size(current_blocksize));
60239 + } else
60240 + mutex_unlock(&bnode->mutex);
60241 + } else
60242 + /* race: someone already loaded bitmap while we were
60243 + * busy initializing data. */
60244 + check_bnode_loaded(bnode);
60245 + }
60246 +
60247 + if (wjnode != NULL) {
60248 + release(wjnode);
60249 + bnode->wjnode = NULL;
60250 + }
60251 + if (cjnode != NULL) {
60252 + release(cjnode);
60253 + bnode->cjnode = NULL;
60254 + }
60255 +
60256 + return ret;
60257 +}
60258 +
60259 +static void release_and_unlock_bnode(struct bitmap_node *bnode)
60260 +{
60261 + check_bnode_loaded(bnode);
60262 + mutex_unlock(&bnode->mutex);
60263 +}
60264 +
60265 +/* This function does all block allocation work but only for one bitmap
60266 + block.*/
60267 +/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
60268 + block responsibility zone boundaries. This made no sense in v3.6 but may
60269 + make sense in v4.x */
60270 +/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
60271 +static int
60272 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
60273 + bmap_off_t max_offset, int min_len, int max_len)
60274 +{
60275 + struct super_block *super = get_current_context()->super;
60276 + struct bitmap_node *bnode = get_bnode(super, bmap);
60277 +
60278 + char *data;
60279 +
60280 + bmap_off_t search_end;
60281 + bmap_off_t start;
60282 + bmap_off_t end;
60283 +
60284 + int set_first_zero_bit = 0;
60285 +
60286 + int ret;
60287 +
60288 + assert("zam-364", min_len > 0);
60289 + assert("zam-365", max_len >= min_len);
60290 + assert("zam-366", *offset <= max_offset);
60291 +
60292 + ret = load_and_lock_bnode(bnode);
60293 +
60294 + if (ret)
60295 + return ret;
60296 +
60297 + data = bnode_working_data(bnode);
60298 +
60299 + start = *offset;
60300 +
60301 + if (bnode->first_zero_bit >= start) {
60302 + start = bnode->first_zero_bit;
60303 + set_first_zero_bit = 1;
60304 + }
60305 +
60306 + while (start + min_len < max_offset) {
60307 +
60308 + start =
60309 + reiser4_find_next_zero_bit((long *)data, max_offset, start);
60310 + if (set_first_zero_bit) {
60311 + bnode->first_zero_bit = start;
60312 + set_first_zero_bit = 0;
60313 + }
60314 + if (start >= max_offset)
60315 + break;
60316 +
60317 + search_end = LIMIT(start + max_len, max_offset);
60318 + end =
60319 + reiser4_find_next_set_bit((long *)data, search_end, start);
60320 + if (end >= start + min_len) {
60321 + /* we can't trust the find_next_set_bit result if a set
60322 + bit was not found; the result may be bigger than
60323 + max_offset */
60324 + if (end > search_end)
60325 + end = search_end;
60326 +
60327 + ret = end - start;
60328 + *offset = start;
60329 +
60330 + reiser4_set_bits(data, start, end);
60331 +
60332 + /* FIXME: we may advance first_zero_bit if [start,
60333 + end] region overlaps the first_zero_bit point */
60334 +
60335 + break;
60336 + }
60337 +
60338 + start = end + 1;
60339 + }
60340 +
60341 + release_and_unlock_bnode(bnode);
60342 +
60343 + return ret;
60344 +}
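+
+/* Worked example of the loop above (illustrative bitmap, not real data):
+ * working data 11010000 (bit 0 leftmost), min_len = 2, max_len = 3,
+ * *offset = 0, max_offset = 8:
+ * pass 1: find_next_zero_bit -> 2, find_next_set_bit in [2,5) -> 3;
+ * 3 < 2 + 2, so the gap is too short and we restart at start = 4;
+ * pass 2: find_next_zero_bit -> 4, find_next_set_bit in [4,7) finds
+ * nothing, so end is clamped to search_end = 7; bits 4..6 are set,
+ * 3 is returned and *offset = 4. */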
60345 +
60346 +static int
60347 +search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
60348 + bmap_off_t end_offset, int min_len, int max_len)
60349 +{
60350 + struct super_block *super = get_current_context()->super;
60351 + struct bitmap_node *bnode = get_bnode(super, bmap);
60352 + char *data;
60353 + bmap_off_t start;
60354 + int ret;
60355 +
60356 + assert("zam-958", min_len > 0);
60357 + assert("zam-959", max_len >= min_len);
60358 + assert("zam-960", *start_offset >= end_offset);
60359 +
60360 + ret = load_and_lock_bnode(bnode);
60361 + if (ret)
60362 + return ret;
60363 +
60364 + data = bnode_working_data(bnode);
60365 + start = *start_offset;
60366 +
60367 + while (1) {
60368 + bmap_off_t end, search_end;
60369 +
60370 + /* Find the beginning of the zero filled region */
60371 + if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
60372 + break;
60373 + /* Are there more than `min_len' bits from `start' to
60374 + * `end_offset'? */
60375 + if (start < end_offset + min_len - 1)
60376 + break;
60377 +
60378 + /* Do not search to `end_offset' if we need to find less than
60379 + * `max_len' zero bits. */
60380 + if (end_offset + max_len - 1 < start)
60381 + search_end = start - max_len + 1;
60382 + else
60383 + search_end = end_offset;
60384 +
60385 + if (reiser4_find_last_set_bit(&end, data, search_end, start))
60386 + end = search_end;
60387 + else
60388 + end++;
60389 +
60390 + if (end + min_len <= start + 1) {
60391 + if (end < search_end)
60392 + end = search_end;
60393 + ret = start - end + 1;
60394 + *start_offset = end; /* `end' is lowest offset */
60395 + assert("zam-987",
60396 + reiser4_find_next_set_bit(data, start + 1,
60397 + end) >= start + 1);
60398 + reiser4_set_bits(data, end, start + 1);
60399 + break;
60400 + }
60401 +
60402 + if (end <= end_offset)
60403 + /* left search boundary reached. */
60404 + break;
60405 + start = end - 1;
60406 + }
60407 +
60408 + release_and_unlock_bnode(bnode);
60409 + return ret;
60410 +}
60411 +
60412 +/* allocate contiguous range of blocks in bitmap */
60413 +static int bitmap_alloc_forward(reiser4_block_nr * start,
60414 + const reiser4_block_nr * end, int min_len,
60415 + int max_len)
60416 +{
60417 + bmap_nr_t bmap, end_bmap;
60418 + bmap_off_t offset, end_offset;
60419 + int len;
60420 +
60421 + reiser4_block_nr tmp;
60422 +
60423 + struct super_block *super = get_current_context()->super;
60424 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60425 +
60426 + parse_blocknr(start, &bmap, &offset);
60427 +
60428 + tmp = *end - 1;
60429 + parse_blocknr(&tmp, &end_bmap, &end_offset);
60430 + ++end_offset;
60431 +
60432 + assert("zam-358", end_bmap >= bmap);
60433 + assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
60434 +
60435 + for (; bmap < end_bmap; bmap++, offset = 0) {
60436 + len =
60437 + search_one_bitmap_forward(bmap, &offset, max_offset,
60438 + min_len, max_len);
60439 + if (len != 0)
60440 + goto out;
60441 + }
60442 +
60443 + len =
60444 + search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
60445 + max_len);
60446 + out:
60447 + *start = bmap * max_offset + offset;
60448 + return len;
60449 +}
60450 +
60451 +/* allocate contiguous range of blocks in bitmap (from @start to @end in
60452 + * backward direction) */
60453 +static int bitmap_alloc_backward(reiser4_block_nr * start,
60454 + const reiser4_block_nr * end, int min_len,
60455 + int max_len)
60456 +{
60457 + bmap_nr_t bmap, end_bmap;
60458 + bmap_off_t offset, end_offset;
60459 + int len;
60460 + struct super_block *super = get_current_context()->super;
60461 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60462 +
60463 + parse_blocknr(start, &bmap, &offset);
60464 + parse_blocknr(end, &end_bmap, &end_offset);
60465 +
60466 + assert("zam-961", end_bmap <= bmap);
60467 + assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
60468 +
60469 + for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
60470 + len =
60471 + search_one_bitmap_backward(bmap, &offset, 0, min_len,
60472 + max_len);
60473 + if (len != 0)
60474 + goto out;
60475 + }
60476 +
60477 + len =
60478 + search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
60479 + max_len);
60480 + out:
60481 + *start = bmap * max_offset + offset;
60482 + return len;
60483 +}
60484 +
60485 +/* plugin->u.space_allocator.alloc_blocks() */
60486 +static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
60487 + reiser4_block_nr *start, reiser4_block_nr *len)
60488 +{
60489 + struct super_block *super = get_current_context()->super;
60490 + int actual_len;
60491 +
60492 + reiser4_block_nr search_start;
60493 + reiser4_block_nr search_end;
60494 +
60495 + assert("zam-398", super != NULL);
60496 + assert("zam-412", hint != NULL);
60497 + assert("zam-397", hint->blk <= reiser4_block_count(super));
60498 +
60499 + if (hint->max_dist == 0)
60500 + search_end = reiser4_block_count(super);
60501 + else
60502 + search_end =
60503 + LIMIT(hint->blk + hint->max_dist,
60504 + reiser4_block_count(super));
60505 +
60506 + /* We use @hint->blk as a search start and search from it to the end
60507 + of the disk, or in the given region if @hint->max_dist is not zero */
60508 + search_start = hint->blk;
60509 +
60510 + actual_len =
60511 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60512 +
60513 + /* There is only one bitmap search if max_dist was specified or the
60514 + first pass was from the beginning of the bitmap. We also do only one
60515 + pass when scanning the bitmap in the backward direction. */
60516 + if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
60517 + /* next step is a scanning from 0 to search_start */
60518 + search_end = search_start;
60519 + search_start = 0;
60520 + actual_len =
60521 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60522 + }
60523 + if (actual_len == 0)
60524 + return RETERR(-ENOSPC);
60525 + if (actual_len < 0)
60526 + return RETERR(actual_len);
60527 + *len = actual_len;
60528 + *start = search_start;
60529 + return 0;
60530 +}
60531 +
60532 +static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
60533 + reiser4_block_nr * start,
60534 + reiser4_block_nr * len)
60535 +{
60536 + reiser4_block_nr search_start;
60537 + reiser4_block_nr search_end;
60538 + int actual_len;
60539 +
60540 + ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
60541 +
60542 + assert("zam-969", super != NULL);
60543 + assert("zam-970", hint != NULL);
60544 + assert("zam-971", hint->blk <= reiser4_block_count(super));
60545 +
60546 + search_start = hint->blk;
60547 + if (hint->max_dist == 0 || search_start <= hint->max_dist)
60548 + search_end = 0;
60549 + else
60550 + search_end = search_start - hint->max_dist;
60551 +
60552 + actual_len =
60553 + bitmap_alloc_backward(&search_start, &search_end, 1, needed);
60554 + if (actual_len == 0)
60555 + return RETERR(-ENOSPC);
60556 + if (actual_len < 0)
60557 + return RETERR(actual_len);
60558 + *len = actual_len;
60559 + *start = search_start;
60560 + return 0;
60561 +}
60562 +
60563 +/* plugin->u.space_allocator.alloc_blocks() */
60564 +int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
60565 + reiser4_blocknr_hint * hint, int needed,
60566 + reiser4_block_nr * start, reiser4_block_nr * len)
60567 +{
60568 + if (hint->backward)
60569 + return alloc_blocks_backward(hint, needed, start, len);
60570 + return alloc_blocks_forward(hint, needed, start, len);
60571 +}
60572 +
60573 +/* plugin->u.space_allocator.dealloc_blocks(). */
60574 +/* It just frees blocks in WORKING BITMAP. Usually formatted and unformatted
60575 + node deletion is deferred until transaction commit. However, deallocation
60576 + of temporary objects like wandered blocks and transaction commit records
60577 + requires immediate node deletion from WORKING BITMAP.*/
60578 +void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
60579 + reiser4_block_nr start, reiser4_block_nr len)
60580 +{
60581 + struct super_block *super = reiser4_get_current_sb();
60582 +
60583 + bmap_nr_t bmap;
60584 + bmap_off_t offset;
60585 +
60586 + struct bitmap_node *bnode;
60587 + int ret;
60588 +
60589 + assert("zam-468", len != 0);
60590 + check_block_range(&start, &len);
60591 +
60592 + parse_blocknr(&start, &bmap, &offset);
60593 +
60594 + assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
60595 +
60596 + bnode = get_bnode(super, bmap);
60597 +
60598 + assert("zam-470", bnode != NULL);
60599 +
60600 + ret = load_and_lock_bnode(bnode);
60601 + assert("zam-481", ret == 0);
60602 +
60603 + reiser4_clear_bits(bnode_working_data(bnode), offset,
60604 + (bmap_off_t) (offset + len));
60605 +
60606 + adjust_first_zero_bit(bnode, offset);
60607 +
60608 + release_and_unlock_bnode(bnode);
60609 +}
60610 +
60611 +/* plugin->u.space_allocator.check_blocks(). */
60612 +void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
60613 + const reiser4_block_nr * len, int desired)
60614 +{
60615 +#if REISER4_DEBUG
60616 + struct super_block *super = reiser4_get_current_sb();
60617 +
60618 + bmap_nr_t bmap;
60619 + bmap_off_t start_offset;
60620 + bmap_off_t end_offset;
60621 +
60622 + struct bitmap_node *bnode;
60623 + int ret;
60624 +
60625 + assert("zam-622", len != NULL);
60626 + check_block_range(start, len);
60627 + parse_blocknr(start, &bmap, &start_offset);
60628 +
60629 + end_offset = start_offset + *len;
60630 + assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
60631 +
60632 + bnode = get_bnode(super, bmap);
60633 +
60634 + assert("nikita-2215", bnode != NULL);
60635 +
60636 + ret = load_and_lock_bnode(bnode);
60637 + assert("zam-626", ret == 0);
60638 +
60639 + assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
60640 +
60641 + if (desired) {
60642 + assert("zam-623",
60643 + reiser4_find_next_zero_bit(bnode_working_data(bnode),
60644 + end_offset, start_offset)
60645 + >= end_offset);
60646 + } else {
60647 + assert("zam-624",
60648 + reiser4_find_next_set_bit(bnode_working_data(bnode),
60649 + end_offset, start_offset)
60650 + >= end_offset);
60651 + }
60652 +
60653 + release_and_unlock_bnode(bnode);
60654 +#endif
60655 +}
60656 +
60657 +/* conditional insertion of @node into atom's overwrite set if it was not there */
60658 +static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
60659 +{
60660 + assert("zam-546", atom != NULL);
60661 + assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
60662 + assert("zam-548", node != NULL);
60663 +
60664 + spin_lock_atom(atom);
60665 + spin_lock_jnode(node);
60666 +
60667 + if (node->atom == NULL) {
60668 + JF_SET(node, JNODE_OVRWR);
60669 + insert_into_atom_ovrwr_list(atom, node);
60670 + } else {
60671 + assert("zam-549", node->atom == atom);
60672 + }
60673 +
60674 + spin_unlock_jnode(node);
60675 + spin_unlock_atom(atom);
60676 +}
60677 +
60678 +/* an actor which applies the delete set to COMMIT bitmap pages and links
60679 + modified pages in a singly linked list */
60680 +static int
60681 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
60682 + const reiser4_block_nr * len, void *data)
60683 +{
60684 +
60685 + bmap_nr_t bmap;
60686 + bmap_off_t offset;
60687 + int ret;
60688 +
60689 + long long *blocks_freed_p = data;
60690 +
60691 + struct bitmap_node *bnode;
60692 +
60693 + struct super_block *sb = reiser4_get_current_sb();
60694 +
60695 + check_block_range(start, len);
60696 +
60697 + parse_blocknr(start, &bmap, &offset);
60698 +
60699 + /* FIXME-ZAM: we assume that all block ranges are allocated by this
60700 + bitmap-based allocator and each block range can't go over a zone of
60701 + responsibility of one bitmap block; same assumption is used in
60702 + other journal hooks in bitmap code. */
60703 + bnode = get_bnode(sb, bmap);
60704 + assert("zam-448", bnode != NULL);
60705 +
60706 + /* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
60707 + assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
60708 + ret = load_and_lock_bnode(bnode);
60709 + if (ret)
60710 + return ret;
60711 +
60712 + /* put bnode into atom's overwrite set */
60713 + cond_add_to_overwrite_set(atom, bnode->cjnode);
60714 +
60715 + data = bnode_commit_data(bnode);
60716 +
60717 + ret = bnode_check_crc(bnode);
60718 + if (ret != 0) {
+ /* do not leak the bnode mutex on this error path */
+ release_and_unlock_bnode(bnode);
+ return ret;
+ }
60720 +
60721 + if (len != NULL) {
60722 + /* FIXME-ZAM: a check that all bits are set should be there */
60723 + assert("zam-443",
60724 + offset + *len <= bmap_bit_count(sb->s_blocksize));
60725 + reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
60726 +
60727 + (*blocks_freed_p) += *len;
60728 + } else {
60729 + reiser4_clear_bit(offset, data);
60730 + (*blocks_freed_p)++;
60731 + }
60732 +
60733 + bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
60734 +
60735 + release_and_unlock_bnode(bnode);
60736 +
60737 + return 0;
60738 +}
60739 +
60740 +/* plugin->u.space_allocator.pre_commit_hook(). */
60741 +/* It just applies transaction changes to the fs-wide COMMIT BITMAP, hoping the
60742 + rest is done by the transaction manager (allocating wandered locations for
60743 + COMMIT BITMAP blocks, copying COMMIT BITMAP block data). */
60744 +/* Only one instance of this function can be running at any given time, because
60745 + only one transaction can be committed at a time; therefore it is safe to
60746 + access some global variables without any locking */
60747 +
60748 +int reiser4_pre_commit_hook_bitmap(void)
60749 +{
60750 + struct super_block *super = reiser4_get_current_sb();
60751 + txn_atom *atom;
60752 +
60753 + long long blocks_freed = 0;
60754 +
60755 + atom = get_current_atom_locked();
60756 + assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
60757 + spin_unlock_atom(atom);
60758 +
60759 + { /* scan atom's captured list and find all freshly allocated nodes,
60760 + * mark the corresponding bits in COMMIT BITMAP as used */
60761 + struct list_head *head = ATOM_CLEAN_LIST(atom);
60762 + jnode *node = list_entry(head->next, jnode, capture_link);
60763 +
60764 + while (head != &node->capture_link) {
60765 + /* we detect freshly allocated jnodes */
60766 + if (JF_ISSET(node, JNODE_RELOC)) {
60767 + int ret;
60768 + bmap_nr_t bmap;
60769 +
60770 + bmap_off_t offset;
60771 + bmap_off_t index;
60772 + struct bitmap_node *bn;
60773 + __u32 size = bmap_size(super->s_blocksize);
60774 + __u32 crc;
60775 + char byte;
60776 +
60777 + assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
60778 + assert("zam-460",
60779 + !reiser4_blocknr_is_fake(&node->blocknr));
60780 +
60781 + parse_blocknr(&node->blocknr, &bmap, &offset);
60782 + bn = get_bnode(super, bmap);
60783 +
60784 + index = offset >> 3;
60785 + assert("vpf-276", index < size);
60786 +
60787 + ret = bnode_check_crc(bn);
60788 + if (ret != 0)
60789 + return ret;
60790 +
60791 + check_bnode_loaded(bn);
60792 + load_and_lock_bnode(bn);
60793 +
60794 + byte = *(bnode_commit_data(bn) + index);
60795 + reiser4_set_bit(offset, bnode_commit_data(bn));
60796 +
60797 + crc = adler32_recalc(bnode_commit_crc(bn), byte,
60798 + *(bnode_commit_data(bn) +
60799 + index),
60800 + size - index);
60801 + bnode_set_commit_crc(bn, crc);
60802 +
60803 + release_and_unlock_bnode(bn);
60804 +
60805 + ret = bnode_check_crc(bn);
60806 + if (ret != 0)
60807 + return ret;
60808 +
60809 + /* correct operation here depends on how the new
60810 + jnode is inserted into the clean list, because we
60811 + are scanning the same list now. It is OK if
60812 + insertion is done at the list front */
60813 + cond_add_to_overwrite_set(atom, bn->cjnode);
60814 + }
60815 +
60816 + node = list_entry(node->capture_link.next, jnode, capture_link);
60817 + }
60818 + }
60819 +
60820 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
60821 + &blocks_freed, 0);
60822 +
60823 + blocks_freed -= atom->nr_blocks_allocated;
60824 +
60825 + {
60826 + reiser4_super_info_data *sbinfo;
60827 +
60828 + sbinfo = get_super_private(super);
60829 +
60830 + spin_lock_reiser4_super(sbinfo);
60831 + sbinfo->blocks_free_committed += blocks_freed;
60832 + spin_unlock_reiser4_super(sbinfo);
60833 + }
60834 +
60835 + return 0;
60836 +}
60837 +
60838 +/* plugin->u.space_allocator.init_allocator
60839 + constructor of reiser4_space_allocator object. It is called on fs mount */
60840 +int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
60841 + struct super_block *super, void *arg)
60842 +{
60843 + struct bitmap_allocator_data *data = NULL;
60844 + bmap_nr_t bitmap_blocks_nr;
60845 + bmap_nr_t i;
60846 +
60847 + assert("nikita-3039", reiser4_schedulable());
60848 +
60849 + /* getting memory for bitmap allocator private data holder */
60850 + data =
60851 + kmalloc(sizeof(struct bitmap_allocator_data),
60852 + reiser4_ctx_gfp_mask_get());
60853 +
60854 + if (data == NULL)
60855 + return RETERR(-ENOMEM);
60856 +
60857 + /* allocation and initialization for the array of bnodes */
60858 + bitmap_blocks_nr = get_nr_bmap(super);
60859 +
60860 + /* FIXME-ZAM: it is not clear what to do with a huge number of bitmaps,
60861 + bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17 bytes;
60862 + may I never meet someone who still uses the ia32 architecture when
60863 + storage devices of that size enter the market, and wants to use ia32
60864 + with that storage device, much less reiser4. ;-) -Hans). kmalloc is
60865 + not possible and, probably, another dynamic data structure should
60866 + replace the static array of bnodes. */
60867 + /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
60868 + data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
60869 + if (data->bitmap == NULL) {
60870 + kfree(data);
60871 + return RETERR(-ENOMEM);
60872 + }
60873 +
60874 + for (i = 0; i < bitmap_blocks_nr; i++)
60875 + init_bnode(data->bitmap + i, super, i);
60876 +
60877 + allocator->u.generic = data;
60878 +
60879 +#if REISER4_DEBUG
60880 + get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
60881 +#endif
60882 +
60883 + /* Load all bitmap blocks at mount time. */
60884 + if (!test_bit
60885 + (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
60886 + __u64 start_time, elapsed_time;
60887 + struct bitmap_node *bnode;
60888 + int ret;
60889 +
60890 + if (REISER4_DEBUG)
60891 + printk(KERN_INFO "loading reiser4 bitmap...");
60892 + start_time = jiffies;
60893 +
60894 + for (i = 0; i < bitmap_blocks_nr; i++) {
60895 + bnode = data->bitmap + i;
60896 + ret = load_and_lock_bnode(bnode);
60897 + if (ret) {
60898 + reiser4_destroy_allocator_bitmap(allocator,
60899 + super);
60900 + return ret;
60901 + }
60902 + release_and_unlock_bnode(bnode);
60903 + }
60904 +
60905 + elapsed_time = jiffies - start_time;
60906 + if (REISER4_DEBUG)
60907 + printk("...done (%llu jiffies)\n",
60908 + (unsigned long long)elapsed_time);
60909 + }
60910 +
60911 + return 0;
60912 +}
60913 +
60914 +/* plugin->u.space_allocator.destroy_allocator
60915 + destructor. It is called on fs unmount */
60916 +int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
60917 + struct super_block *super)
60918 +{
60919 + bmap_nr_t bitmap_blocks_nr;
60920 + bmap_nr_t i;
60921 +
60922 + struct bitmap_allocator_data *data = allocator->u.generic;
60923 +
60924 + assert("zam-414", data != NULL);
60925 + assert("zam-376", data->bitmap != NULL);
60926 +
60927 + bitmap_blocks_nr = get_nr_bmap(super);
60928 +
60929 + for (i = 0; i < bitmap_blocks_nr; i++) {
60930 + struct bitmap_node *bnode = data->bitmap + i;
60931 +
60932 + mutex_lock(&bnode->mutex);
60933 +
60934 +#if REISER4_DEBUG
60935 + if (atomic_read(&bnode->loaded)) {
60936 + jnode *wj = bnode->wjnode;
60937 + jnode *cj = bnode->cjnode;
60938 +
60939 + assert("zam-480", jnode_page(cj) != NULL);
60940 + assert("zam-633", jnode_page(wj) != NULL);
60941 +
60942 + assert("zam-634",
60943 + memcmp(jdata(wj), jdata(cj),
60944 + bmap_size(super->s_blocksize)) == 0);
60945 +
60946 + }
60947 +#endif
60948 + done_bnode(bnode);
60949 + mutex_unlock(&bnode->mutex);
60950 + }
60951 +
60952 + vfree(data->bitmap);
60953 + kfree(data);
60954 +
60955 + allocator->u.generic = NULL;
60956 +
60957 + return 0;
60958 +}
60959 +
60960 +/*
60961 + * Local variables:
60962 + * c-indentation-style: "K&R"
60963 + * mode-name: "LC"
60964 + * c-basic-offset: 8
60965 + * tab-width: 8
60966 + * fill-column: 79
60967 + * scroll-step: 1
60968 + * End:
60969 + */
60970 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.33/fs/reiser4/plugin/space/bitmap.h
60971 --- linux-2.6.33.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 01:00:00.000000000 +0100
60972 +++ linux-2.6.33/fs/reiser4/plugin/space/bitmap.h 2010-03-04 19:33:22.000000000 +0100
60973 @@ -0,0 +1,47 @@
60974 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60975 +
60976 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
60977 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
60978 +
60979 +#include "../../dformat.h"
60980 +#include "../../block_alloc.h"
60981 +
60982 +#include <linux/types.h> /* for __u?? */
60983 +#include <linux/fs.h> /* for struct super_block */
60984 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
60985 +/* declarations of functions implementing methods of space allocator plugin for
60986 + bitmap based allocator. The functions themselves are in bitmap.c */
60987 +extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
60988 + struct super_block *, void *);
60989 +extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
60990 + struct super_block *);
60991 +extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
60992 + reiser4_blocknr_hint *, int needed,
60993 + reiser4_block_nr * start,
60994 + reiser4_block_nr * len);
60995 +extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
60996 + const reiser4_block_nr *, int);
60997 +extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
60998 + reiser4_block_nr,
60999 + reiser4_block_nr);
61000 +extern int reiser4_pre_commit_hook_bitmap(void);
61001 +
61002 +#define reiser4_post_commit_hook_bitmap() do{}while(0)
61003 +#define reiser4_post_write_back_hook_bitmap() do{}while(0)
61004 +#define reiser4_print_info_bitmap(pref, al) do{}while(0)
61005 +
61006 +typedef __u64 bmap_nr_t;
61007 +typedef __u32 bmap_off_t;
61008 +
61009 +#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
61010 +
61011 +/* Make Linus happy.
61012 + Local variables:
61013 + c-indentation-style: "K&R"
61014 + mode-name: "LC"
61015 + c-basic-offset: 8
61016 + tab-width: 8
61017 + fill-column: 120
61018 + scroll-step: 1
61019 + End:
61020 +*/
61021 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/space/Makefile linux-2.6.33/fs/reiser4/plugin/space/Makefile
61022 --- linux-2.6.33.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 01:00:00.000000000 +0100
61023 +++ linux-2.6.33/fs/reiser4/plugin/space/Makefile 2010-03-04 19:33:22.000000000 +0100
61024 @@ -0,0 +1,4 @@
61025 +obj-$(CONFIG_REISER4_FS) += space_plugins.o
61026 +
61027 +space_plugins-objs := \
61028 + bitmap.o
61029 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.33/fs/reiser4/plugin/space/space_allocator.h
61030 --- linux-2.6.33.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 01:00:00.000000000 +0100
61031 +++ linux-2.6.33/fs/reiser4/plugin/space/space_allocator.h 2010-03-04 19:33:22.000000000 +0100
61032 @@ -0,0 +1,80 @@
61033 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61034 +
61035 +#ifndef __SPACE_ALLOCATOR_H__
61036 +#define __SPACE_ALLOCATOR_H__
61037 +
61038 +#include "../../forward.h"
61039 +#include "bitmap.h"
61040 +/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
61041 + * but... */
61042 +#define DEF_SPACE_ALLOCATOR(allocator) \
61043 + \
61044 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
61045 +{ \
61046 + return reiser4_init_allocator_##allocator (al, s, opaque); \
61047 +} \
61048 + \
61049 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
61050 +{ \
61051 + reiser4_destroy_allocator_##allocator (al, s); \
61052 +} \
61053 + \
61054 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
61055 + int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
61056 +{ \
61057 + return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \
61058 +} \
61059 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
61060 +{ \
61061 + reiser4_dealloc_blocks_##allocator (al, start, len); \
61062 +} \
61063 + \
61064 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
61065 +{ \
61066 + reiser4_check_blocks_##allocator (start, end, desired); \
61067 +} \
61068 + \
61069 +static inline void sa_pre_commit_hook (void) \
61070 +{ \
61071 + reiser4_pre_commit_hook_##allocator (); \
61072 +} \
61073 + \
61074 +static inline void sa_post_commit_hook (void) \
61075 +{ \
61076 + reiser4_post_commit_hook_##allocator (); \
61077 +} \
61078 + \
61079 +static inline void sa_post_write_back_hook (void) \
61080 +{ \
61081 + reiser4_post_write_back_hook_##allocator(); \
61082 +} \
61083 + \
61084 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
61085 +{ \
61086 + reiser4_print_info_##allocator (prefix, al); \
61087 +}
61088 +
61089 +DEF_SPACE_ALLOCATOR(bitmap)
61090 +
61091 +/* this object is part of reiser4 private in-core super block */
61092 +struct reiser4_space_allocator {
61093 + union {
61094 + /* space allocators might use this pointer to reference their
61095 + * data. */
61096 + void *generic;
61097 + } u;
61098 +};
61099 +
61100 +/* __SPACE_ALLOCATOR_H__ */
61101 +#endif
61102 +
61103 +/* Make Linus happy.
61104 + Local variables:
61105 + c-indentation-style: "K&R"
61106 + mode-name: "LC"
61107 + c-basic-offset: 8
61108 + tab-width: 8
61109 + fill-column: 120
61110 + scroll-step: 1
61111 + End:
61112 +*/
61113 diff -urN linux-2.6.33.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.33/fs/reiser4/plugin/tail_policy.c
61114 --- linux-2.6.33.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 01:00:00.000000000 +0100
61115 +++ linux-2.6.33/fs/reiser4/plugin/tail_policy.c 2010-03-04 19:33:22.000000000 +0100
61116 @@ -0,0 +1,113 @@
61117 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61118 + * reiser4/README */
61119 +
61120 +/* Formatting policy plugins */
61121 +
61122 +/*
61123 + * Formatting policy plugin is used by object plugin (of regular file) to
61124 + * convert file between two representations.
61125 + *
61126 + * Currently following policies are implemented:
61127 + * never store file in formatted nodes
61128 + * always store file in formatted nodes
61129 + * store file in formatted nodes if file is smaller than 4 blocks (default)
61130 + */
61131 +
61132 +#include "../tree.h"
61133 +#include "../inode.h"
61134 +#include "../super.h"
61135 +#include "object.h"
61136 +#include "plugin.h"
61137 +#include "node/node.h"
61138 +#include "plugin_header.h"
61139 +
61140 +#include <linux/pagemap.h>
61141 +#include <linux/fs.h> /* For struct inode */
61142 +
61143 +/**
61144 + * have_formatting_never - formatting policy that never stores tails
61145 + * @inode: inode to operate on
61146 + * @size: new object size
61147 + *
61148 + * Always returns 0: the file body is never stored in tail items.
61149 + */
61150 +/* Never store file's tail as direct item */
61151 +/* Audited by: green(2002.06.12) */
61152 +static int have_formatting_never(const struct inode *inode UNUSED_ARG
61153 + /* inode to operate on */ ,
61154 + loff_t size UNUSED_ARG/* new object size */)
61155 +{
61156 + return 0;
61157 +}
61158 +
61159 +/* Always store file's tail as direct item */
61160 +/* Audited by: green(2002.06.12) */
61161 +static int
61162 +have_formatting_always(const struct inode *inode UNUSED_ARG
61163 + /* inode to operate on */ ,
61164 + loff_t size UNUSED_ARG/* new object size */)
61165 +{
61166 + return 1;
61167 +}
61168 +
61169 +/* This function tests whether we should store the file denoted by @inode as
61170 + tails only or as extents only. */
61171 +static int
61172 +have_formatting_default(const struct inode *inode UNUSED_ARG
61173 + /* inode to operate on */ ,
61174 + loff_t size/* new object size */)
61175 +{
61176 + assert("umka-1253", inode != NULL);
61177 +
61178 + if (size > inode->i_sb->s_blocksize * 4)
61179 + return 0;
61180 +
61181 + return 1;
61182 +}
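+
+/* E.g., with a 4096-byte block size this policy keeps files of up to
+ * 4 * 4096 = 16384 bytes in tail (formatted) items and switches larger
+ * files to extents (block size chosen for illustration only). */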
61183 +
61184 +/* tail plugins */
61185 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
61186 + [NEVER_TAILS_FORMATTING_ID] = {
61187 + .h = {
61188 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
61189 + .id = NEVER_TAILS_FORMATTING_ID,
61190 + .pops = NULL,
61191 + .label = "never",
61192 + .desc = "Never store file's tail",
61193 + .linkage = {NULL, NULL}
61194 + },
61195 + .have_tail = have_formatting_never
61196 + },
61197 + [ALWAYS_TAILS_FORMATTING_ID] = {
61198 + .h = {
61199 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
61200 + .id = ALWAYS_TAILS_FORMATTING_ID,
61201 + .pops = NULL,
61202 + .label = "always",
61203 + .desc = "Always store file's tail",
61204 + .linkage = {NULL, NULL}
61205 + },
61206 + .have_tail = have_formatting_always
61207 + },
61208 + [SMALL_FILE_FORMATTING_ID] = {
61209 + .h = {
61210 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
61211 + .id = SMALL_FILE_FORMATTING_ID,
61212 + .pops = NULL,
61213 + .label = "4blocks",
61214 + .desc = "store files shorter than 4 blocks in tail items",
61215 + .linkage = {NULL, NULL}
61216 + },
61217 + .have_tail = have_formatting_default
61218 + }
61219 +};
61220 +
61221 +/*
61222 + * Local variables:
61223 + * c-indentation-style: "K&R"
61224 + * mode-name: "LC"
61225 + * c-basic-offset: 8
61226 + * tab-width: 8
61227 + * fill-column: 79
61228 + * End:
61229 + */
61230 diff -urN linux-2.6.33.orig/fs/reiser4/pool.c linux-2.6.33/fs/reiser4/pool.c
61231 --- linux-2.6.33.orig/fs/reiser4/pool.c 1970-01-01 01:00:00.000000000 +0100
61232 +++ linux-2.6.33/fs/reiser4/pool.c 2010-03-04 19:33:22.000000000 +0100
61233 @@ -0,0 +1,231 @@
61234 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61235 + * reiser4/README */
61236 +
61237 +/* Fast pool allocation.
61238 +
61239 + There are situations when some sub-system normally asks the memory
61240 + allocator for only a few objects, but under some circumstances could
61241 + require many more. A typical and actually motivating example is tree
61242 + balancing. It needs to keep track of nodes that were involved in it, and
61243 + it is well known that in a reasonably packed balanced tree most
61244 + (92.938121%) of all balancings end up after working with only a few nodes
61245 + (3.141592 on average). But in rare cases balancing can involve many more
61246 + nodes (3*tree_height+1 in the extremal situation).
61247 +
61248 + On the one hand, we don't want to resort to dynamic allocation (slab,
61249 + malloc(), etc.) to allocate data structures required to keep track of
61250 + nodes during balancing. On the other hand, we cannot statically allocate
61251 + the required amount of space on the stack, because first: it is a useless
61252 + waste of a precious resource, and second: this amount is unknown in
61253 + advance (the tree height can change).
61254 +
61255 + Pools, implemented in this file are solution for this problem:
61256 +
61257 + - some configurable amount of objects is statically preallocated on the
61258 + stack
61259 +
61260 + - if this preallocated pool is exhausted and more objects are requested,
61261 + they are allocated dynamically.
61262 +
61263 + Pools encapsulate distinction between statically and dynamically allocated
61264 + objects. Both allocation and recycling look exactly the same.
61265 +
61266 + To keep track of dynamically allocated objects, pool adds its own linkage
61267 + to each object.
61268 +
61269 + NOTE-NIKITA This linkage also contains some balancing-specific data. This
61270 + is not perfect. On the other hand, balancing is currently the only client
61271 + of pool code.
61272 +
61273 + NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
61274 + functions in the style of tslist/tshash, i.e., make them unreadable, but
61275 + type-safe.
61276 +
61277 +*/
61278 +
61279 +#include "debug.h"
61280 +#include "pool.h"
61281 +#include "super.h"
61282 +
61283 +#include <linux/types.h>
61284 +#include <linux/err.h>
61285 +
61286 +/* initialize new pool object @h */
61287 +static void reiser4_init_pool_obj(struct reiser4_pool_header *h)
61288 +{
61289 + INIT_LIST_HEAD(&h->usage_linkage);
61290 + INIT_LIST_HEAD(&h->level_linkage);
61291 + INIT_LIST_HEAD(&h->extra_linkage);
61292 +}
61293 +
61294 +/* initialize new pool */
61295 +void reiser4_init_pool(struct reiser4_pool *pool /* pool to initialize */ ,
61296 + size_t obj_size /* size of objects in @pool */ ,
61297 + int num_of_objs /* number of preallocated objects */ ,
61298 + char *data/* area for preallocated objects */)
61299 +{
61300 + struct reiser4_pool_header *h;
61301 + int i;
61302 +
61303 + assert("nikita-955", pool != NULL);
61304 + assert("nikita-1044", obj_size > 0);
61305 + assert("nikita-956", num_of_objs >= 0);
61306 + assert("nikita-957", data != NULL);
61307 +
61308 + memset(pool, 0, sizeof *pool);
61309 + pool->obj_size = obj_size;
61310 + pool->data = data;
61311 + INIT_LIST_HEAD(&pool->free);
61312 + INIT_LIST_HEAD(&pool->used);
61313 + INIT_LIST_HEAD(&pool->extra);
61314 + memset(data, 0, obj_size * num_of_objs);
61315 + for (i = 0; i < num_of_objs; ++i) {
61316 + h = (struct reiser4_pool_header *) (data + i * obj_size);
61317 + reiser4_init_pool_obj(h);
61318 + /* add pool header to the end of pool's free list */
61319 + list_add_tail(&h->usage_linkage, &pool->free);
61320 + }
61321 +}
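+
+/* A minimal usage sketch (hypothetical caller, not part of this patch):
+ * an object type embeds struct reiser4_pool_header as its first member,
+ * and the caller preallocates an on-stack area for the pool:
+ *
+ *	struct my_obj {
+ *		struct reiser4_pool_header header;	(must come first)
+ *		int payload;
+ *	};
+ *
+ *	struct reiser4_pool pool;
+ *	char area[10 * sizeof(struct my_obj)];
+ *
+ *	reiser4_init_pool(&pool, sizeof(struct my_obj), 10, area);
+ *	... reiser4_add_obj() / reiser4_pool_free() as needed ...
+ *	reiser4_done_pool(&pool);
+ */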
61322 +
61323 +/* release pool resources
61324 +
61325 + Release all resources acquired by this pool, specifically, dynamically
61326 + allocated objects.
61327 +
61328 +*/
61329 +void reiser4_done_pool(struct reiser4_pool *pool UNUSED_ARG)
61330 +{
61331 +}
61332 +
61333 +/* allocate carry object from @pool
61334 +
61335 + First, try to get preallocated object. If this fails, resort to dynamic
61336 + allocation.
61337 +
61338 +*/
61339 +static void *reiser4_pool_alloc(struct reiser4_pool *pool)
61340 +{
61341 + struct reiser4_pool_header *result;
61342 +
61343 + assert("nikita-959", pool != NULL);
61344 +
61345 + if (!list_empty(&pool->free)) {
61346 + struct list_head *linkage;
61347 +
61348 + linkage = pool->free.next;
61349 + list_del(linkage);
61350 + INIT_LIST_HEAD(linkage);
61351 + result = list_entry(linkage, struct reiser4_pool_header,
61352 + usage_linkage);
61353 + BUG_ON(!list_empty(&result->level_linkage) ||
61354 + !list_empty(&result->extra_linkage));
61355 + } else {
61356 + /* pool is empty. Extra allocations don't deserve dedicated
61357 + slab to be served from, as they are expected to be rare. */
61358 + result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
61359 + if (result != NULL) {
61360 + reiser4_init_pool_obj(result);
61361 + list_add(&result->extra_linkage, &pool->extra);
61362 + } else
61363 + return ERR_PTR(RETERR(-ENOMEM));
61364 + BUG_ON(!list_empty(&result->usage_linkage) ||
61365 + !list_empty(&result->level_linkage));
61366 + }
61367 + ++pool->objs;
61368 + list_add(&result->usage_linkage, &pool->used);
61369 + memset(result + 1, 0, pool->obj_size - sizeof *result);
61370 + return result;
61371 +}
61372 +
61373 +/* return object back to the pool */
61374 +void reiser4_pool_free(struct reiser4_pool *pool,
61375 + struct reiser4_pool_header *h)
61376 +{
61377 + assert("nikita-961", h != NULL);
61378 + assert("nikita-962", pool != NULL);
61379 +
61380 + --pool->objs;
61381 + assert("nikita-963", pool->objs >= 0);
61382 +
61383 + list_del_init(&h->usage_linkage);
61384 + list_del_init(&h->level_linkage);
61385 +
61386 + if (list_empty(&h->extra_linkage))
61387 + /*
61388 + * pool header is not an extra one. Push it onto free list
61389 + * using usage_linkage
61390 + */
61391 + list_add(&h->usage_linkage, &pool->free);
61392 + else {
61393 + /* remove pool header from pool's extra list and kfree it */
61394 + list_del(&h->extra_linkage);
61395 + kfree(h);
61396 + }
61397 +}
61398 +
61399 +/* add new object to the carry level list
61400 +
61401 + The carry level is FIFO most of the time, but not always. Complications
61402 + arise when the make_space() function tries to go to the left neighbor and
61403 + thus adds a carry node before existing nodes, and also, when updating
61404 + delimiting keys after moving data between two nodes, we want the left node
61405 + to be locked before the right node.
61406 +
61407 + The latter case is confusing at first glance. The problem is that the
61408 + COP_UPDATE operation that updates delimiting keys is sometimes called with
61409 + two nodes (when data are moved between two nodes) and sometimes with only
61410 + one node (when the leftmost item is deleted in a node). In any case the
61411 + operation is supplied with at least the node whose left delimiting key is
61412 + to be updated (that is, the "right" node).
61413 +
61414 + @pool - from which to allocate new object;
61415 + @list - where to add object;
61416 + @reference - after (or before) which existing object to add
61417 +*/
61418 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool,
61419 + struct list_head *list,
61420 + pool_ordering order,
61421 + struct reiser4_pool_header *reference)
61422 +{
61423 + struct reiser4_pool_header *result;
61424 +
61425 + assert("nikita-972", pool != NULL);
61426 +
61427 + result = reiser4_pool_alloc(pool);
61428 + if (IS_ERR(result))
61429 + return result;
61430 +
61431 + assert("nikita-973", result != NULL);
61432 +
61433 + switch (order) {
61434 + case POOLO_BEFORE:
61435 + __list_add(&result->level_linkage,
61436 + reference->level_linkage.prev,
61437 + &reference->level_linkage);
61438 + break;
61439 + case POOLO_AFTER:
61440 + __list_add(&result->level_linkage,
61441 + &reference->level_linkage,
61442 + reference->level_linkage.next);
61443 + break;
61444 + case POOLO_LAST:
61445 + list_add_tail(&result->level_linkage, list);
61446 + break;
61447 + case POOLO_FIRST:
61448 + list_add(&result->level_linkage, list);
61449 + break;
61450 + default:
61451 + wrong_return_value("nikita-927", "order");
61452 + }
61453 + return result;
61454 +}
61455 +
61456 +/* Make Linus happy.
61457 + Local variables:
61458 + c-indentation-style: "K&R"
61459 + mode-name: "LC"
61460 + c-basic-offset: 8
61461 + tab-width: 8
61462 + fill-column: 120
61463 + End:
61464 +*/
61465 diff -urN linux-2.6.33.orig/fs/reiser4/pool.h linux-2.6.33/fs/reiser4/pool.h
61466 --- linux-2.6.33.orig/fs/reiser4/pool.h 1970-01-01 01:00:00.000000000 +0100
61467 +++ linux-2.6.33/fs/reiser4/pool.h 2010-03-04 19:33:22.000000000 +0100
61468 @@ -0,0 +1,57 @@
61469 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61470 + * reiser4/README */
61471 +
61472 +/* Fast pool allocation */
61473 +
61474 +#ifndef __REISER4_POOL_H__
61475 +#define __REISER4_POOL_H__
61476 +
61477 +#include <linux/types.h>
61478 +
61479 +struct reiser4_pool {
61480 + size_t obj_size;
61481 + int objs;
61482 + char *data;
61483 + struct list_head free;
61484 + struct list_head used;
61485 + struct list_head extra;
61486 +};
61487 +
61488 +struct reiser4_pool_header {
61489 + /* object is either on free or "used" lists */
61490 + struct list_head usage_linkage;
61491 + struct list_head level_linkage;
61492 + struct list_head extra_linkage;
61493 +};
61494 +
61495 +typedef enum {
61496 + POOLO_BEFORE,
61497 + POOLO_AFTER,
61498 + POOLO_LAST,
61499 + POOLO_FIRST
61500 +} pool_ordering;
61501 +
61502 +/* pool manipulation functions */
61503 +
61504 +extern void reiser4_init_pool(struct reiser4_pool *pool, size_t obj_size,
61505 + int num_of_objs, char *data);
61506 +extern void reiser4_done_pool(struct reiser4_pool *pool);
61507 +extern void reiser4_pool_free(struct reiser4_pool *pool,
61508 + struct reiser4_pool_header *h);
61509 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool,
61510 + struct list_head *list,
61511 + pool_ordering order,
61512 + struct reiser4_pool_header *reference);
61513 +
61514 +/* __REISER4_POOL_H__ */
61515 +#endif
61516 +
61517 +/* Make Linus happy.
61518 + Local variables:
61519 + c-indentation-style: "K&R"
61520 + mode-name: "LC"
61521 + c-basic-offset: 8
61522 + tab-width: 8
61523 + fill-column: 120
61524 + End:
61525 +*/
61526 diff -urN linux-2.6.33.orig/fs/reiser4/readahead.c linux-2.6.33/fs/reiser4/readahead.c
61527 --- linux-2.6.33.orig/fs/reiser4/readahead.c 1970-01-01 01:00:00.000000000 +0100
61528 +++ linux-2.6.33/fs/reiser4/readahead.c 2010-03-04 19:33:22.000000000 +0100
61529 @@ -0,0 +1,140 @@
61530 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61531 + * reiser4/README */
61532 +
61533 +#include "forward.h"
61534 +#include "tree.h"
61535 +#include "tree_walk.h"
61536 +#include "super.h"
61537 +#include "inode.h"
61538 +#include "key.h"
61539 +#include "znode.h"
61540 +
61541 +#include <linux/swap.h> /* for totalram_pages */
61542 +
61543 +void reiser4_init_ra_info(ra_info_t *rai)
61544 +{
61545 + rai->key_to_stop = *reiser4_min_key();
61546 +}
61547 +
61548 +/* global formatted node readahead parameter. It can be set by mount option
61549 + * -o readahead:NUM:1 */
61550 +static inline int ra_adjacent_only(int flags)
61551 +{
61552 + return flags & RA_ADJACENT_ONLY;
61553 +}
61554 +
61555 +/* this is used by formatted_readahead to decide whether a read for the right
61556 + * neighbor of a node is to be issued. It returns 1 if the right neighbor's
61557 + * first key is less than or equal to the readahead's stop key */
61558 +static int should_readahead_neighbor(znode * node, ra_info_t *info)
61559 +{
61560 + int result;
61561 +
61562 + read_lock_dk(znode_get_tree(node));
61563 + result = keyle(znode_get_rd_key(node), &info->key_to_stop);
61564 + read_unlock_dk(znode_get_tree(node));
61565 + return result;
61566 +}
61567 +
61568 +#define LOW_MEM_PERCENTAGE (5)
61569 +
61570 +static int low_on_memory(void)
61571 +{
61572 + unsigned int freepages;
61573 +
61574 + freepages = nr_free_pages();
61575 + return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
61576 +}
61577 +
61578 +/* start read for @node and for a few of its right neighbors */
61579 +void formatted_readahead(znode * node, ra_info_t *info)
61580 +{
61581 + struct formatted_ra_params *ra_params;
61582 + znode *cur;
61583 + int i;
61584 + int grn_flags;
61585 + lock_handle next_lh;
61586 +
61587 + /* do nothing if a block number has not been assigned to the node yet
61588 + * (which means it exists only in cache). */
61589 + if (reiser4_blocknr_is_fake(znode_get_block(node)))
61590 + return;
61591 +
61592 + ra_params = get_current_super_ra_params();
61593 +
61594 + if (znode_page(node) == NULL)
61595 + jstartio(ZJNODE(node));
61596 +
61597 + if (znode_get_level(node) != LEAF_LEVEL)
61598 + return;
61599 +
61600 + /* don't waste memory for read-ahead when low on memory */
61601 + if (low_on_memory())
61602 + return;
61603 +
61604 + /* We can have locked nodes on upper tree levels, in this situation lock
61605 + priorities do not help to resolve deadlocks, we have to use TRY_LOCK
61606 + here. */
61607 + grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
61608 +
61609 + i = 0;
61610 + cur = zref(node);
61611 + init_lh(&next_lh);
61612 + while (i < ra_params->max) {
61613 + const reiser4_block_nr * nextblk;
61614 +
61615 + if (!should_readahead_neighbor(cur, info))
61616 + break;
61617 +
61618 + if (reiser4_get_right_neighbor
61619 + (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
61620 + break;
61621 +
61622 + nextblk = znode_get_block(next_lh.node);
61623 + if (reiser4_blocknr_is_fake(nextblk) ||
61624 + (ra_adjacent_only(ra_params->flags)
61625 + && *nextblk != *znode_get_block(cur) + 1))
61626 + break;
61627 +
61628 + zput(cur);
61629 + cur = zref(next_lh.node);
61630 + done_lh(&next_lh);
61631 + if (znode_page(cur) == NULL)
61632 + jstartio(ZJNODE(cur));
61633 + else
61634 + /* Do not scan read-ahead window if pages already
61635 + * allocated (and i/o already started). */
61636 + break;
61637 +
61638 + i++;
61639 + }
61640 + zput(cur);
61641 + done_lh(&next_lh);
61642 +}
61643 +
61644 +void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap)
61645 +{
61646 + reiser4_key *stop_key;
61647 +
61648 + assert("nikita-3542", dir != NULL);
61649 + assert("nikita-3543", tap != NULL);
61650 +
61651 + stop_key = &tap->ra_info.key_to_stop;
61652 + /* initialize readdir readahead information: include into the readahead
61653 + * window the stat data of all files of the directory */
61654 + set_key_locality(stop_key, get_inode_oid(dir));
61655 + set_key_type(stop_key, KEY_SD_MINOR);
61656 + set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
61657 + set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
61658 + set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
61659 +}
61660 +
61661 +/*
61662 + Local variables:
61663 + c-indentation-style: "K&R"
61664 + mode-name: "LC"
61665 + c-basic-offset: 8
61666 + tab-width: 8
61667 + fill-column: 80
61668 + End:
61669 +*/
61670 diff -urN linux-2.6.33.orig/fs/reiser4/readahead.h linux-2.6.33/fs/reiser4/readahead.h
61671 --- linux-2.6.33.orig/fs/reiser4/readahead.h 1970-01-01 01:00:00.000000000 +0100
61672 +++ linux-2.6.33/fs/reiser4/readahead.h 2010-03-04 19:33:22.000000000 +0100
61673 @@ -0,0 +1,52 @@
61674 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61675 + * reiser4/README */
61676 +
61677 +#ifndef __READAHEAD_H__
61678 +#define __READAHEAD_H__
61679 +
61680 +#include "key.h"
61681 +
61682 +typedef enum {
61683 + RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent.
61684 + Default is NO (not only adjacent) */
61685 +} ra_global_flags;
61686 +
61687 +/* reiser4 super block has a field of this type.
61688 + It controls readahead during tree traversals */
61689 +struct formatted_ra_params {
61690 + unsigned long max; /* request not more than this amount of nodes.
61691 + Default is totalram_pages / 4 */
61692 + int flags;
61693 +};
61694 +
61695 +typedef struct {
61696 + reiser4_key key_to_stop;
61697 +} ra_info_t;
61698 +
61699 +void formatted_readahead(znode * , ra_info_t *);
61700 +void reiser4_init_ra_info(ra_info_t *rai);
61701 +
61702 +struct reiser4_file_ra_state {
61703 + loff_t start; /* Current window */
61704 + loff_t size;
61705 + loff_t next_size; /* Next window size */
61706 + loff_t ahead_start; /* Ahead window */
61707 + loff_t ahead_size;
61708 + loff_t max_window_size; /* Maximum readahead window */
61709 + loff_t slow_start; /* enlarging r/a size algorithm. */
61710 +};
61711 +
61712 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap);
61713 +
61714 +/* __READAHEAD_H__ */
61715 +#endif
61716 +
61717 +/*
61718 + Local variables:
61719 + c-indentation-style: "K&R"
61720 + mode-name: "LC"
61721 + c-basic-offset: 8
61722 + tab-width: 8
61723 + fill-column: 120
61724 + End:
61725 +*/
61726 diff -urN linux-2.6.33.orig/fs/reiser4/README linux-2.6.33/fs/reiser4/README
61727 --- linux-2.6.33.orig/fs/reiser4/README 1970-01-01 01:00:00.000000000 +0100
61728 +++ linux-2.6.33/fs/reiser4/README 2010-03-04 19:33:22.000000000 +0100
61729 @@ -0,0 +1,128 @@
61730 +[LICENSING]
61731 +
61732 +Reiser4 is hereby licensed under the GNU General
61733 +Public License version 2.
61734 +
61735 +Source code files that contain the phrase "licensing governed by
61736 +reiser4/README" are "governed files" throughout this file. Governed
61737 +files are licensed under the GPL. The portions of them owned by Hans
61738 +Reiser, or authorized to be licensed by him, have been in the past,
61739 +and likely will be in the future, licensed to other parties under
61740 +other licenses. If you add your code to governed files, and don't
61741 +want it to be owned by Hans Reiser, put your copyright label on that
61742 +code so the poor blight and his customers can keep things straight.
61743 +All portions of governed files not labeled otherwise are owned by Hans
61744 +Reiser, and by adding your code to it, widely distributing it to
61745 +others or sending us a patch, and leaving the sentence in stating that
61746 +licensing is governed by the statement in this file, you accept this.
61747 +It will be a kindness if you identify whether Hans Reiser is allowed
61748 +to license code labeled as owned by you on your behalf other than
61749 +under the GPL, because he wants to know if it is okay to do so and put
61750 +a check in the mail to you (for non-trivial improvements) when he
61751 +makes his next sale. He makes no guarantees as to the amount if any,
61752 +though he feels motivated to motivate contributors, and you can surely
61753 +discuss this with him before or after contributing. You have the
61754 +right to decline to allow him to license your code contribution other
61755 +than under the GPL.
61756 +
61757 +Further licensing options are available for commercial and/or other
61758 +interests directly from Hans Reiser: reiser@namesys.com. If you interpret
61759 +the GPL as not allowing those additional licensing options, you read
61760 +it wrongly, and Richard Stallman agrees with me, when carefully read
61761 +you can see that those restrictions on additional terms do not apply
61762 +to the owner of the copyright, and my interpretation of this shall
61763 +govern for this license.
61764 +
61765 +[END LICENSING]
61766 +
61767 +Reiser4 is a file system based on dancing tree algorithms, and is
61768 +described at http://www.namesys.com
61769 +
61770 +mkfs.reiser4 and other utilities are on our webpage or wherever your
61771 +Linux provider put them. You really want to be running the latest
61772 +version off the website if you use fsck.
61773 +
61774 +Yes, if you update your reiser4 kernel module you do have to
61775 +recompile your kernel, most of the time. The errors you get will be
61776 +quite cryptic if you forget to do so.
61777 +
61778 +Hideous Commercial Pitch: Spread your development costs across other OS
61779 +vendors. Select from the best in the world, not the best in your
61780 +building, by buying from third party OS component suppliers. Leverage
61781 +the software component development power of the internet. Be the most
61782 +aggressive in taking advantage of the commercial possibilities of
61783 +decentralized internet development, and add value through your branded
61784 +integration that you sell as an operating system. Let your competitors
61785 +be the ones to compete against the entire internet by themselves. Be
61786 +hip, get with the new economic trend, before your competitors do. Send
61787 +email to reiser@namesys.com
61788 +
61789 +Hans Reiser was the primary architect of Reiser4, but a whole team
61790 +chipped their ideas in. He invested everything he had into Namesys
61791 +for 5.5 dark years of no money before Reiser3 finally started to work well
61792 +enough to bring in money. He owns the copyright.
61793 +
61794 +DARPA was the primary sponsor of Reiser4. DARPA does not endorse
61795 +Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
61796 +opinion, unique in its willingness to invest into things more
61797 +theoretical than the VC community can readily understand, and more
61798 +longterm than allows them to be sure that they will be the ones to
61799 +extract the economic benefits from. DARPA also integrated us into a
61800 +security community that transformed our security worldview.
61801 +
61802 +Vladimir Saveliev is our lead programmer, with us from the beginning,
61803 +and he worked long hours writing the cleanest code. This is why he is
61804 +now the lead programmer after years of commitment to our work. He
61805 +always made the effort to be the best he could be, and to make his
61806 +code the best that it could be. What resulted was quite remarkable. I
61807 +don't think that money can ever motivate someone to work the way he
61808 +did, he is one of the most selfless men I know.
61809 +
61810 +Alexander Lyamin was our sysadmin, and helped to educate us in
61811 +security issues. Moscow State University and IMT were very generous
61812 +in the internet access they provided us, and in lots of other little
61813 +ways that a generous institution can be.
61814 +
61815 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
61816 +locking code, the block allocator, and finished the flushing code.
61817 +His code is always crystal clean and well structured.
61818 +
61819 +Nikita Danilov wrote the core of the balancing code, the core of the
61820 +plugins code, and the directory code. He worked at a steady pace of long
61821 +hours that produced a whole lot of well abstracted code. He is our
61822 +senior computer scientist.
61823 +
61824 +Vladimir Demidov wrote the parser. Writing an in-kernel parser is
61825 +something very few people have the skills for, and it is thanks to
61826 +him that we can say that the parser is really not so big compared to
61827 +various bits of our other code, and making a parser work in the kernel
61828 +was not so complicated as everyone would imagine mainly because it was
61829 +him doing it...
61830 +
61831 +Joshua McDonald wrote the transaction manager, and the flush code.
61832 +The flush code unexpectedly turned out to be extremely hairy for reasons
61833 +you can read about on our web page, and he did a great job on an
61834 +extremely difficult task.
61835 +
61836 +Nina Reiser handled our accounting, government relations, and much
61837 +more.
61838 +
61839 +Ramon Reiser developed our website.
61840 +
61841 +Beverly Palmer drew our graphics.
61842 +
61843 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
61844 +and worked with Umka on developing libreiser4 and userspace plugins.
61845 +
61846 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
61847 +userspace tools (reiser4progs).
61848 +
61849 +Oleg Drokin (aka Green) is the release manager who fixes everything.
61850 +It is so nice to have someone like that on the team. He (plus Chris
61851 +and Jeff) make it possible for the entire rest of the Namesys team to
61852 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
61853 +is just amazing to watch his talent for spotting bugs in action.
61854 +
61855 +Edward Shishkin wrote the cryptcompress file plugin (which manages files
61856 +built of encrypted and/or compressed bodies) and other plugins related
61857 +to transparent encryption and compression support.
61858 diff -urN linux-2.6.33.orig/fs/reiser4/reiser4.h linux-2.6.33/fs/reiser4/reiser4.h
61859 --- linux-2.6.33.orig/fs/reiser4/reiser4.h 1970-01-01 01:00:00.000000000 +0100
61860 +++ linux-2.6.33/fs/reiser4/reiser4.h 2010-03-04 19:33:22.000000000 +0100
61861 @@ -0,0 +1,259 @@
61862 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61863 + * reiser4/README */
61864 +
61865 +/* definitions of common constants used by reiser4 */
61866 +
61867 +#if !defined( __REISER4_H__ )
61868 +#define __REISER4_H__
61869 +
61870 +#include <asm/param.h> /* for HZ */
61871 +#include <linux/errno.h>
61872 +#include <linux/types.h>
61873 +#include <linux/fs.h>
61874 +#include <linux/hardirq.h>
61875 +#include <linux/sched.h>
61876 +
61877 +/*
61878 + * reiser4 compilation options.
61879 + */
61880 +
61881 +#if defined(CONFIG_REISER4_DEBUG)
61882 +/* turn on assertion checks */
61883 +#define REISER4_DEBUG (1)
61884 +#else
61885 +#define REISER4_DEBUG (0)
61886 +#endif
61887 +
61888 +#define REISER4_SHA256 (0)
61889 +
61890 +/*
61891 + * Turn on large keys mode. In this mode (which is the default), a reiser4 key
61892 + * has four 8-byte components. In the old "small key" mode, it has three 8-byte
61893 + * components. The additional component, referred to as "ordering", is used to
61894 + * order the items of which a given object is composed. As such, ordering is
61895 + * placed between locality and objectid. For a directory item, ordering contains
61896 + * an initial prefix of the file name the item is for. This sorts all directory
61897 + * items within a given directory lexicographically (but see
61898 + * fibration.[ch]). For file bodies and stat-data, ordering contains an initial
61899 + * prefix of the name the file was initially created with. In the common case
61900 + * (files with a single name) this orders file bodies and stat-data items the
61901 + * same way as their respective directory entries, thus speeding up
61902 + * readdir.
61903 + *
61904 + * Note that the kernel can only mount a file system with the same key size as
61905 + * the one it was compiled for, so flipping this option may render your data
61906 + * inaccessible.
61907 + */
61908 +#define REISER4_LARGE_KEY (1)
61909 +/*#define REISER4_LARGE_KEY (0)*/
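+
+/*
+ * To make the large-key layout concrete: a key compares component by
+ * component, locality first, then ordering, then objectid, then offset.
+ * The sketch below is a simplified userspace model (demo_key and
+ * demo_keycmp are illustrative names; the real key also packs a 4-bit
+ * type into the first component).
+ */
+#if 0 /* illustrative sketch only, never compiled */
+#include <stdio.h>
+
+/* w[0]=locality(+type), w[1]=ordering, w[2]=objectid, w[3]=offset */
+struct demo_key { unsigned long long w[4]; };
+
+static int demo_keycmp(const struct demo_key *a, const struct demo_key *b)
+{
+	int i;
+
+	for (i = 0; i < 4; i++)
+		if (a->w[i] != b->w[i])
+			return a->w[i] < b->w[i] ? -1 : 1;
+	return 0;
+}
+
+int main(void)
+{
+	/* same directory (locality 42), name prefixes 'a' < 'b': bodies
+	   sort in readdir order even though the oids do not */
+	struct demo_key a = { { 42, 0x61, 100, 0 } };
+	struct demo_key b = { { 42, 0x62, 99, 0 } };
+
+	printf("%d\n", demo_keycmp(&a, &b)); /* prints -1 */
+	return 0;
+}
+#endif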
61910 +
61911 +/*#define GUESS_EXISTS 1*/
61912 +
61913 +/*
61914 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
61915 + * option
61916 + */
61917 +
61918 +extern const char *REISER4_SUPER_MAGIC_STRING;
61919 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
61920 + * beginning of device */
61921 +
61922 +/* here go tunable parameters that are not worth a special entry in the
61923 + kernel configuration */
61924 +
61925 +/* default number of slots in coord-by-key caches */
61926 +#define CBK_CACHE_SLOTS (16)
61927 +/* how many elementary tree operations to carry out on the next level */
61928 +#define CARRIES_POOL_SIZE (5)
61929 +/* size of pool of preallocated nodes for carry process. */
61930 +#define NODES_LOCKED_POOL_SIZE (5)
61931 +
61932 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61933 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61934 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
61935 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
61936 +
61937 +/* we are supporting reservation of disk space on uid basis */
61938 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
61939 +/* we are supporting reservation of disk space for groups */
61940 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
61941 +/* we are supporting reservation of disk space for root */
61942 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
61943 +/* we use rapid flush mode, see flush.c for comments. */
61944 +#define REISER4_USE_RAPID_FLUSH (1)
61945 +
61946 +/*
61947 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
61948 + */
61949 +#define REISER4_USE_ENTD (1)
61950 +
61951 +/* key allocation is Plan-A */
61952 +#define REISER4_PLANA_KEY_ALLOCATION (1)
61953 +/* key allocation follows good old 3.x scheme */
61954 +#define REISER4_3_5_KEY_ALLOCATION (0)
61955 +
61956 +/* size of hash-table for znodes */
61957 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
61958 +
61959 +/* number of buckets in lnode hash-table */
61960 +#define LNODE_HTABLE_BUCKETS (1024)
61961 +
61962 +/* a ridiculously high maximal limit on the height of the znode tree. This
61963 + is used in declarations of various per-level arrays and
61964 + to allocate a statistics-gathering array for per-level stats. */
61965 +#define REISER4_MAX_ZTREE_HEIGHT (8)
61966 +
61967 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
61968 +
61969 +/* If an array contains fewer than REISER4_SEQ_SEARCH_BREAK elements, then
61970 + sequential search is on average faster than binary search. This is because
61971 + of better optimization and because sequential search is more CPU-cache
61972 + friendly. This number (25) was found by experiments on a dual AMD
61973 + Athlon(tm), 1400MHz.
61974 +
61975 + NOTE: testing in the kernel has shown that binary search is more effective
61976 + than implied by the results of user-level benchmarking, probably because in
61977 + a node the keys are separated by other data. So the value was adjusted after
61978 + a few tests. More thorough tuning is needed.
61979 +*/
61980 +#define REISER4_SEQ_SEARCH_BREAK (3)
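+
+/*
+ * The break-even heuristic above, as a runnable userspace sketch: scan
+ * sequentially below the threshold, bisect above it. hybrid_find() is an
+ * illustrative stand-in, not the node-layout-aware search that reiser4's
+ * node plugins actually implement.
+ */
+#if 0 /* illustrative sketch only, never compiled */
+#include <stdio.h>
+
+static int hybrid_find(const int *v, int n, int key)
+{
+	int i, lo, hi;
+
+	if (n < 3 /* REISER4_SEQ_SEARCH_BREAK */) {
+		for (i = 0; i < n; i++)
+			if (v[i] == key)
+				return i;
+		return -1;
+	}
+	for (lo = 0, hi = n - 1; lo <= hi;) {
+		int mid = lo + (hi - lo) / 2;
+
+		if (v[mid] == key)
+			return mid;
+		if (v[mid] < key)
+			lo = mid + 1;
+		else
+			hi = mid - 1;
+	}
+	return -1;
+}
+
+int main(void)
+{
+	int v[] = { 2, 3, 5, 7, 11 };
+
+	printf("%d\n", hybrid_find(v, 5, 7)); /* prints 3 */
+	return 0;
+}
+#endif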
61981 +
61982 +/* don't allow tree to be lower than this */
61983 +#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
61984 +
61985 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
61986 + * available memory. */
61987 +/* Default value of the maximal atom size. Can be overwritten by the
61988 + tmgr.atom_max_size mount option. By default, infinity. */
61989 +#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
61990 +
61991 +/* Default value of maximal atom age (in jiffies). After reaching this age
61992 + atom will be forced to commit, either synchronously or asynchronously. Can
61993 + be overwritten by tmgr.atom_max_age mount option. */
61994 +#define REISER4_ATOM_MAX_AGE (600 * HZ)
61995 +
61996 +/* sleeping period for ktxnmgrd */
61997 +#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
61998 +
61999 +/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
62000 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
62001 +
62002 +/* start complaining after that many restarts in coord_by_key().
62003 +
62004 + This either means incredibly heavy contention for this part of a tree, or
62005 + some corruption or bug.
62006 +*/
62007 +#define REISER4_CBK_ITERATIONS_LIMIT (100)
62008 +
62009 +/* return -EIO after that many iterations in coord_by_key().
62010 +
62011 + I have witnessed more than 800 iterations (in 30 thread test) before cbk
62012 + finished. --nikita
62013 +*/
62014 +#define REISER4_MAX_CBK_ITERATIONS 500000
62015 +
62016 +/* put a per-inode limit on the maximal number of directory entries with
62017 + identical keys in a hashed directory.
62018 +
62019 + Disable this until inheritance interfaces stabilize: we need some way to
62020 + set a per-directory limit.
62021 +*/
62022 +#define REISER4_USE_COLLISION_LIMIT (0)
62023 +
62024 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level
62025 + blocks it will force them to be relocated. */
62026 +#define FLUSH_RELOCATE_THRESHOLD 64
62027 +/* If flush can find a block allocation closer than FLUSH_RELOCATE_DISTANCE
62028 + blocks from the preceder, it will relocate to that position.
62029 + */
62030 +#define FLUSH_RELOCATE_DISTANCE 64
62031 +
62032 +/* If we have written this many blocks or more before encountering a busy
62033 + jnode in the flush list, abort flushing, hoping that the next time we are
62034 + called this jnode will already be clean, and we will save some seeks. */
62035 +#define FLUSH_WRITTEN_THRESHOLD 50
62036 +
62037 +/* The maximum number of nodes to scan left on a level during flush. */
62038 +#define FLUSH_SCAN_MAXNODES 10000
62039 +
62040 +/* per-atom limit of flushers */
62041 +#define ATOM_MAX_FLUSHERS (1)
62042 +
62043 +/* default tracing buffer size */
62044 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
62045 +
62046 +/* what size units of IO we would like cp, etc., to use, in writing to
62047 + reiser4. In bytes.
62048 +
62049 + Can be overwritten by optimal_io_size mount option.
62050 +*/
62051 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
62052 +
62053 +/* see comments in inode.c:oid_to_uino() */
62054 +#define REISER4_UINO_SHIFT (1 << 30)
62055 +
62056 +/* Mark function argument as unused to avoid compiler warnings. */
62057 +#define UNUSED_ARG __attribute__((unused))
62058 +
62059 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
62060 +#define NONNULL __attribute__((nonnull))
62061 +#else
62062 +#define NONNULL
62063 +#endif
62064 +
62065 +/* master super block offset in bytes.*/
62066 +#define REISER4_MASTER_OFFSET 65536
62067 +
62068 +/* size of VFS block */
62069 +#define VFS_BLKSIZE 512
62070 +/* number of bits in size of VFS block (512==2^9) */
62071 +#define VFS_BLKSIZE_BITS 9
62072 +
62073 +#define REISER4_I reiser4_inode_data
62074 +
62075 +/* implication */
62076 +#define ergo(antecedent, consequent) (!(antecedent) || (consequent))
62077 +/* logical equivalence */
62078 +#define equi(p1, p2) (ergo((p1), (p2)) && ergo((p2), (p1)))
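+
+/*
+ * A small usage sketch: these predicates read naturally inside assertions,
+ * e.g. "dirty implies locked". demo_ergo below just restates ergo() so the
+ * fragment is self-contained; it is illustrative, not part of reiser4.
+ */
+#if 0 /* illustrative sketch only, never compiled */
+#include <assert.h>
+
+#define demo_ergo(antecedent, consequent) (!(antecedent) || (consequent))
+
+int main(void)
+{
+	int locked = 1, dirty = 1;
+
+	assert(demo_ergo(dirty, locked));	/* dirty implies locked */
+	assert(demo_ergo(0, 12345));	/* vacuously true: antecedent false */
+	return 0;
+}
+#endif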
62079 +
62080 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
62081 +
62082 +#define NOT_YET (0)
62083 +
62084 +/** Reiser4 specific error codes **/
62085 +
62086 +#define REISER4_ERROR_CODE_BASE 10000
62087 +
62088 +/* Neighbor is not available (side neighbor or parent) */
62089 +#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
62090 +
62091 +/* Node was not found in cache */
62092 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
62093 +
62094 +/* node does not have enough free space to complete the balancing operation */
62095 +#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
62096 +
62097 +/* repeat operation */
62098 +#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
62099 +
62100 +/* a deadlock was detected */
62101 +#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
62102 +
62103 +/* operation cannot be performed, because it would block and non-blocking mode
62104 + * was requested. */
62105 +#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
62106 +
62107 +/* wait some event (depends on context), then repeat */
62108 +#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
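+
+/*
+ * A sketch of the E_REPEAT convention: callers loop until the operation
+ * settles. In the kernel these codes travel negated through RETERR(); the
+ * userspace model below (demo_op and friends are illustrative names)
+ * ignores the sign for brevity.
+ */
+#if 0 /* illustrative sketch only, never compiled */
+#include <stdio.h>
+
+#define DEMO_E_REPEAT 10003	/* mirrors E_REPEAT above */
+
+static int attempts_left = 2;
+
+/* stand-in for an operation that may ask its caller to retry */
+static int demo_op(void)
+{
+	return attempts_left-- > 0 ? DEMO_E_REPEAT : 0;
+}
+
+int main(void)
+{
+	int ret;
+
+	do {
+		ret = demo_op();
+	} while (ret == DEMO_E_REPEAT);
+	printf("settled with %d\n", ret); /* prints 0 after two retries */
+	return 0;
+}
+#endif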
62109 +
62110 +#endif /* __REISER4_H__ */
62111 +
62112 +/* Make Linus happy.
62113 + Local variables:
62114 + c-indentation-style: "K&R"
62115 + mode-name: "LC"
62116 + c-basic-offset: 8
62117 + tab-width: 8
62118 + fill-column: 120
62119 + End:
62120 +*/
62121 diff -urN linux-2.6.33.orig/fs/reiser4/safe_link.c linux-2.6.33/fs/reiser4/safe_link.c
62122 --- linux-2.6.33.orig/fs/reiser4/safe_link.c 1970-01-01 01:00:00.000000000 +0100
62123 +++ linux-2.6.33/fs/reiser4/safe_link.c 2010-03-04 19:33:22.000000000 +0100
62124 @@ -0,0 +1,354 @@
62125 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
62126 + * reiser4/README */
62127 +
62128 +/* Safe-links. */
62129 +
62130 +/*
62131 + * Safe-links are used to maintain file system consistency during operations
62132 + * that spawn multiple transactions. For example:
62133 + *
62134 + * 1. Unlink. UNIX supports "open-but-unlinked" files, that is, files
62135 + * without user-visible names in the file system, but still opened by some
62136 + * active process. What happens here is that unlink proper (i.e., removal
62137 + * of the last file name) and file deletion (truncate of the file body to
62138 + * zero and deletion of stat-data, which happens when the last file
62139 + * descriptor is closed) may belong to different transactions T1 and T2. If
62140 + * a crash happens after T1 commits, but before T2 commits, the on-disk file
62141 + * system has a file without a name, that is, a disk space leak.
62142 + *
62143 + * 2. Truncate. Truncate of a large file may spawn multiple transactions. If
62144 + * the system crashes while truncate is in progress, the file is left
62145 + * partially truncated, which violates the "atomicity guarantees" of
62146 + * reiser4, viz. that every system call is atomic.
62147 + *
62148 + * Safe-links address both cases above. Basically, a safe-link is a way to
62149 + * post some operation to be executed during the commit of some transaction
62150 + * other than the current one. (Another way to look at a safe-link is to
62151 + * interpret it as logical logging.)
62152 + *
62153 + * Specifically, at the beginning of unlink a safe-link is inserted into the
62154 + * tree. This safe-link is normally removed by the file deletion code (during
62155 + * transaction T2 in the above terms). Truncate also inserts a safe-link that
62156 + * is normally removed when the truncate operation finishes.
62157 + *
62158 + * This means that in the case of a "clean umount" there are no safe-links in
62159 + * the tree. If safe-links are observed during mount, it means that (a) the
62160 + * system was terminated abnormally, and (b) the safe-links correspond to the
62161 + * "pending" (i.e., not finished) operations that were in progress during
62162 + * system termination. Each safe-link records enough information to complete
62163 + * the corresponding operation, and mount simply "replays" them (hence the
62164 + * analogy with logical logging).
62165 + *
62166 + * Safe-links are implemented as blackbox items (see
62167 + * plugin/item/blackbox.[ch]).
62168 + *
62169 + * For reference: ext3 also has a similar mechanism, called "an orphan
62170 + * list" there.
62171 + */
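+
+/*
+ * A minimal userspace sketch of the replay idea described above: pending
+ * operations recorded before the crash are completed at mount time. The
+ * demo_safelink table and handlers are illustrative stand-ins for the
+ * blackbox items and object plugin methods used by the real code.
+ */
+#if 0 /* illustrative sketch only, never compiled */
+#include <stdio.h>
+
+enum demo_link { DEMO_UNLINK, DEMO_TRUNCATE };
+
+struct demo_safelink {
+	unsigned long long oid;		/* object the pending op applies to */
+	enum demo_link link;		/* which operation to complete */
+	unsigned long long size;	/* final size, for truncate */
+};
+
+/* what "survived the crash": ops whose second transaction never committed */
+static struct demo_safelink pending[] = {
+	{ 100, DEMO_UNLINK, 0 },
+	{ 101, DEMO_TRUNCATE, 4096 },
+};
+
+int main(void)
+{
+	unsigned int i;
+
+	/* "mount-time replay": finish each operation, then the real code
+	   would delete the corresponding safe-link record */
+	for (i = 0; i < sizeof pending / sizeof pending[0]; i++) {
+		if (pending[i].link == DEMO_UNLINK)
+			printf("oid %llu: finish deletion\n", pending[i].oid);
+		else
+			printf("oid %llu: truncate to %llu\n",
+			       pending[i].oid, pending[i].size);
+	}
+	return 0;
+}
+#endif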
62172 +
62173 +#include "safe_link.h"
62174 +#include "debug.h"
62175 +#include "inode.h"
62176 +
62177 +#include "plugin/item/blackbox.h"
62178 +
62179 +#include <linux/fs.h>
62180 +
62181 +/*
62182 + * On-disk format of safe-link.
62183 + */
62184 +typedef struct safelink {
62185 + reiser4_key sdkey; /* key of stat-data for the file safe-link is
62186 + * for */
62187 + d64 size; /* size to which file should be truncated */
62188 +} safelink_t;
62189 +
62190 +/*
62191 + * locality where safe-link items are stored. Next to the objectid of root
62192 + * directory.
62193 + */
62194 +static oid_t safe_link_locality(reiser4_tree * tree)
62195 +{
62196 + return get_key_objectid(get_super_private(tree->super)->df_plug->
62197 + root_dir_key(tree->super)) + 1;
62198 +}
62199 +
62200 +/*
62201 + Construct a key for the safe-link. Key has the following format:
62202 +
62203 +| 60 | 4 | 64 | 4 | 60 | 64 |
62204 ++---------------+---+------------------+---+---------------+------------------+
62205 +| locality | 0 | 0 | 0 | objectid | link type |
62206 ++---------------+---+------------------+---+---------------+------------------+
62207 +| | | | |
62208 +| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
62209 +
62210 + This is the large-key format. In the small-key format the second 8-byte
62211 + chunk is absent. Locality is a constant returned by safe_link_locality().
62212 + objectid is the oid of the file on which the operation protected by this
62213 + safe-link is performed. link-type is used to distinguish safe-links for
62214 + different operations.
62215 +
62216 + */
62217 +static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
62218 + reiser4_safe_link_t link, reiser4_key * key)
62219 +{
62220 + reiser4_key_init(key);
62221 + set_key_locality(key, safe_link_locality(tree));
62222 + set_key_objectid(key, oid);
62223 + set_key_offset(key, link);
62224 + return key;
62225 +}
62226 +
62227 +/*
62228 + * how much disk space is necessary to insert and remove (in the
62229 + * error-handling path) a safe-link.
62230 + */
62231 +static __u64 safe_link_tograb(reiser4_tree * tree)
62232 +{
62233 + return
62234 + /* insert safe link */
62235 + estimate_one_insert_item(tree) +
62236 + /* remove safe link */
62237 + estimate_one_item_removal(tree) +
62238 + /* drill to the leaf level during insertion */
62239 + 1 + estimate_one_insert_item(tree) +
62240 + /*
62241 + * possible update of existing safe-link. Actually, if
62242 + * safe-link existed already (we failed to remove it), then no
62243 + * insertion is necessary, so this term is already "covered",
62244 + * but for simplicity let's leave it.
62245 + */
62246 + 1;
62247 +}
62248 +
62249 +/*
62250 + * grab enough disk space to insert and remove (in the error-handling path)
62251 + * a safe-link.
62252 + */
62253 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
62254 +{
62255 + int result;
62256 +
62257 + grab_space_enable();
62258 + /* The sbinfo->delete_mutex can be taken here.
62259 + * safe_link_release() should be called before leaving reiser4
62260 + * context. */
62261 + result =
62262 + reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
62263 + grab_space_enable();
62264 + return result;
62265 +}
62266 +
62267 +/*
62268 + * release unused disk space reserved by safe_link_grab().
62269 + */
62270 +void safe_link_release(reiser4_tree * tree)
62271 +{
62272 + reiser4_release_reserved(tree->super);
62273 +}
62274 +
62275 +/*
62276 + * insert into tree safe-link for operation @link on inode @inode.
62277 + */
62278 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
62279 +{
62280 + reiser4_key key;
62281 + safelink_t sl;
62282 + int length;
62283 + int result;
62284 + reiser4_tree *tree;
62285 +
62286 + build_sd_key(inode, &sl.sdkey);
62287 + length = sizeof sl.sdkey;
62288 +
62289 + if (link == SAFE_TRUNCATE) {
62290 + /*
62291 + * for truncate we have to store final file length also,
62292 + * expand item.
62293 + */
62294 + length += sizeof(sl.size);
62295 + put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
62296 + }
62297 + tree = reiser4_tree_by_inode(inode);
62298 + build_link_key(tree, get_inode_oid(inode), link, &key);
62299 +
62300 + result = store_black_box(tree, &key, &sl, length);
62301 + if (result == -EEXIST)
62302 + result = update_black_box(tree, &key, &sl, length);
62303 + return result;
62304 +}
62305 +
62306 +/*
62307 + * remove safe-link corresponding to the operation @link on inode @inode from
62308 + * the tree.
62309 + */
62310 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
62311 +{
62312 + reiser4_key key;
62313 +
62314 + return kill_black_box(tree, build_link_key(tree, oid, link, &key));
62315 +}
62316 +
62317 +/*
62318 + * in-memory structure to keep information extracted from safe-link. This is
62319 + * used to iterate over all safe-links.
62320 + */
62321 +struct safe_link_context {
62322 + reiser4_tree *tree; /* internal tree */
62323 + reiser4_key key; /* safe-link key */
62324 + reiser4_key sdkey; /* key of object stat-data */
62325 + reiser4_safe_link_t link; /* safe-link type */
62326 + oid_t oid; /* object oid */
62327 + __u64 size; /* final size for truncate */
62328 +};
62329 +
62330 +/*
62331 + * start iterating over all safe-links.
62332 + */
62333 +static void safe_link_iter_begin(reiser4_tree * tree,
62334 + struct safe_link_context *ctx)
62335 +{
62336 + ctx->tree = tree;
62337 + reiser4_key_init(&ctx->key);
62338 + set_key_locality(&ctx->key, safe_link_locality(tree));
62339 + set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
62340 + set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
62341 +}
62342 +
62343 +/*
62344 + * return next safe-link.
62345 + */
62346 +static int safe_link_iter_next(struct safe_link_context *ctx)
62347 +{
62348 + int result;
62349 + safelink_t sl;
62350 +
62351 + result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
62352 + if (result == 0) {
62353 + ctx->oid = get_key_objectid(&ctx->key);
62354 + ctx->link = get_key_offset(&ctx->key);
62355 + ctx->sdkey = sl.sdkey;
62356 + if (ctx->link == SAFE_TRUNCATE)
62357 + ctx->size = le64_to_cpu(get_unaligned(&sl.size));
62358 + }
62359 + return result;
62360 +}
62361 +
62362 +/*
62363 + * check whether there are any more safe-links left in the tree.
62364 + */
62365 +static int safe_link_iter_finished(struct safe_link_context *ctx)
62366 +{
62367 + return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
62368 +}
62369 +
62370 +/*
62371 + * finish safe-link iteration.
62372 + */
62373 +static void safe_link_iter_end(struct safe_link_context *ctx)
62374 +{
62375 + /* nothing special */
62376 +}
62377 +
62378 +/*
62379 + * process single safe-link.
62380 + */
62381 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
62382 + reiser4_key * sdkey, oid_t oid, __u64 size)
62383 +{
62384 + struct inode *inode;
62385 + int result;
62386 +
62387 + /*
62388 + * obtain object inode by reiser4_iget(), then call object plugin
62389 + * ->safelink() method to do actual work, then delete safe-link on
62390 + * success.
62391 + */
62392 + inode = reiser4_iget(super, sdkey, 1);
62393 + if (!IS_ERR(inode)) {
62394 + file_plugin *fplug;
62395 +
62396 + fplug = inode_file_plugin(inode);
62397 + assert("nikita-3428", fplug != NULL);
62398 + assert("", oid == get_inode_oid(inode));
62399 + if (fplug->safelink != NULL) {
62400 + /* reiser4_txn_restart_current is not necessary because
62401 + * mounting is single-threaded. However, without it the
62402 + * deadlock detection code will complain (see
62403 + * nikita-3361). */
62404 + reiser4_txn_restart_current();
62405 + result = fplug->safelink(inode, link, size);
62406 + } else {
62407 + warning("nikita-3430",
62408 + "Cannot handle safelink for %lli",
62409 + (unsigned long long)oid);
62410 + reiser4_print_key("key", sdkey);
62411 + result = 0;
62412 + }
62413 + if (result != 0) {
62414 + warning("nikita-3431",
62415 + "Error processing safelink for %lli: %i",
62416 + (unsigned long long)oid, result);
62417 + }
62418 + reiser4_iget_complete(inode);
62419 + iput(inode);
62420 + if (result == 0) {
62421 + result = safe_link_grab(reiser4_get_tree(super),
62422 + BA_CAN_COMMIT);
62423 + if (result == 0)
62424 + result =
62425 + safe_link_del(reiser4_get_tree(super), oid,
62426 + link);
62427 + safe_link_release(reiser4_get_tree(super));
62428 + /*
62429 + * restart transaction: if there was large number of
62430 + * safe-links, their processing may fail to fit into
62431 + * single transaction.
62432 + */
62433 + if (result == 0)
62434 + reiser4_txn_restart_current();
62435 + }
62436 + } else
62437 + result = PTR_ERR(inode);
62438 + return result;
62439 +}
62440 +
62441 +/*
62442 + * iterate over all safe-links in the file-system processing them one by one.
62443 + */
62444 +int process_safelinks(struct super_block *super)
62445 +{
62446 + struct safe_link_context ctx;
62447 + int result;
62448 +
62449 + if (rofs_super(super))
62450 + /* do nothing on the read-only file system */
62451 + return 0;
62452 + safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
62453 + result = 0;
62454 + do {
62455 + result = safe_link_iter_next(&ctx);
62456 + if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
62457 + result = 0;
62458 + break;
62459 + }
62460 + if (result == 0)
62461 + result = process_safelink(super, ctx.link,
62462 + &ctx.sdkey, ctx.oid,
62463 + ctx.size);
62464 + } while (result == 0);
62465 + safe_link_iter_end(&ctx);
62466 + return result;
62467 +}
62468 +
62469 +/* Make Linus happy.
62470 + Local variables:
62471 + c-indentation-style: "K&R"
62472 + mode-name: "LC"
62473 + c-basic-offset: 8
62474 + tab-width: 8
62475 + fill-column: 120
62476 + scroll-step: 1
62477 + End:
62478 +*/
62479 diff -urN linux-2.6.33.orig/fs/reiser4/safe_link.h linux-2.6.33/fs/reiser4/safe_link.h
62480 --- linux-2.6.33.orig/fs/reiser4/safe_link.h 1970-01-01 01:00:00.000000000 +0100
62481 +++ linux-2.6.33/fs/reiser4/safe_link.h 2010-03-04 19:33:22.000000000 +0100
62482 @@ -0,0 +1,29 @@
62483 +/* Copyright 2003 by Hans Reiser, licensing governed by
62484 + * reiser4/README */
62485 +
62486 +/* Safe-links. See safe_link.c for details. */
62487 +
62488 +#if !defined(__FS_SAFE_LINK_H__)
62489 +#define __FS_SAFE_LINK_H__
62490 +
62491 +#include "tree.h"
62492 +
62493 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
62494 +void safe_link_release(reiser4_tree * tree);
62495 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
62496 +int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
62497 +
62498 +int process_safelinks(struct super_block *super);
62499 +
62500 +/* __FS_SAFE_LINK_H__ */
62501 +#endif
62502 +
62503 +/* Make Linus happy.
62504 + Local variables:
62505 + c-indentation-style: "K&R"
62506 + mode-name: "LC"
62507 + c-basic-offset: 8
62508 + tab-width: 8
62509 + fill-column: 120
62510 + End:
62511 +*/
62512 diff -urN linux-2.6.33.orig/fs/reiser4/seal.c linux-2.6.33/fs/reiser4/seal.c
62513 --- linux-2.6.33.orig/fs/reiser4/seal.c 1970-01-01 01:00:00.000000000 +0100
62514 +++ linux-2.6.33/fs/reiser4/seal.c 2010-03-04 19:33:22.000000000 +0100
62515 @@ -0,0 +1,218 @@
62516 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62517 +/* Seals implementation. */
62518 +/* Seals are "weak" tree pointers. They are analogous to tree coords in that
62519 + they allow bypassing tree traversal. But normal usage of coords implies that
62520 + the node pointed to by a coord is locked, whereas seals don't keep a lock (or
62521 + even a reference) to the znode. Instead, each znode contains a version number,
62522 + increased on each znode modification. This version number is copied into a
62523 + seal when the seal is created. Later, one can "validate" a seal by calling
62524 + reiser4_seal_validate(). If the znode is in the cache and its version number
62525 + is still the same, the seal is "pristine" and the coord associated with it
62526 + can be re-used immediately.
62527 +
62528 + If, on the other hand, the znode is out of the cache, or it is obviously a
62529 + different one from the znode the seal was initially attached to (for example,
62530 + it is on a different level, or is being removed from the tree), the seal is
62531 + irreparably invalid ("burned") and tree traversal has to be repeated.
62532 +
62533 + Otherwise, there is some hope that, while the znode was modified (and the
62534 + seal was "broken" as a result), the key attached to the seal is still in the
62535 + node. This is checked by first comparing this key with the delimiting keys
62536 + of the node and, if the key is ok, doing an intra-node lookup.
62537 +
62538 + The znode version is maintained in the following way:
62539 +
62540 + there is a reiser4_tree.znode_epoch counter. Whenever a new znode is created,
62541 + znode_epoch is incremented and its new value is stored in the ->version field
62542 + of the new znode. Whenever a znode is dirtied (which means it was probably
62543 + modified), znode_epoch is also incremented and its new value is stored in
62544 + znode->version. This is done because just incrementing znode->version on
62545 + each update is not enough: it may happen that a znode gets deleted, a new
62546 + znode is allocated for the same disk block and gets the same version
62547 + counter, tricking the seal code into a false positive.
62548 +*/
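+
+/*
+ * The version/epoch scheme above, reduced to a runnable userspace sketch:
+ * a seal remembers the node's version; dirtying stamps the node from a
+ * global epoch, so a stale seal compares unequal. All demo_* names are
+ * illustrative, not the actual znode or seal types.
+ */
+#if 0 /* illustrative sketch only, never compiled */
+#include <stdio.h>
+
+struct demo_node { unsigned long long version; };
+struct demo_seal { unsigned long long version; };
+
+static unsigned long long demo_epoch = 1;
+
+static void demo_seal_init(struct demo_seal *s, const struct demo_node *n)
+{
+	s->version = n->version;	/* remember version at creation */
+}
+
+/* stamping from a global epoch (not ++n->version) avoids the false
+   positive when a deleted node's block is reused, as explained above */
+static void demo_dirty(struct demo_node *n)
+{
+	n->version = ++demo_epoch;
+}
+
+static int demo_seal_pristine(const struct demo_seal *s,
+			      const struct demo_node *n)
+{
+	return s->version == n->version;
+}
+
+int main(void)
+{
+	struct demo_node node = { demo_epoch };
+	struct demo_seal seal;
+
+	demo_seal_init(&seal, &node);
+	printf("%d\n", demo_seal_pristine(&seal, &node)); /* 1: pristine */
+	demo_dirty(&node);
+	printf("%d\n", demo_seal_pristine(&seal, &node)); /* 0: broken */
+	return 0;
+}
+#endif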
62549 +
62550 +#include "forward.h"
62551 +#include "debug.h"
62552 +#include "key.h"
62553 +#include "coord.h"
62554 +#include "seal.h"
62555 +#include "plugin/item/item.h"
62556 +#include "plugin/node/node.h"
62557 +#include "jnode.h"
62558 +#include "znode.h"
62559 +#include "super.h"
62560 +
62561 +static znode *seal_node(const seal_t *seal);
62562 +static int seal_matches(const seal_t *seal, znode * node);
62563 +
62564 +/* initialise seal. This can be called several times on the same seal. @coord
62565 + and @key can be NULL. */
62566 +void reiser4_seal_init(seal_t *seal /* seal to initialise */ ,
62567 + const coord_t *coord /* coord @seal will be
62568 + * attached to */ ,
62569 + const reiser4_key * key UNUSED_ARG /* key @seal will be
62570 + * attached to */ )
62571 +{
62572 + assert("nikita-1886", seal != NULL);
62573 + memset(seal, 0, sizeof *seal);
62574 + if (coord != NULL) {
62575 + znode *node;
62576 +
62577 + node = coord->node;
62578 + assert("nikita-1987", node != NULL);
62579 + spin_lock_znode(node);
62580 + seal->version = node->version;
62581 + assert("nikita-1988", seal->version != 0);
62582 + seal->block = *znode_get_block(node);
62583 +#if REISER4_DEBUG
62584 + seal->coord1 = *coord;
62585 + if (key != NULL)
62586 + seal->key = *key;
62587 +#endif
62588 + spin_unlock_znode(node);
62589 + }
62590 +}
62591 +
62592 +/* finish with seal */
62593 +void reiser4_seal_done(seal_t *seal/* seal to clear */)
62594 +{
62595 + assert("nikita-1887", seal != NULL);
62596 + seal->version = 0;
62597 +}
62598 +
62599 +/* true if seal was initialised */
62600 +int reiser4_seal_is_set(const seal_t *seal/* seal to query */)
62601 +{
62602 + assert("nikita-1890", seal != NULL);
62603 + return seal->version != 0;
62604 +}
62605 +
62606 +#if REISER4_DEBUG
62607 +/* helper function for reiser4_seal_validate(). It checks that the item at
62608 + * @coord has the expected key. This is to detect cases where a node was
62609 + * modified but wasn't marked dirty. */
62610 +static inline int check_seal_match(const coord_t *coord /* coord to check */ ,
62611 + const reiser4_key * k/* expected key */)
62612 +{
62613 + reiser4_key ukey;
62614 +
62615 + return (coord->between != AT_UNIT) ||
62616 + /* FIXME-VS: we only can compare keys for items whose units
62617 + represent exactly one key */
62618 + ((coord_is_existing_unit(coord))
62619 + && (item_is_extent(coord)
62620 + || keyeq(k, unit_key_by_coord(coord, &ukey))))
62621 + || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
62622 + && keyge(k, unit_key_by_coord(coord, &ukey)));
62623 +}
62624 +#endif
62625 +
62626 +/* this is used by reiser4_seal_validate. It accepts the return value of
62627 + * longterm_lock_znode and returns 1 if it can be interpreted as a seal
62628 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
62629 + * reiser4_seal_validate returns -E_REPEAT and the caller will redo the tree search.
62630 + * We cannot do this in longterm_lock_znode(), because sometimes we want to
62631 + * distinguish between -EINVAL and -E_REPEAT. */
62632 +static int should_repeat(int return_code)
62633 +{
62634 + return return_code == -EINVAL;
62635 +}
62636 +
62637 +/* (re-)validate seal.
62638 +
62639 + Checks whether the seal is pristine, and tries to revalidate it if possible.
62640 +
62641 + If the seal was burned, or broken irreparably, return -E_REPEAT.
62642 +
62643 + NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if the key we
62644 + are looking for is in the range of keys covered by the sealed node, but the
62645 + item wasn't found by the node's ->lookup() method. The alternative is to
62646 + return -ENOENT in this case, but that would complicate the callers' logic.
62647 +
62648 +*/
62649 +int reiser4_seal_validate(seal_t *seal /* seal to validate */,
62650 + coord_t *coord /* coord to validate against */,
62651 + const reiser4_key * key /* key to validate against */,
62652 + lock_handle * lh /* resulting lock handle */,
62653 + znode_lock_mode mode /* lock node */,
62654 + znode_lock_request request/* locking priority */)
62655 +{
62656 + znode *node;
62657 + int result;
62658 +
62659 + assert("nikita-1889", seal != NULL);
62660 + assert("nikita-1881", reiser4_seal_is_set(seal));
62661 + assert("nikita-1882", key != NULL);
62662 + assert("nikita-1883", coord != NULL);
62663 + assert("nikita-1884", lh != NULL);
62664 + assert("nikita-1885", keyeq(&seal->key, key));
62665 + assert("nikita-1989", coords_equal(&seal->coord1, coord));
62666 +
62667 + /* obtain znode by block number */
62668 + node = seal_node(seal);
62669 + if (node != NULL) {
62670 + /* znode was in cache, lock it */
62671 + result = longterm_lock_znode(lh, node, mode, request);
62672 + zput(node);
62673 + if (result == 0) {
62674 + if (seal_matches(seal, node)) {
62675 + /* if seal version and znode version
62676 + coincide */
62677 + ON_DEBUG(coord_update_v(coord));
62678 + assert("nikita-1990",
62679 + node == seal->coord1.node);
62680 + assert("nikita-1898",
62681 + WITH_DATA_RET(coord->node, 1,
62682 + check_seal_match(coord,
62683 + key)));
62684 + } else
62685 + result = RETERR(-E_REPEAT);
62686 + }
62687 + if (result != 0) {
62688 + if (should_repeat(result))
62689 + result = RETERR(-E_REPEAT);
62690 + /* unlock node on failure */
62691 + done_lh(lh);
62692 + }
62693 + } else {
62694 + /* znode wasn't in cache */
62695 + result = RETERR(-E_REPEAT);
62696 + }
62697 + return result;
62698 +}
62699 +
62700 +/* helper functions */
62701 +
62702 +/* obtain reference to znode seal points to, if in cache */
62703 +static znode *seal_node(const seal_t *seal/* seal to query */)
62704 +{
62705 + assert("nikita-1891", seal != NULL);
62706 + return zlook(current_tree, &seal->block);
62707 +}
62708 +
62709 +/* true if @seal version and @node version coincide */
62710 +static int seal_matches(const seal_t *seal /* seal to check */ ,
62711 + znode * node/* node to check */)
62712 +{
62713 + int result;
62714 +
62715 + assert("nikita-1991", seal != NULL);
62716 + assert("nikita-1993", node != NULL);
62717 +
62718 + spin_lock_znode(node);
62719 + result = (seal->version == node->version);
62720 + spin_unlock_znode(node);
62721 + return result;
62722 +}
62723 +
62724 +/* Make Linus happy.
62725 + Local variables:
62726 + c-indentation-style: "K&R"
62727 + mode-name: "LC"
62728 + c-basic-offset: 8
62729 + tab-width: 8
62730 + fill-column: 120
62731 + scroll-step: 1
62732 + End:
62733 +*/
62734 diff -urN linux-2.6.33.orig/fs/reiser4/seal.h linux-2.6.33/fs/reiser4/seal.h
62735 --- linux-2.6.33.orig/fs/reiser4/seal.h 1970-01-01 01:00:00.000000000 +0100
62736 +++ linux-2.6.33/fs/reiser4/seal.h 2010-03-04 19:33:22.000000000 +0100
62737 @@ -0,0 +1,49 @@
62738 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62739 +
62740 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
62741 +
62742 +#ifndef __SEAL_H__
62743 +#define __SEAL_H__
62744 +
62745 +#include "forward.h"
62746 +#include "debug.h"
62747 +#include "dformat.h"
62748 +#include "key.h"
62749 +#include "coord.h"
62750 +
62751 +/* for __u?? types */
62752 +/*#include <linux/types.h>*/
62753 +
62754 +/* seal. See comment at the top of seal.c */
62755 +typedef struct seal_s {
62756 + /* version of the znode, recorded at the time of seal creation */
62757 + __u64 version;
62758 + /* block number of znode attached to this seal */
62759 + reiser4_block_nr block;
62760 +#if REISER4_DEBUG
62761 + /* coord this seal is attached to. For debugging. */
62762 + coord_t coord1;
62763 + /* key this seal is attached to. For debugging. */
62764 + reiser4_key key;
62765 +#endif
62766 +} seal_t;
62767 +
62768 +extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
62769 +extern void reiser4_seal_done(seal_t *);
62770 +extern int reiser4_seal_is_set(const seal_t *);
62771 +extern int reiser4_seal_validate(seal_t *, coord_t *,
62772 + const reiser4_key *, lock_handle * ,
62773 + znode_lock_mode mode, znode_lock_request request);
62774 +
62775 +/* __SEAL_H__ */
62776 +#endif
62777 +
62778 +/* Make Linus happy.
62779 + Local variables:
62780 + c-indentation-style: "K&R"
62781 + mode-name: "LC"
62782 + c-basic-offset: 8
62783 + tab-width: 8
62784 + fill-column: 120
62785 + End:
62786 +*/
62787 diff -urN linux-2.6.33.orig/fs/reiser4/search.c linux-2.6.33/fs/reiser4/search.c
62788 --- linux-2.6.33.orig/fs/reiser4/search.c 1970-01-01 01:00:00.000000000 +0100
62789 +++ linux-2.6.33/fs/reiser4/search.c 2010-03-04 19:33:22.000000000 +0100
62790 @@ -0,0 +1,1612 @@
62791 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62792 + * reiser4/README */
62793 +
62794 +#include "forward.h"
62795 +#include "debug.h"
62796 +#include "dformat.h"
62797 +#include "key.h"
62798 +#include "coord.h"
62799 +#include "seal.h"
62800 +#include "plugin/item/item.h"
62801 +#include "plugin/node/node.h"
62802 +#include "plugin/plugin.h"
62803 +#include "jnode.h"
62804 +#include "znode.h"
62805 +#include "block_alloc.h"
62806 +#include "tree_walk.h"
62807 +#include "tree.h"
62808 +#include "reiser4.h"
62809 +#include "super.h"
62810 +#include "inode.h"
62811 +
62812 +#include <linux/slab.h>
62813 +
62814 +static const char *bias_name(lookup_bias bias);
62815 +
62816 +/* tree searching algorithm, intranode searching algorithms are in
62817 + plugin/node/ */
62818 +
62819 +/* tree lookup cache
62820 + *
62821 + * The coord-by-key cache consists of a small list of recently accessed nodes
62822 + * maintained according to the LRU discipline. Before doing a real top-to-down
62823 + * tree traversal this cache is scanned for nodes that can contain the
62824 + * requested key.
62825 + *
62826 + * The efficiency of the coord cache depends heavily on locality of reference
62827 + * for tree accesses. Our user-level simulations show reasonably good hit
62828 + * ratios for the coord cache under most loads so far.
62829 + */
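+
+/*
+ * The look-aside discipline above, as a tiny runnable sketch: probe a
+ * handful of recently used slots before falling back to the full lookup,
+ * and keep the slots in most-recently-used order. demo_lookup and the
+ * flat array are illustrative; the real cache uses a linked LRU list of
+ * znode pointers under a guard lock.
+ */
+#if 0 /* illustrative sketch only, never compiled */
+#include <stdio.h>
+#include <string.h>
+
+#define DEMO_SLOTS 4
+
+static int slots[DEMO_SLOTS] = { -1, -1, -1, -1 };	/* MRU first */
+
+/* return 1 on a hit; either way @node ends up at the MRU position */
+static int demo_lookup(int node)
+{
+	int i, hit = 0;
+
+	for (i = 0; i < DEMO_SLOTS; i++)
+		if (slots[i] == node) {
+			hit = 1;
+			break;
+		}
+	if (i == DEMO_SLOTS)
+		i = DEMO_SLOTS - 1;	/* miss: evict the LRU slot */
+	memmove(slots + 1, slots, i * sizeof(int));
+	slots[0] = node;
+	return hit;
+}
+
+int main(void)
+{
+	printf("%d ", demo_lookup(7));
+	printf("%d ", demo_lookup(9));
+	printf("%d\n", demo_lookup(7)); /* "0 0 1": second probe of 7 hits */
+	return 0;
+}
+#endif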
62830 +
62831 +/* Initialise coord cache slot */
62832 +static void cbk_cache_init_slot(cbk_cache_slot *slot)
62833 +{
62834 + assert("nikita-345", slot != NULL);
62835 +
62836 + INIT_LIST_HEAD(&slot->lru);
62837 + slot->node = NULL;
62838 +}
62839 +
62840 +/* Initialize coord cache */
62841 +int cbk_cache_init(cbk_cache * cache/* cache to init */)
62842 +{
62843 + int i;
62844 +
62845 + assert("nikita-346", cache != NULL);
62846 +
62847 + cache->slot =
62848 + kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
62849 + reiser4_ctx_gfp_mask_get());
62850 + if (cache->slot == NULL)
62851 + return RETERR(-ENOMEM);
62852 +
62853 + INIT_LIST_HEAD(&cache->lru);
62854 + for (i = 0; i < cache->nr_slots; ++i) {
62855 + cbk_cache_init_slot(cache->slot + i);
62856 + list_add_tail(&((cache->slot + i)->lru), &cache->lru);
62857 + }
62858 + rwlock_init(&cache->guard);
62859 + return 0;
62860 +}
62861 +
62862 +/* free cbk cache data */
62863 +void cbk_cache_done(cbk_cache * cache/* cache to release */)
62864 +{
62865 + assert("nikita-2493", cache != NULL);
62866 + if (cache->slot != NULL) {
62867 + kfree(cache->slot);
62868 + cache->slot = NULL;
62869 + }
62870 +}
62871 +
62872 +/* macro to iterate over all cbk cache slots */
62873 +#define for_all_slots(cache, slot) \
62874 + for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
62875 + &(cache)->lru != &(slot)->lru; \
62876 + (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
62877 +
62878 +#if REISER4_DEBUG
62879 +/* this function assures that [cbk-cache-invariant] invariant holds */
62880 +static int cbk_cache_invariant(const cbk_cache * cache)
62881 +{
62882 + cbk_cache_slot *slot;
62883 + int result;
62884 + int unused;
62885 +
62886 + assert("nikita-2469", cache != NULL);
62887 +
62888 + if (cache->nr_slots == 0)
62889 + return 1;
62890 + unused = 0;
62891 + result = 1;
62892 + read_lock(&((cbk_cache *)cache)->guard);
62893 + for_all_slots(cache, slot) {
62894 + /* in LRU first go all `used' slots followed by `unused' */
62895 + if (unused && (slot->node != NULL))
62896 + result = 0;
62897 + if (slot->node == NULL)
62898 + unused = 1;
62899 + else {
62900 + cbk_cache_slot *scan;
62901 +
62902 + /* all cached nodes are different */
62903 + scan = slot;
62904 + while (result) {
62905 + scan = list_entry(scan->lru.next,
62906 + cbk_cache_slot, lru);
62907 + if (&cache->lru == &scan->lru)
62908 + break;
62909 + if (slot->node == scan->node)
62910 + result = 0;
62911 + }
62912 + }
62913 + if (!result)
62914 + break;
62915 + }
62916 + read_unlock(&((cbk_cache *)cache)->guard);
62917 + return result;
62918 +}
62919 +
62920 +#endif
62921 +
62922 +/* Remove references, if any, to @node from coord cache */
62923 +void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
62924 + reiser4_tree * tree/* tree to remove node from */)
62925 +{
62926 + cbk_cache_slot *slot;
62927 + cbk_cache *cache;
62928 + int i;
62929 +
62930 + assert("nikita-350", node != NULL);
62931 + assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
62932 +
62933 + cache = &tree->cbk_cache;
62934 + assert("nikita-2470", cbk_cache_invariant(cache));
62935 +
62936 + write_lock(&(cache->guard));
62937 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
62938 + if (slot->node == node) {
62939 + list_move_tail(&slot->lru, &cache->lru);
62940 + slot->node = NULL;
62941 + break;
62942 + }
62943 + }
62944 + write_unlock(&(cache->guard));
62945 + assert("nikita-2471", cbk_cache_invariant(cache));
62946 +}
62947 +
62948 +/* add information about "node" to the cbk-cache of "tree". This
62949 + can actually be an update of an existing slot in the cache. */
62950 +static void cbk_cache_add(const znode * node/* node to add to the cache */)
62951 +{
62952 + cbk_cache *cache;
62953 +
62954 + cbk_cache_slot *slot;
62955 + int i;
62956 +
62957 + assert("nikita-352", node != NULL);
62958 +
62959 + cache = &znode_get_tree(node)->cbk_cache;
62960 + assert("nikita-2472", cbk_cache_invariant(cache));
62961 +
62962 + if (cache->nr_slots == 0)
62963 + return;
62964 +
62965 + write_lock(&(cache->guard));
62966 + /* find slot to update/add */
62967 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
62968 + /* oops, this node is already in a cache */
62969 + if (slot->node == node)
62970 + break;
62971 + }
62972 + /* if all slots are used, reuse least recently used one */
62973 + if (i == cache->nr_slots) {
62974 + slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
62975 + slot->node = (znode *) node;
62976 + }
62977 + list_move(&slot->lru, &cache->lru);
62978 + write_unlock(&(cache->guard));
62979 + assert("nikita-2473", cbk_cache_invariant(cache));
62980 +}
62981 +
62982 +static int setup_delimiting_keys(cbk_handle * h);
62983 +static lookup_result coord_by_handle(cbk_handle * handle);
62984 +static lookup_result traverse_tree(cbk_handle * h);
62985 +static int cbk_cache_search(cbk_handle * h);
62986 +
62987 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
62988 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
62989 +
62990 +/* helper functions */
62991 +
62992 +static void update_stale_dk(reiser4_tree * tree, znode * node);
62993 +
62994 +/* release parent node during traversal */
62995 +static void put_parent(cbk_handle * h);
62996 +/* check consistency of fields */
62997 +static int sanity_check(cbk_handle * h);
62998 +/* release resources in handle */
62999 +static void hput(cbk_handle * h);
63000 +
63001 +static level_lookup_result search_to_left(cbk_handle * h);
63002 +
63003 +/* pack numerous (numberous I should say) arguments of coord_by_key() into
63004 + * cbk_handle */
63005 +static cbk_handle *cbk_pack(cbk_handle * handle,
63006 + reiser4_tree * tree,
63007 + const reiser4_key * key,
63008 + coord_t *coord,
63009 + lock_handle * active_lh,
63010 + lock_handle * parent_lh,
63011 + znode_lock_mode lock_mode,
63012 + lookup_bias bias,
63013 + tree_level lock_level,
63014 + tree_level stop_level,
63015 + __u32 flags, ra_info_t *info)
63016 +{
63017 + memset(handle, 0, sizeof *handle);
63018 +
63019 + handle->tree = tree;
63020 + handle->key = key;
63021 + handle->lock_mode = lock_mode;
63022 + handle->bias = bias;
63023 + handle->lock_level = lock_level;
63024 + handle->stop_level = stop_level;
63025 + handle->coord = coord;
63026 + /* set flags. See comment in tree.h:cbk_flags */
63027 + handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
63028 +
63029 + handle->active_lh = active_lh;
63030 + handle->parent_lh = parent_lh;
63031 + handle->ra_info = info;
63032 + return handle;
63033 +}
63034 +
63035 +/* main tree lookup procedure
63036 +
63037 + Check the coord cache. If the key we are looking for is not found there,
63038 + call cbk() to do a real tree traversal.
63039 +
63040 + As we have extents on the twig level, @lock_level and @stop_level can
63041 + be different from LEAF_LEVEL and each other.
63042 +
63043 + Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
63044 + long term locks) while calling this.
63045 +*/
63046 +lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
63047 + * in. Usually this tree is
63048 + * part of file-system
63049 + * super-block */ ,
63050 + const reiser4_key * key /* key to look for */ ,
63051 + coord_t *coord /* where to store found
63052 + * position in a tree. Fields
63053 + * in "coord" are only valid if
63054 + * coord_by_key() returned
63055 + * "CBK_COORD_FOUND" */ ,
63056 + lock_handle * lh, /* resulting lock handle */
63057 + znode_lock_mode lock_mode /* type of lookup we
63058 + * want on node. Pass
63059 + * ZNODE_READ_LOCK here
63060 + * if you only want to
63061 + * read item found and
63062 + * ZNODE_WRITE_LOCK if
63063 + * you want to modify
63064 + * it */ ,
63065 + lookup_bias bias /* what to return if coord
63066 + * with exactly the @key is
63067 + * not in the tree */ ,
63068 + tree_level lock_level/* tree level where to start
63069 + * taking @lock type of
63070 + * locks */ ,
63071 + tree_level stop_level/* tree level to stop. Pass
63072 + * LEAF_LEVEL or TWIG_LEVEL
63073 + * here. The item being looked
63074 + * for has to be between
63075 + * @lock_level and
63076 + * @stop_level, inclusive */ ,
63077 + __u32 flags /* search flags */ ,
63078 + ra_info_t *
63079 + info
63080 + /* information about desired tree traversal
63081 + * readahead */
63082 + )
63083 +{
63084 + cbk_handle handle;
63085 + lock_handle parent_lh;
63086 + lookup_result result;
63087 +
63088 + init_lh(lh);
63089 + init_lh(&parent_lh);
63090 +
63091 + assert("nikita-3023", reiser4_schedulable());
63092 +
63093 + assert("nikita-353", tree != NULL);
63094 + assert("nikita-354", key != NULL);
63095 + assert("nikita-355", coord != NULL);
63096 + assert("nikita-356", (bias == FIND_EXACT)
63097 + || (bias == FIND_MAX_NOT_MORE_THAN));
63098 + assert("nikita-357", stop_level >= LEAF_LEVEL);
63099 + /* no locks can be held during tree traversal */
63100 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
63101 +
63102 + cbk_pack(&handle,
63103 + tree,
63104 + key,
63105 + coord,
63106 + lh,
63107 + &parent_lh,
63108 + lock_mode, bias, lock_level, stop_level, flags, info);
63109 +
63110 + result = coord_by_handle(&handle);
63111 + assert("nikita-3247",
63112 + ergo(!IS_CBKERR(result), coord->node == lh->node));
63113 + return result;
63114 +}
63115 +
63116 +/* like coord_by_key(), but starts traversal from vroot of @object rather than
63117 + * from tree root. */
63118 +lookup_result reiser4_object_lookup(struct inode *object,
63119 + const reiser4_key * key,
63120 + coord_t *coord,
63121 + lock_handle * lh,
63122 + znode_lock_mode lock_mode,
63123 + lookup_bias bias,
63124 + tree_level lock_level,
63125 + tree_level stop_level, __u32 flags,
63126 + ra_info_t *info)
63127 +{
63128 + cbk_handle handle;
63129 + lock_handle parent_lh;
63130 + lookup_result result;
63131 +
63132 + init_lh(lh);
63133 + init_lh(&parent_lh);
63134 +
63135 + assert("nikita-3023", reiser4_schedulable());
63136 +
63137 + assert("nikita-354", key != NULL);
63138 + assert("nikita-355", coord != NULL);
63139 + assert("nikita-356", (bias == FIND_EXACT)
63140 + || (bias == FIND_MAX_NOT_MORE_THAN));
63141 + assert("nikita-357", stop_level >= LEAF_LEVEL);
63142 + /* no locks can be held during tree search by key */
63143 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
63144 +
63145 + cbk_pack(&handle,
63146 + object != NULL ? reiser4_tree_by_inode(object) : current_tree,
63147 + key,
63148 + coord,
63149 + lh,
63150 + &parent_lh,
63151 + lock_mode, bias, lock_level, stop_level, flags, info);
63152 + handle.object = object;
63153 +
63154 + result = coord_by_handle(&handle);
63155 + assert("nikita-3247",
63156 + ergo(!IS_CBKERR(result), coord->node == lh->node));
63157 + return result;
63158 +}
63159 +
63160 +/* lookup by cbk_handle. Common part of coord_by_key() and
63161 + reiser4_object_lookup(). */
63162 +static lookup_result coord_by_handle(cbk_handle * handle)
63163 +{
63164 + /*
63165 + * first check the cbk_cache (which is a look-aside cache for our tree) and
63166 + * if this fails, start traversal.
63167 + */
63168 + /* first check whether "key" is in cache of recent lookups. */
63169 + if (cbk_cache_search(handle) == 0)
63170 + return handle->result;
63171 + else
63172 + return traverse_tree(handle);
63173 +}
63174 +
63175 +/* Execute actor for each item (or unit, depending on @through_units_p),
63176 + starting from @coord, right-ward, until either:
63177 +
63178 + - end of the tree is reached
63179 + - unformatted node is met
63180 + - error occurred
63181 + - @actor returns 0 or less
63182 +
63183 + An error code, or the last actor return value, is returned.
63184 +
63185 + This is used by plugin/dir/hashed_dir.c:reiser4_find_entry() to move through
63186 + a sequence of entries with identical keys and the like.
63187 +*/
63188 +int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
63189 + coord_t *coord /* coord to start from */ ,
63190 + lock_handle * lh /* lock handle to start with and to
63191 + * update along the way */ ,
63192 + tree_iterate_actor_t actor /* function to call on each
63193 + * item/unit */ ,
63194 + void *arg /* argument to pass to @actor */ ,
63195 + znode_lock_mode mode /* lock mode on scanned nodes */ ,
63196 + int through_units_p /* call @actor on each item or on
63197 + * each unit */ )
63198 +{
63199 + int result;
63200 +
63201 + assert("nikita-1143", tree != NULL);
63202 + assert("nikita-1145", coord != NULL);
63203 + assert("nikita-1146", lh != NULL);
63204 + assert("nikita-1147", actor != NULL);
63205 +
63206 + result = zload(coord->node);
63207 + coord_clear_iplug(coord);
63208 + if (result != 0)
63209 + return result;
63210 + if (!coord_is_existing_unit(coord)) {
63211 + zrelse(coord->node);
63212 + return -ENOENT;
63213 + }
63214 + while ((result = actor(tree, coord, lh, arg)) > 0) {
63215 + /* move further */
63216 + if ((through_units_p && coord_next_unit(coord)) ||
63217 + (!through_units_p && coord_next_item(coord))) {
63218 + do {
63219 + lock_handle couple;
63220 +
63221 + /* move to the next node */
63222 + init_lh(&couple);
63223 + result =
63224 + reiser4_get_right_neighbor(&couple,
63225 + coord->node,
63226 + (int)mode,
63227 + GN_CAN_USE_UPPER_LEVELS);
63228 + zrelse(coord->node);
63229 + if (result == 0) {
63230 +
63231 + result = zload(couple.node);
63232 + if (result != 0) {
63233 + done_lh(&couple);
63234 + return result;
63235 + }
63236 +
63237 + coord_init_first_unit(coord,
63238 + couple.node);
63239 + done_lh(lh);
63240 + move_lh(lh, &couple);
63241 + } else
63242 + return result;
63243 + } while (node_is_empty(coord->node));
63244 + }
63245 +
63246 + assert("nikita-1149", coord_is_existing_unit(coord));
63247 + }
63248 + zrelse(coord->node);
63249 + return result;
63250 +}
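
For illustration only, here is a minimal sketch of an actor that counts a run of duplicate entries, in the spirit of the reiser4_find_entry() usage mentioned above. The context struct and the actor name are ours, not part of the patch; only the actor calling convention (tree, coord, lh, arg) is taken from the code above.

struct count_ctx {
	reiser4_key key;	/* key whose duplicates we count */
	int count;		/* output: number of matching units */
};

/* hypothetical actor: keep going right while units match ctx->key */
static int count_units_actor(reiser4_tree *tree, coord_t *coord,
			     lock_handle *lh, void *arg)
{
	struct count_ctx *ctx = arg;
	reiser4_key key;

	unit_key_by_coord(coord, &key);
	if (!keyeq(&key, &ctx->key))
		return 0;	/* past the run of duplicates: stop */
	ctx->count++;
	return 1;		/* positive result: continue rightward */
}

A caller would position @coord on the first matching unit (e.g. via coord_by_key()), then invoke reiser4_iterate_tree(tree, &coord, &lh, count_units_actor, &ctx, ZNODE_READ_LOCK, 1) to visit each unit.
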
63251 +
63252 +/* return locked uber znode for @tree */
63253 +int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
63254 + znode_lock_request pri, lock_handle * lh)
63255 +{
63256 + int result;
63257 +
63258 + result = longterm_lock_znode(lh, tree->uber, mode, pri);
63259 + return result;
63260 +}
63261 +
63262 +/* true if @key is strictly within @node
63263 +
63264 +   We are looking for a possibly non-unique key, and the matching item is at
63265 +   the edge of @node. Maybe it is actually in the neighbor.
63266 +*/
63267 +static int znode_contains_key_strict(znode * node /* node to check key
63268 + * against */ ,
63269 + const reiser4_key *
63270 + key /* key to check */ ,
63271 + int isunique)
63272 +{
63273 + int answer;
63274 +
63275 + assert("nikita-1760", node != NULL);
63276 + assert("nikita-1722", key != NULL);
63277 +
63278 + if (keyge(key, &node->rd_key))
63279 + return 0;
63280 +
63281 + answer = keycmp(&node->ld_key, key);
63282 +
63283 + if (isunique)
63284 + return answer != GREATER_THAN;
63285 + else
63286 + return answer == LESS_THAN;
63287 +}
63288 +
63289 +/*
63290 + * Virtual Root (vroot) code.
63291 + *
63292 + * For a given file system object (e.g., a regular file or directory) let's
63293 + * define its "virtual root" as the lowest node in the tree (that is, the
63294 + * node furthest from the tree root) such that all body items of said
63295 + * object are located in a tree rooted at this node.
63296 + *
63297 + * Once the vroot of an object is found, all tree lookups for items within
63298 + * the body of this object ("object lookups") can be started from its
63299 + * vroot rather than from the real root. This has the following advantages:
63300 + *
63301 + * 1. the number of nodes traversed during lookup (and, hence, the number
63302 + * of key comparisons made) decreases, and
63303 + *
63304 + * 2. contention on the tree root is decreased. The latter was actually
63305 + * the motivating reason behind vroot, because the spin lock of the root
63306 + * node, which is taken when acquiring a long-term lock on the root node,
63307 + * is the hottest lock in reiser4.
63308 + *
63309 + * How to find the vroot.
63310 + *
63311 + * When the vroot of object F is not yet determined, all object lookups
63312 + * start from the root of the tree. At each tree level during traversal
63313 + * we have a node N such that the key we are looking for (which is a key
63314 + * inside the object's body) is located within N. In the function
63315 + * handle_vroot(), called from cbk_level_lookup(), we check whether N is
63316 + * a possible vroot for F. The check is trivial---if neither the leftmost
63317 + * nor the rightmost item of N belongs to F (and we already have the
63318 + * helpful ->owns_item() method of the object plugin for this), then N is
63319 + * a possible vroot of F. This, of course, relies on the assumption that
63320 + * each object occupies a contiguous range of keys in the tree.
63321 + *
63322 + * Thus, traversing the tree downward and checking each node as we go, we
63323 + * can find the lowest such node, which, by definition, is the vroot.
63324 + *
63325 + * How to track the vroot.
63326 + *
63327 + * Nohow. If the actual vroot changes, the next object lookup will simply
63328 + * restart from the actual tree root, refreshing the object's vroot
63329 + * along the way.
63330 + */
63331 +
63332 +/*
63333 + * Check whether @node is possible vroot of @object.
63334 + */
63335 +static void handle_vroot(struct inode *object, znode * node)
63336 +{
63337 + file_plugin *fplug;
63338 + coord_t coord;
63339 +
63340 + fplug = inode_file_plugin(object);
63341 + assert("nikita-3353", fplug != NULL);
63342 + assert("nikita-3354", fplug->owns_item != NULL);
63343 +
63344 + if (unlikely(node_is_empty(node)))
63345 + return;
63346 +
63347 + coord_init_first_unit(&coord, node);
63348 +	/*
63349 +	 * if the leftmost item of @node belongs to @object, we cannot be sure
63350 +	 * that @node is the vroot of @object, because some items of @object
63351 +	 * are probably in the sub-tree rooted at the left neighbor of @node.
63352 +	 */
63353 + if (fplug->owns_item(object, &coord))
63354 + return;
63355 + coord_init_last_unit(&coord, node);
63356 + /* mutatis mutandis for the rightmost item */
63357 + if (fplug->owns_item(object, &coord))
63358 + return;
63359 + /* otherwise, @node is possible vroot of @object */
63360 + inode_set_vroot(object, node);
63361 +}
63362 +
63363 +/*
63364 + * helper function used by traverse_tree() to start the tree traversal not
63365 + * from the tree root, but from @h->object's vroot, if possible.
63366 + */
63367 +static int prepare_object_lookup(cbk_handle * h)
63368 +{
63369 + znode *vroot;
63370 + int result;
63371 +
63372 + vroot = inode_get_vroot(h->object);
63373 + if (vroot == NULL) {
63374 + /*
63375 + * object doesn't have known vroot, start from real tree root.
63376 + */
63377 + return LOOKUP_CONT;
63378 + }
63379 +
63380 + h->level = znode_get_level(vroot);
63381 + /* take a long-term lock on vroot */
63382 + h->result = longterm_lock_znode(h->active_lh, vroot,
63383 + cbk_lock_mode(h->level, h),
63384 + ZNODE_LOCK_LOPRI);
63385 + result = LOOKUP_REST;
63386 + if (h->result == 0) {
63387 + int isunique;
63388 + int inside;
63389 +
63390 + isunique = h->flags & CBK_UNIQUE;
63391 + /* check that key is inside vroot */
63392 + read_lock_dk(h->tree);
63393 + inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
63394 + !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
63395 + read_unlock_dk(h->tree);
63396 + if (inside) {
63397 + h->result = zload(vroot);
63398 + if (h->result == 0) {
63399 + /* search for key in vroot. */
63400 + result = cbk_node_lookup(h);
63401 + zrelse(vroot); /*h->active_lh->node); */
63402 + if (h->active_lh->node != vroot) {
63403 + result = LOOKUP_REST;
63404 + } else if (result == LOOKUP_CONT) {
63405 + move_lh(h->parent_lh, h->active_lh);
63406 + h->flags &= ~CBK_DKSET;
63407 + }
63408 + }
63409 + }
63410 + }
63411 +
63412 + zput(vroot);
63413 +
63414 + if (IS_CBKERR(h->result) || result == LOOKUP_REST)
63415 + hput(h);
63416 + return result;
63417 +}
63418 +
63419 +/* main function that handles common parts of tree traversal: starting
63420 + (fake znode handling), restarts, error handling, completion */
63421 +static lookup_result traverse_tree(cbk_handle * h/* search handle */)
63422 +{
63423 + int done;
63424 + int iterations;
63425 + int vroot_used;
63426 +
63427 + assert("nikita-365", h != NULL);
63428 + assert("nikita-366", h->tree != NULL);
63429 + assert("nikita-367", h->key != NULL);
63430 + assert("nikita-368", h->coord != NULL);
63431 + assert("nikita-369", (h->bias == FIND_EXACT)
63432 + || (h->bias == FIND_MAX_NOT_MORE_THAN));
63433 + assert("nikita-370", h->stop_level >= LEAF_LEVEL);
63434 + assert("nikita-2949", !(h->flags & CBK_DKSET));
63435 + assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
63436 +
63437 + done = 0;
63438 + iterations = 0;
63439 + vroot_used = 0;
63440 +
63441 + /* loop for restarts */
63442 +restart:
63443 +
63444 + assert("nikita-3024", reiser4_schedulable());
63445 +
63446 + h->result = CBK_COORD_FOUND;
63447 + /* connect_znode() needs it */
63448 + h->ld_key = *reiser4_min_key();
63449 + h->rd_key = *reiser4_max_key();
63450 + h->flags |= CBK_DKSET;
63451 + h->error = NULL;
63452 +
63453 + if (!vroot_used && h->object != NULL) {
63454 + vroot_used = 1;
63455 + done = prepare_object_lookup(h);
63456 + if (done == LOOKUP_REST)
63457 + goto restart;
63458 + else if (done == LOOKUP_DONE)
63459 + return h->result;
63460 + }
63461 + if (h->parent_lh->node == NULL) {
63462 + done =
63463 + get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
63464 + h->parent_lh);
63465 +
63466 + assert("nikita-1637", done != -E_DEADLOCK);
63467 +
63468 + h->block = h->tree->root_block;
63469 + h->level = h->tree->height;
63470 + h->coord->node = h->parent_lh->node;
63471 +
63472 + if (done != 0)
63473 + return done;
63474 + }
63475 +
63476 + /* loop descending a tree */
63477 + while (!done) {
63478 +
63479 + if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
63480 + IS_POW(iterations))) {
63481 + warning("nikita-1481", "Too many iterations: %i",
63482 + iterations);
63483 + reiser4_print_key("key", h->key);
63484 + ++iterations;
63485 + } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
63486 + h->error =
63487 + "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
63488 + h->result = RETERR(-EIO);
63489 + break;
63490 + }
63491 + switch (cbk_level_lookup(h)) {
63492 + case LOOKUP_CONT:
63493 + move_lh(h->parent_lh, h->active_lh);
63494 + continue;
63495 + default:
63496 + wrong_return_value("nikita-372", "cbk_level");
63497 + case LOOKUP_DONE:
63498 + done = 1;
63499 + break;
63500 + case LOOKUP_REST:
63501 + hput(h);
63502 + /* deadlock avoidance is normal case. */
63503 + if (h->result != -E_DEADLOCK)
63504 + ++iterations;
63505 + reiser4_preempt_point();
63506 + goto restart;
63507 + }
63508 + }
63509 + /* that's all. The rest is error handling */
63510 + if (unlikely(h->error != NULL)) {
63511 + warning("nikita-373", "%s: level: %i, "
63512 + "lock_level: %i, stop_level: %i "
63513 + "lock_mode: %s, bias: %s",
63514 + h->error, h->level, h->lock_level, h->stop_level,
63515 + lock_mode_name(h->lock_mode), bias_name(h->bias));
63516 + reiser4_print_address("block", &h->block);
63517 + reiser4_print_key("key", h->key);
63518 + print_coord_content("coord", h->coord);
63519 + }
63520 + /* `unlikely' error case */
63521 + if (unlikely(IS_CBKERR(h->result))) {
63522 + /* failure. do cleanup */
63523 + hput(h);
63524 + } else {
63525 + assert("nikita-1605", WITH_DATA_RET
63526 + (h->coord->node, 1,
63527 + ergo((h->result == CBK_COORD_FOUND) &&
63528 + (h->bias == FIND_EXACT) &&
63529 + (!node_is_empty(h->coord->node)),
63530 + coord_is_existing_item(h->coord))));
63531 + }
63532 + return h->result;
63533 +}
63534 +
63535 +/* find delimiting keys of child
63536 +
63537 + Determine left and right delimiting keys for child pointed to by
63538 + @parent_coord.
63539 +
63540 +*/
63541 +static void find_child_delimiting_keys(znode * parent /* parent znode, passed
63542 + * locked */ ,
63543 + const coord_t *parent_coord
63544 + /* coord where pointer
63545 + * to child is stored
63546 + */ ,
63547 + reiser4_key * ld /* where to store left
63548 + * delimiting key */ ,
63549 + reiser4_key * rd /* where to store right
63550 + * delimiting key */ )
63551 +{
63552 + coord_t neighbor;
63553 +
63554 + assert("nikita-1484", parent != NULL);
63555 + assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
63556 +
63557 + coord_dup(&neighbor, parent_coord);
63558 +
63559 + if (neighbor.between == AT_UNIT)
63560 + /* imitate item ->lookup() behavior. */
63561 + neighbor.between = AFTER_UNIT;
63562 +
63563 + if (coord_set_to_left(&neighbor) == 0)
63564 + unit_key_by_coord(&neighbor, ld);
63565 + else {
63566 + assert("nikita-14851", 0);
63567 + *ld = *znode_get_ld_key(parent);
63568 + }
63569 +
63570 + coord_dup(&neighbor, parent_coord);
63571 + if (neighbor.between == AT_UNIT)
63572 + neighbor.between = AFTER_UNIT;
63573 + if (coord_set_to_right(&neighbor) == 0)
63574 + unit_key_by_coord(&neighbor, rd);
63575 + else
63576 + *rd = *znode_get_rd_key(parent);
63577 +}
63578 +
63579 +/*
63580 + * setup delimiting keys for a child
63581 + *
63582 + * @parent parent node
63583 + *
63584 + * @coord location in @parent where pointer to @child is
63585 + *
63586 + * @child child node
63587 + */
63588 +int
63589 +set_child_delimiting_keys(znode * parent, const coord_t *coord, znode * child)
63590 +{
63591 + reiser4_tree *tree;
63592 +
63593 + assert("nikita-2952",
63594 + znode_get_level(parent) == znode_get_level(coord->node));
63595 +
63596 + /* fast check without taking dk lock. This is safe, because
63597 + * JNODE_DKSET is never cleared once set. */
63598 + if (!ZF_ISSET(child, JNODE_DKSET)) {
63599 + tree = znode_get_tree(parent);
63600 + write_lock_dk(tree);
63601 + if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
63602 + find_child_delimiting_keys(parent, coord,
63603 + &child->ld_key,
63604 + &child->rd_key);
63605 + ON_DEBUG(child->ld_key_version =
63606 + atomic_inc_return(&delim_key_version);
63607 + child->rd_key_version =
63608 + atomic_inc_return(&delim_key_version););
63609 + ZF_SET(child, JNODE_DKSET);
63610 + }
63611 + write_unlock_dk(tree);
63612 + return 1;
63613 + }
63614 + return 0;
63615 +}
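
The unlocked ZF_ISSET() fast path above (and the identical one in setup_delimiting_keys() further down) is an instance of double-checked locking; it is safe only because JNODE_DKSET transitions one way. A generic sketch of the shape, with hypothetical names:

/* sketch, not part of the patch: one-way flag, double-checked */
if (!flag_is_set(obj)) {		/* cheap, lockless peek */
	lock(&guard);
	if (!flag_is_set(obj)) {	/* authoritative re-check */
		do_one_time_init(obj);
		set_flag(obj);		/* set once, never cleared */
	}
	unlock(&guard);
}
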
63616 +
63617 +/* Perform tree lookup at one level. This is called from the cbk_traverse()
63618 +   function that drives the lookup through the tree, and calls
63619 +   cbk_node_lookup() to perform the lookup within one node.
63620 +
63621 +   See the comments in the code.
63622 +*/
63623 +static level_lookup_result cbk_level_lookup(cbk_handle * h/* search handle */)
63624 +{
63625 + int ret;
63626 + int setdk;
63627 + int ldkeyset = 0;
63628 + reiser4_key ldkey;
63629 + reiser4_key key;
63630 + znode *active;
63631 +
63632 + assert("nikita-3025", reiser4_schedulable());
63633 +
63634 + /* acquire reference to @active node */
63635 + active =
63636 + zget(h->tree, &h->block, h->parent_lh->node, h->level,
63637 + reiser4_ctx_gfp_mask_get());
63638 +
63639 + if (IS_ERR(active)) {
63640 + h->result = PTR_ERR(active);
63641 + return LOOKUP_DONE;
63642 + }
63643 +
63644 + /* lock @active */
63645 + h->result = longterm_lock_znode(h->active_lh,
63646 + active,
63647 + cbk_lock_mode(h->level, h),
63648 + ZNODE_LOCK_LOPRI);
63649 + /* longterm_lock_znode() acquires additional reference to znode (which
63650 + will be later released by longterm_unlock_znode()). Release
63651 + reference acquired by zget().
63652 + */
63653 + zput(active);
63654 + if (unlikely(h->result != 0))
63655 + goto fail_or_restart;
63656 +
63657 + setdk = 0;
63658 + /* if @active is accessed for the first time, setup delimiting keys on
63659 + it. Delimiting keys are taken from the parent node. See
63660 + setup_delimiting_keys() for details.
63661 + */
63662 + if (h->flags & CBK_DKSET) {
63663 + setdk = setup_delimiting_keys(h);
63664 + h->flags &= ~CBK_DKSET;
63665 + } else {
63666 + znode *parent;
63667 +
63668 + parent = h->parent_lh->node;
63669 + h->result = zload(parent);
63670 + if (unlikely(h->result != 0))
63671 + goto fail_or_restart;
63672 +
63673 + if (!ZF_ISSET(active, JNODE_DKSET))
63674 + setdk = set_child_delimiting_keys(parent,
63675 + h->coord, active);
63676 + else {
63677 + read_lock_dk(h->tree);
63678 + find_child_delimiting_keys(parent, h->coord, &ldkey,
63679 + &key);
63680 + read_unlock_dk(h->tree);
63681 + ldkeyset = 1;
63682 + }
63683 + zrelse(parent);
63684 + }
63685 +
63686 +	/* this is an ugly kludge. Reminder: it is necessary because the
63687 +	   ->lookup() method returns a coord with the ->between field possibly
63688 +	   set to something different from AT_UNIT.
63689 +	 */
63690 + h->coord->between = AT_UNIT;
63691 +
63692 + if (znode_just_created(active) && (h->coord->node != NULL)) {
63693 + write_lock_tree(h->tree);
63694 + /* if we are going to load znode right now, setup
63695 + ->in_parent: coord where pointer to this node is stored in
63696 + parent.
63697 + */
63698 + coord_to_parent_coord(h->coord, &active->in_parent);
63699 + write_unlock_tree(h->tree);
63700 + }
63701 +
63702 +	/* check connectedness without holding the tree lock---false negatives
63703 +	 * will be re-checked by connect_znode(), and false positives are
63704 +	 * impossible---@active cannot suddenly transition into a
63705 +	 * disconnected state. */
63706 + if (!znode_is_connected(active)) {
63707 + h->result = connect_znode(h->coord, active);
63708 + if (unlikely(h->result != 0)) {
63709 + put_parent(h);
63710 + goto fail_or_restart;
63711 + }
63712 + }
63713 +
63714 + jload_prefetch(ZJNODE(active));
63715 +
63716 + if (setdk)
63717 + update_stale_dk(h->tree, active);
63718 +
63719 +	/* put_parent() cannot be called earlier, because connect_znode()
63720 +	   assumes the parent node is referenced. */
63721 + put_parent(h);
63722 +
63723 + if ((!znode_contains_key_lock(active, h->key) &&
63724 + (h->flags & CBK_TRUST_DK))
63725 + || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
63726 + /* 1. key was moved out of this node while this thread was
63727 + waiting for the lock. Restart. More elaborate solution is
63728 + to determine where key moved (to the left, or to the right)
63729 + and try to follow it through sibling pointers.
63730 +
63731 + 2. or, node itself is going to be removed from the
63732 + tree. Release lock and restart.
63733 + */
63734 + h->result = -E_REPEAT;
63735 + }
63736 + if (h->result == -E_REPEAT)
63737 + return LOOKUP_REST;
63738 +
63739 + h->result = zload_ra(active, h->ra_info);
63740 + if (h->result)
63741 + return LOOKUP_DONE;
63742 +
63743 + /* sanity checks */
63744 + if (sanity_check(h)) {
63745 + zrelse(active);
63746 + return LOOKUP_DONE;
63747 + }
63748 +
63749 +	/* check that the key of the leftmost item in @active is the same as
63750 +	 * in its parent */
63751 + if (ldkeyset && !node_is_empty(active) &&
63752 + !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
63753 + warning("vs-3533", "Keys are inconsistent. Fsck?");
63754 + reiser4_print_key("inparent", &ldkey);
63755 + reiser4_print_key("inchild", &key);
63756 + h->result = RETERR(-EIO);
63757 + zrelse(active);
63758 + return LOOKUP_DONE;
63759 + }
63760 +
63761 + if (h->object != NULL)
63762 + handle_vroot(h->object, active);
63763 +
63764 + ret = cbk_node_lookup(h);
63765 +
63766 + /* h->active_lh->node might change, but active is yet to be zrelsed */
63767 + zrelse(active);
63768 +
63769 + return ret;
63770 +
63771 +fail_or_restart:
63772 + if (h->result == -E_DEADLOCK)
63773 + return LOOKUP_REST;
63774 + return LOOKUP_DONE;
63775 +}
63776 +
63777 +#if REISER4_DEBUG
63778 +/* check left and right delimiting keys of a znode */
63779 +void check_dkeys(znode * node)
63780 +{
63781 + znode *left;
63782 + znode *right;
63783 +
63784 + read_lock_tree(current_tree);
63785 + read_lock_dk(current_tree);
63786 +
63787 + assert("vs-1710", znode_is_any_locked(node));
63788 + assert("vs-1197",
63789 + !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
63790 +
63791 + left = node->left;
63792 + right = node->right;
63793 +
63794 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63795 + && left != NULL && ZF_ISSET(left, JNODE_DKSET))
63796 +		/* check the left neighbor. Note that the left neighbor is not
63797 +		   locked, so its delimiting keys might therefore be stale */
63798 + assert("vs-1198",
63799 + (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
63800 + || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
63801 +
63802 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63803 + && right != NULL && ZF_ISSET(right, JNODE_DKSET))
63804 +		/* check the right neighbor. Note that the right neighbor is
63805 +		   not locked, so its delimiting keys might therefore be stale */
63806 + assert("vs-1199",
63807 + (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
63808 + || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
63809 +
63810 + read_unlock_dk(current_tree);
63811 + read_unlock_tree(current_tree);
63812 +}
63813 +#endif
63814 +
63815 +/* true if @key is left delimiting key of @node */
63816 +static int key_is_ld(znode * node, const reiser4_key * key)
63817 +{
63818 + int ld;
63819 +
63820 + assert("nikita-1716", node != NULL);
63821 + assert("nikita-1758", key != NULL);
63822 +
63823 + read_lock_dk(znode_get_tree(node));
63824 + assert("nikita-1759", znode_contains_key(node, key));
63825 + ld = keyeq(znode_get_ld_key(node), key);
63826 + read_unlock_dk(znode_get_tree(node));
63827 + return ld;
63828 +}
63829 +
63830 +/* Process one node during tree traversal.
63831 +
63832 + This is called by cbk_level_lookup(). */
63833 +static level_lookup_result cbk_node_lookup(cbk_handle * h/* search handle */)
63834 +{
63835 + /* node plugin of @active */
63836 + node_plugin *nplug;
63837 + /* item plugin of item that was found */
63838 + item_plugin *iplug;
63839 + /* search bias */
63840 + lookup_bias node_bias;
63841 + /* node we are operating upon */
63842 + znode *active;
63843 + /* tree we are searching in */
63844 + reiser4_tree *tree;
63845 + /* result */
63846 + int result;
63847 +
63848 + assert("nikita-379", h != NULL);
63849 +
63850 + active = h->active_lh->node;
63851 + tree = h->tree;
63852 +
63853 + nplug = active->nplug;
63854 + assert("nikita-380", nplug != NULL);
63855 +
63856 + ON_DEBUG(check_dkeys(active));
63857 +
63858 + /* return item from "active" node with maximal key not greater than
63859 + "key" */
63860 + node_bias = h->bias;
63861 + result = nplug->lookup(active, h->key, node_bias, h->coord);
63862 + if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
63863 + /* error occurred */
63864 + h->result = result;
63865 + return LOOKUP_DONE;
63866 + }
63867 + if (h->level == h->stop_level) {
63868 + /* welcome to the stop level */
63869 + assert("nikita-381", h->coord->node == active);
63870 + if (result == NS_FOUND) {
63871 + /* success of tree lookup */
63872 + if (!(h->flags & CBK_UNIQUE)
63873 + && key_is_ld(active, h->key))
63874 + return search_to_left(h);
63875 + else
63876 + h->result = CBK_COORD_FOUND;
63877 + } else {
63878 + h->result = CBK_COORD_NOTFOUND;
63879 + }
63880 + if (!(h->flags & CBK_IN_CACHE))
63881 + cbk_cache_add(active);
63882 + return LOOKUP_DONE;
63883 + }
63884 +
63885 + if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
63886 + h->error = "not found on internal node";
63887 + h->result = result;
63888 + return LOOKUP_DONE;
63889 + }
63890 +
63891 + assert("vs-361", h->level > h->stop_level);
63892 +
63893 + if (handle_eottl(h, &result)) {
63894 + assert("vs-1674", (result == LOOKUP_DONE ||
63895 + result == LOOKUP_REST));
63896 + return result;
63897 + }
63898 +
63899 + /* go down to next level */
63900 + check_me("vs-12", zload(h->coord->node) == 0);
63901 + assert("nikita-2116", item_is_internal(h->coord));
63902 + iplug = item_plugin_by_coord(h->coord);
63903 + iplug->s.internal.down_link(h->coord, h->key, &h->block);
63904 + zrelse(h->coord->node);
63905 + --h->level;
63906 + return LOOKUP_CONT; /* continue */
63907 +}
63908 +
63909 +/* scan cbk_cache slots looking for a match for @h */
63910 +static int cbk_cache_scan_slots(cbk_handle * h/* cbk handle */)
63911 +{
63912 + level_lookup_result llr;
63913 + znode *node;
63914 + reiser4_tree *tree;
63915 + cbk_cache_slot *slot;
63916 + cbk_cache *cache;
63917 + tree_level level;
63918 + int isunique;
63919 + const reiser4_key *key;
63920 + int result;
63921 +
63922 + assert("nikita-1317", h != NULL);
63923 + assert("nikita-1315", h->tree != NULL);
63924 + assert("nikita-1316", h->key != NULL);
63925 +
63926 + tree = h->tree;
63927 + cache = &tree->cbk_cache;
63928 + if (cache->nr_slots == 0)
63929 + /* size of cbk cache was set to 0 by mount time option. */
63930 + return RETERR(-ENOENT);
63931 +
63932 + assert("nikita-2474", cbk_cache_invariant(cache));
63933 + node = NULL; /* to keep gcc happy */
63934 + level = h->level;
63935 + key = h->key;
63936 + isunique = h->flags & CBK_UNIQUE;
63937 + result = RETERR(-ENOENT);
63938 +
63939 +	/*
63940 +	 * this is a time-critical function and dragons have, hence, settled
63941 +	 * here.
63942 +	 *
63943 +	 * The loop below scans the cbk cache slots trying to find a matching
63944 +	 * node with a suitable range of delimiting keys located at h->level.
63945 +	 *
63946 +	 * The scan is done under the cbk cache spin lock that protects the
63947 +	 * slot->node pointers. If a suitable node is found we want to pin it
63948 +	 * in memory. But slot->node can point to a node with x_count 0
63949 +	 * (unreferenced). Such a node can be recycled at any moment, or can
63950 +	 * already be in the process of being recycled (within jput()).
63951 +	 *
63952 +	 * Since we found the node in the cbk cache, jput() cannot yet have
63953 +	 * called cbk_cache_invalidate().
63954 +	 *
63955 +	 * We acquire a reference to the node without holding the tree lock,
63956 +	 * and later check the node's RIP bit. This avoids races with jput().
63957 +	 */
63958 +
63959 + rcu_read_lock();
63960 + read_lock(&((cbk_cache *)cache)->guard);
63961 +
63962 + slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
63963 + slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
63964 + BUG_ON(&slot->lru != &cache->lru);/*????*/
63965 + while (1) {
63966 +
63967 + slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
63968 +
63969 + if (&cache->lru != &slot->lru)
63970 + node = slot->node;
63971 + else
63972 + node = NULL;
63973 +
63974 + if (unlikely(node == NULL))
63975 + break;
63976 +
63977 + /*
63978 + * this is (hopefully) the only place in the code where we are
63979 + * working with delimiting keys without holding dk lock. This
63980 + * is fine here, because this is only "guess" anyway---keys
63981 + * are rechecked under dk lock below.
63982 + */
63983 + if (znode_get_level(node) == level &&
63984 + /* reiser4_min_key < key < reiser4_max_key */
63985 + znode_contains_key_strict(node, key, isunique)) {
63986 + zref(node);
63987 + result = 0;
63988 + spin_lock_prefetch(&tree->tree_lock);
63989 + break;
63990 + }
63991 + }
63992 + read_unlock(&((cbk_cache *)cache)->guard);
63993 +
63994 + assert("nikita-2475", cbk_cache_invariant(cache));
63995 +
63996 + if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
63997 + result = -ENOENT;
63998 +
63999 + rcu_read_unlock();
64000 +
64001 + if (result != 0) {
64002 + h->result = CBK_COORD_NOTFOUND;
64003 + return RETERR(-ENOENT);
64004 + }
64005 +
64006 + result =
64007 + longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
64008 + ZNODE_LOCK_LOPRI);
64009 + zput(node);
64010 + if (result != 0)
64011 + return result;
64012 + result = zload(node);
64013 + if (result != 0)
64014 + return result;
64015 +
64016 + /* recheck keys */
64017 + read_lock_dk(tree);
64018 + result = (znode_contains_key_strict(node, key, isunique) &&
64019 + !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
64020 + read_unlock_dk(tree);
64021 + if (result) {
64022 + /* do lookup inside node */
64023 + llr = cbk_node_lookup(h);
64024 + /* if cbk_node_lookup() wandered to another node (due to eottl
64025 + or non-unique keys), adjust @node */
64026 + /*node = h->active_lh->node; */
64027 +
64028 + if (llr != LOOKUP_DONE) {
64029 + /* restart or continue on the next level */
64030 + result = RETERR(-ENOENT);
64031 + } else if (IS_CBKERR(h->result))
64032 + /* io or oom */
64033 + result = RETERR(-ENOENT);
64034 + else {
64035 + /* good. Either item found or definitely not found. */
64036 + result = 0;
64037 +
64038 + write_lock(&(cache->guard));
64039 + if (slot->node == h->active_lh->node) {
64040 + /* if this node is still in cbk cache---move
64041 + its slot to the head of the LRU list. */
64042 + list_move(&slot->lru, &cache->lru);
64043 + }
64044 + write_unlock(&(cache->guard));
64045 + }
64046 + } else {
64047 +		/* race. While this thread was waiting for the lock, the node
64048 +		   was rebalanced and the item we are looking for was shifted
64049 +		   out of it (if it ever was here).
64050 +
64051 +		   Continuing to scan is almost hopeless: the node the key
64052 +		   range was moved to is almost certainly at the beginning of
64053 +		   the LRU list by now, because it is hot, but restarting the
64054 +		   scan from the very beginning is complex. Just return, so
64055 +		   that a full cbk() will be performed. This is not that
64056 +		   important, because such races should be rare. Are they?
64057 +		 */
64058 + result = RETERR(-ENOENT); /* -ERAUGHT */
64059 + }
64060 + zrelse(node);
64061 + assert("nikita-2476", cbk_cache_invariant(cache));
64062 + return result;
64063 +}
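
The reference/RIP dance above follows a common RCU lookup shape: pin a possibly dying object without the heavyweight lock, then check a "being freed" marker before trusting it. Condensed, with hypothetical helper names (a sketch, not the patch's code):

/* sketch, not part of the patch */
rcu_read_lock();
read_lock(&cache->guard);
obj = find_candidate(cache);	/* may have reference count 0 */
if (obj != NULL)
	take_ref(obj);		/* pin it before dropping the guard */
read_unlock(&cache->guard);
if (obj != NULL && is_being_freed(obj))	/* the JNODE_RIP test above */
	obj = NULL;		/* too late: pretend we never saw it */
rcu_read_unlock();
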
64064 +
64065 +/* look for an item with the given key in the coord cache
64066 +
64067 +   This function, called by coord_by_key(), scans the "coord cache"
64068 +   (&cbk_cache), which is a small LRU list of recently accessed znodes. For
64069 +   each znode in this list, it checks whether the key we are looking for
64070 +   fits into the key range covered by this node. If so, and in addition the
64071 +   node lies at an allowed level (this is to handle extents on the twig
64072 +   level), the node is locked and a lookup inside it is performed.
64073 +
64074 +   We need a measurement of the cost of this cache search compared to the
64075 +   cost of coord_by_key().
64076 +
64077 +*/
64078 +static int cbk_cache_search(cbk_handle * h/* cbk handle */)
64079 +{
64080 + int result = 0;
64081 + tree_level level;
64082 +
64083 +	/* add CBK_IN_CACHE to the handle flags. This tells cbk_node_lookup()
64084 +	 * that the cbk_cache is being scanned, so it will not re-add the
64085 +	 * found node to the cache. */
64086 + h->flags |= CBK_IN_CACHE;
64087 + for (level = h->stop_level; level <= h->lock_level; ++level) {
64088 + h->level = level;
64089 + result = cbk_cache_scan_slots(h);
64090 + if (result != 0) {
64091 + done_lh(h->active_lh);
64092 + done_lh(h->parent_lh);
64093 + } else {
64094 + assert("nikita-1319", !IS_CBKERR(h->result));
64095 + break;
64096 + }
64097 + }
64098 + h->flags &= ~CBK_IN_CACHE;
64099 + return result;
64100 +}
64101 +
64102 +/* type of lock we want to obtain during tree traversal. On the stop level
64103 +   we want the lock type the user asked for; on upper levels, a read lock. */
64104 +znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
64105 +{
64106 + assert("nikita-382", h != NULL);
64107 +
64108 + return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
64109 +}
64110 +
64111 +/* update outdated delimiting keys */
64112 +static void stale_dk(reiser4_tree * tree, znode * node)
64113 +{
64114 + znode *right;
64115 +
64116 + read_lock_tree(tree);
64117 + write_lock_dk(tree);
64118 + right = node->right;
64119 +
64120 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
64121 + right && ZF_ISSET(right, JNODE_DKSET) &&
64122 + !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
64123 + znode_set_rd_key(node, znode_get_ld_key(right));
64124 +
64125 + write_unlock_dk(tree);
64126 + read_unlock_tree(tree);
64127 +}
64128 +
64129 +/* check for possibly outdated delimiting keys, and update them if
64130 + * necessary. */
64131 +static void update_stale_dk(reiser4_tree * tree, znode * node)
64132 +{
64133 + znode *right;
64134 + reiser4_key rd;
64135 +
64136 + read_lock_tree(tree);
64137 + read_lock_dk(tree);
64138 + rd = *znode_get_rd_key(node);
64139 + right = node->right;
64140 + if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
64141 + right && ZF_ISSET(right, JNODE_DKSET) &&
64142 + !keyeq(&rd, znode_get_ld_key(right)))) {
64143 + assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
64144 + read_unlock_dk(tree);
64145 + read_unlock_tree(tree);
64146 + stale_dk(tree, node);
64147 + return;
64148 + }
64149 + read_unlock_dk(tree);
64150 + read_unlock_tree(tree);
64151 +}
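
Together, update_stale_dk() and stale_dk() form an optimistic read-then-upgrade pattern: staleness is detected under cheap read locks and, only when actually found, the locks are dropped and retaken in write mode, re-checking the condition. Schematically, with hypothetical names:

/* sketch, not part of the patch */
read_lock(&guard);
stale = looks_stale(obj);
read_unlock(&guard);
if (unlikely(stale)) {
	write_lock(&guard);
	if (looks_stale(obj))	/* may have been fixed meanwhile */
		fix_up(obj);
	write_unlock(&guard);
}
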
64152 +
64153 +/*
64154 + * handle searches for a non-unique key.
64155 + *
64156 + * Suppose that we are looking for an item with possibly non-unique key 100.
64157 + *
64158 + * The root node contains two pointers: one to a node with left delimiting
64159 + * key 0, and another to a node with left delimiting key 100. The item we
64160 + * are interested in may well be in the sub-tree rooted at the first pointer.
64161 + *
64162 + * To handle this, search_to_left() is called when the search reaches the
64163 + * stop level. It checks whether it is _possible_ that the item we are
64164 + * looking for is in the left neighbor (by comparing delimiting keys) and,
64165 + * if so, tries to lock the left neighbor (a low priority lock, so it can
64166 + * deadlock; tree traversal is simply restarted if it does) and then checks
64167 + * whether the left neighbor actually contains items with our key.
64168 + *
64169 + * Note that this is done on the stop level only. It is possible to try such
64170 + * a left-check on each level, but as duplicate keys are supposed to be rare
64171 + * (it is very unlikely that more than one node is completely filled with
64172 + * items with duplicate keys), it is cheaper to scan left once, on the stop
64173 + * level.
64174 + */
64175 +static level_lookup_result search_to_left(cbk_handle * h/* search handle */)
64176 +{
64177 + level_lookup_result result;
64178 + coord_t *coord;
64179 + znode *node;
64180 + znode *neighbor;
64181 +
64182 + lock_handle lh;
64183 +
64184 + assert("nikita-1761", h != NULL);
64185 + assert("nikita-1762", h->level == h->stop_level);
64186 +
64187 + init_lh(&lh);
64188 + coord = h->coord;
64189 + node = h->active_lh->node;
64190 + assert("nikita-1763", coord_is_leftmost_unit(coord));
64191 +
64192 + h->result =
64193 + reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
64194 + GN_CAN_USE_UPPER_LEVELS);
64195 + neighbor = NULL;
64196 + switch (h->result) {
64197 + case -E_DEADLOCK:
64198 + result = LOOKUP_REST;
64199 + break;
64200 + case 0:{
64201 + node_plugin *nplug;
64202 + coord_t crd;
64203 + lookup_bias bias;
64204 +
64205 + neighbor = lh.node;
64206 + h->result = zload(neighbor);
64207 + if (h->result != 0) {
64208 + result = LOOKUP_DONE;
64209 + break;
64210 + }
64211 +
64212 + nplug = neighbor->nplug;
64213 +
64214 + coord_init_zero(&crd);
64215 + bias = h->bias;
64216 + h->bias = FIND_EXACT;
64217 + h->result =
64218 + nplug->lookup(neighbor, h->key, h->bias, &crd);
64219 + h->bias = bias;
64220 +
64221 + if (h->result == NS_NOT_FOUND) {
64222 + case -E_NO_NEIGHBOR:
64223 + h->result = CBK_COORD_FOUND;
64224 + if (!(h->flags & CBK_IN_CACHE))
64225 + cbk_cache_add(node);
64226 + default: /* some other error */
64227 + result = LOOKUP_DONE;
64228 + } else if (h->result == NS_FOUND) {
64229 + read_lock_dk(znode_get_tree(neighbor));
64230 + h->rd_key = *znode_get_ld_key(node);
64231 + leftmost_key_in_node(neighbor, &h->ld_key);
64232 + read_unlock_dk(znode_get_tree(neighbor));
64233 + h->flags |= CBK_DKSET;
64234 +
64235 + h->block = *znode_get_block(neighbor);
64236 + /* clear coord->node so that cbk_level_lookup()
64237 + wouldn't overwrite parent hint in neighbor.
64238 +
64239 + Parent hint was set up by
64240 + reiser4_get_left_neighbor()
64241 + */
64242 + /* FIXME: why do we have to spinlock here? */
64243 + write_lock_tree(znode_get_tree(neighbor));
64244 + h->coord->node = NULL;
64245 + write_unlock_tree(znode_get_tree(neighbor));
64246 + result = LOOKUP_CONT;
64247 + } else {
64248 + result = LOOKUP_DONE;
64249 + }
64250 + if (neighbor != NULL)
64251 + zrelse(neighbor);
64252 + }
64253 + }
64254 + done_lh(&lh);
64255 + return result;
64256 +}
64257 +
64258 +/* debugging aid: return symbolic name of search bias */
64259 +static const char *bias_name(lookup_bias bias/* bias to get name of */)
64260 +{
64261 + if (bias == FIND_EXACT)
64262 + return "exact";
64263 + else if (bias == FIND_MAX_NOT_MORE_THAN)
64264 + return "left-slant";
64265 +/* else if( bias == RIGHT_SLANT_BIAS ) */
64266 +/* return "right-bias"; */
64267 + else {
64268 + static char buf[30];
64269 +
64270 + sprintf(buf, "unknown: %i", bias);
64271 + return buf;
64272 + }
64273 +}
64274 +
64275 +#if REISER4_DEBUG
64276 +/* debugging aid: print human readable information about @p */
64277 +void print_coord_content(const char *prefix /* prefix to print */ ,
64278 + coord_t *p/* coord to print */)
64279 +{
64280 + reiser4_key key;
64281 +
64282 + if (p == NULL) {
64283 + printk("%s: null\n", prefix);
64284 + return;
64285 + }
64286 + if ((p->node != NULL) && znode_is_loaded(p->node)
64287 + && coord_is_existing_item(p))
64288 + printk("%s: data: %p, length: %i\n", prefix,
64289 + item_body_by_coord(p), item_length_by_coord(p));
64290 + if (znode_is_loaded(p->node)) {
64291 + item_key_by_coord(p, &key);
64292 + reiser4_print_key(prefix, &key);
64293 + }
64294 +}
64295 +
64296 +/* debugging aid: print human readable information about @block */
64297 +void reiser4_print_address(const char *prefix /* prefix to print */ ,
64298 + const reiser4_block_nr * block/* block number to print */)
64299 +{
64300 + printk("%s: %s\n", prefix, sprint_address(block));
64301 +}
64302 +#endif
64303 +
64304 +/* return string containing human readable representation of @block */
64305 +char *sprint_address(const reiser4_block_nr *
64306 + block/* block number to print */)
64307 +{
64308 + static char address[30];
64309 +
64310 + if (block == NULL)
64311 + sprintf(address, "null");
64312 + else if (reiser4_blocknr_is_fake(block))
64313 + sprintf(address, "%llx", (unsigned long long)(*block));
64314 + else
64315 + sprintf(address, "%llu", (unsigned long long)(*block));
64316 + return address;
64317 +}
64318 +
64319 +/* release parent node during traversal */
64320 +static void put_parent(cbk_handle * h/* search handle */)
64321 +{
64322 + assert("nikita-383", h != NULL);
64323 + if (h->parent_lh->node != NULL)
64324 + longterm_unlock_znode(h->parent_lh);
64325 +}
64326 +
64327 +/* helper function used by coord_by_key(): release reference to parent znode
64328 + stored in handle before processing its child. */
64329 +static void hput(cbk_handle * h/* search handle */)
64330 +{
64331 + assert("nikita-385", h != NULL);
64332 + done_lh(h->parent_lh);
64333 + done_lh(h->active_lh);
64334 +}
64335 +
64336 +/* Helper function used by cbk(): update delimiting keys of child node (stored
64337 + in h->active_lh->node) using key taken from parent on the parent level. */
64338 +static int setup_delimiting_keys(cbk_handle * h/* search handle */)
64339 +{
64340 + znode *active;
64341 + reiser4_tree *tree;
64342 +
64343 + assert("nikita-1088", h != NULL);
64344 +
64345 + active = h->active_lh->node;
64346 +
64347 + /* fast check without taking dk lock. This is safe, because
64348 + * JNODE_DKSET is never cleared once set. */
64349 + if (!ZF_ISSET(active, JNODE_DKSET)) {
64350 + tree = znode_get_tree(active);
64351 + write_lock_dk(tree);
64352 + if (!ZF_ISSET(active, JNODE_DKSET)) {
64353 + znode_set_ld_key(active, &h->ld_key);
64354 + znode_set_rd_key(active, &h->rd_key);
64355 + ZF_SET(active, JNODE_DKSET);
64356 + }
64357 + write_unlock_dk(tree);
64358 + return 1;
64359 + }
64360 + return 0;
64361 +}
64362 +
64363 +/* true if @block makes sense for the @tree. Used to detect corrupted node
64364 + * pointers */
64365 +static int
64366 +block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
64367 + reiser4_tree * tree/* tree to check against */)
64368 +{
64369 + assert("nikita-757", block != NULL);
64370 + assert("nikita-758", tree != NULL);
64371 +
64372 + /* check to see if it exceeds the size of the device. */
64373 + return reiser4_blocknr_is_sane_for(tree->super, block);
64374 +}
64375 +
64376 +/* check consistency of fields */
64377 +static int sanity_check(cbk_handle * h/* search handle */)
64378 +{
64379 + assert("nikita-384", h != NULL);
64380 +
64381 + if (h->level < h->stop_level) {
64382 + h->error = "Buried under leaves";
64383 + h->result = RETERR(-EIO);
64384 + return LOOKUP_DONE;
64385 + } else if (!block_nr_is_correct(&h->block, h->tree)) {
64386 + h->error = "bad block number";
64387 + h->result = RETERR(-EIO);
64388 + return LOOKUP_DONE;
64389 + } else
64390 + return 0;
64391 +}
64392 +
64393 +/* Make Linus happy.
64394 + Local variables:
64395 + c-indentation-style: "K&R"
64396 + mode-name: "LC"
64397 + c-basic-offset: 8
64398 + tab-width: 8
64399 + fill-column: 120
64400 + scroll-step: 1
64401 + End:
64402 +*/
64403 diff -urN linux-2.6.33.orig/fs/reiser4/status_flags.c linux-2.6.33/fs/reiser4/status_flags.c
64404 --- linux-2.6.33.orig/fs/reiser4/status_flags.c 1970-01-01 01:00:00.000000000 +0100
64405 +++ linux-2.6.33/fs/reiser4/status_flags.c 2010-03-04 19:33:22.000000000 +0100
64406 @@ -0,0 +1,174 @@
64407 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64408 + * reiser4/README */
64409 +
64410 +/* Functions that deal with the reiser4 status block: query the status and
64411 + * update it, if needed */
64412 +
64413 +#include <linux/bio.h>
64414 +#include <linux/highmem.h>
64415 +#include <linux/fs.h>
64416 +#include <linux/blkdev.h>
64417 +#include "debug.h"
64418 +#include "dformat.h"
64419 +#include "status_flags.h"
64420 +#include "super.h"
64421 +
64422 +/* This is our end I/O handler; it marks the page uptodate if the I/O was
64423 +   successful. It also unconditionally unlocks the page, so we can see that
64424 +   the I/O is done. We do not free the bio, because we hope to reuse it. */
64425 +static void reiser4_status_endio(struct bio *bio, int err)
64426 +{
64427 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
64428 + SetPageUptodate(bio->bi_io_vec->bv_page);
64429 + } else {
64430 + ClearPageUptodate(bio->bi_io_vec->bv_page);
64431 + SetPageError(bio->bi_io_vec->bv_page);
64432 + }
64433 + unlock_page(bio->bi_io_vec->bv_page);
64434 +}
64435 +
64436 +/* Initialise the status code. This is expected to be called from the disk
64437 +   format code. The block parameter is where the status block lives. */
64438 +int reiser4_status_init(reiser4_block_nr block)
64439 +{
64440 + struct super_block *sb = reiser4_get_current_sb();
64441 + struct reiser4_status *statuspage;
64442 + struct bio *bio;
64443 + struct page *page;
64444 +
64445 + get_super_private(sb)->status_page = NULL;
64446 + get_super_private(sb)->status_bio = NULL;
64447 +
64448 + page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
64449 + if (!page)
64450 + return -ENOMEM;
64451 +
64452 + bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
64453 + if (bio != NULL) {
64454 + bio->bi_sector = block * (sb->s_blocksize >> 9);
64455 + bio->bi_bdev = sb->s_bdev;
64456 + bio->bi_io_vec[0].bv_page = page;
64457 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64458 + bio->bi_io_vec[0].bv_offset = 0;
64459 + bio->bi_vcnt = 1;
64460 + bio->bi_size = sb->s_blocksize;
64461 + bio->bi_end_io = reiser4_status_endio;
64462 + } else {
64463 + __free_pages(page, 0);
64464 + return -ENOMEM;
64465 + }
64466 + lock_page(page);
64467 + submit_bio(READ, bio);
64468 + blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64469 + wait_on_page_locked(page);
64470 +	if (!PageUptodate(page)) {
64471 +		warning("green-2007", "I/O error while trying to read status page\n");
64472 +		__free_pages(page, 0);	/* do not leak the page... */
64473 +		bio_put(bio);		/* ...or the bio on this error path */
64474 +		return -EIO;
64475 +	}
64476 + statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
64477 + if (memcmp
64478 + (statuspage->magic, REISER4_STATUS_MAGIC,
64479 + sizeof(REISER4_STATUS_MAGIC))) {
64480 + /* Magic does not match. */
64481 + kunmap_atomic((char *)statuspage, KM_USER0);
64482 + warning("green-2008", "Wrong magic in status block\n");
64483 + __free_pages(page, 0);
64484 + bio_put(bio);
64485 + return -EINVAL;
64486 + }
64487 + kunmap_atomic((char *)statuspage, KM_USER0);
64488 +
64489 + get_super_private(sb)->status_page = page;
64490 + get_super_private(sb)->status_bio = bio;
64491 + return 0;
64492 +}
64493 +
64494 +/* Query the status of the FS. Returns whether the FS can be safely mounted.
64495 +   Also, if the "status" and "extended" parameters are given, the
64496 +   corresponding parts of the on-disk status are stored through them. */
64497 +int reiser4_status_query(u64 *status, u64 *extended)
64498 +{
64499 + struct super_block *sb = reiser4_get_current_sb();
64500 + struct reiser4_status *statuspage;
64501 + int retval;
64502 +
64503 + if (!get_super_private(sb)->status_page)
64504 + /* No status page? */
64505 + return REISER4_STATUS_MOUNT_UNKNOWN;
64506 + statuspage = (struct reiser4_status *)
64507 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64508 + switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) {
64509 + /* FIXME: this cast is a hack for 32 bit arches to work. */
64510 + case REISER4_STATUS_OK:
64511 + retval = REISER4_STATUS_MOUNT_OK;
64512 + break;
64513 + case REISER4_STATUS_CORRUPTED:
64514 + retval = REISER4_STATUS_MOUNT_WARN;
64515 + break;
64516 + case REISER4_STATUS_DAMAGED:
64517 + case REISER4_STATUS_DESTROYED:
64518 + case REISER4_STATUS_IOERROR:
64519 + retval = REISER4_STATUS_MOUNT_RO;
64520 + break;
64521 + default:
64522 + retval = REISER4_STATUS_MOUNT_UNKNOWN;
64523 + break;
64524 + }
64525 +
64526 + if (status)
64527 + *status = le64_to_cpu(get_unaligned(&statuspage->status));
64528 + if (extended)
64529 + *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
64530 +
64531 + kunmap_atomic((char *)statuspage, KM_USER0);
64532 + return retval;
64533 +}
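
As a hedged usage sketch (the mount helper below is hypothetical, not part of the patch), a disk-format driver could map the query result onto mount behaviour like this:

/* sketch: decide mount behaviour from the on-disk status block */
static int check_fs_status(struct super_block *sb)
{
	u64 status, extended;

	switch (reiser4_status_query(&status, &extended)) {
	case REISER4_STATUS_MOUNT_WARN:
		warning("xxxx-0001",	/* placeholder id */
			"filesystem marked corrupted; fsck is advised");
		return 0;			/* mount, but complain */
	case REISER4_STATUS_MOUNT_RO:
		sb->s_flags |= MS_RDONLY;	/* damaged: force read-only */
		return 0;
	case REISER4_STATUS_MOUNT_OK:
	default:				/* ..._UNKNOWN: no status page */
		return 0;
	}
}
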
64534 +
64535 +/* This function should be called when something bad happens (e.g. from
64536 + reiser4_panic). It fills the status structure and tries to push it to disk.*/
64537 +int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
64538 +{
64539 + struct super_block *sb = reiser4_get_current_sb();
64540 + struct reiser4_status *statuspage;
64541 + struct bio *bio = get_super_private(sb)->status_bio;
64542 +
64543 + if (!get_super_private(sb)->status_page)
64544 + /* No status page? */
64545 + return -1;
64546 + statuspage = (struct reiser4_status *)
64547 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64548 +
64549 + put_unaligned(cpu_to_le64(status), &statuspage->status);
64550 + put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
64551 + strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
64552 +
64553 + kunmap_atomic((char *)statuspage, KM_USER0);
64554 + bio->bi_bdev = sb->s_bdev;
64555 + bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
64556 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64557 + bio->bi_io_vec[0].bv_offset = 0;
64558 + bio->bi_vcnt = 1;
64559 + bio->bi_size = sb->s_blocksize;
64560 + bio->bi_end_io = reiser4_status_endio;
64561 + lock_page(get_super_private(sb)->status_page); /* Safe as nobody should
64562 + * touch our page. */
64563 + /* We can block now, but we have no other choice anyway */
64564 + submit_bio(WRITE, bio);
64565 + blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64566 + return 0; /* We do not wait for io to finish. */
64567 +}
64568 +
64569 +/* Frees the status page and the bio structure. Should be called by the disk
64570 + * format at umount time */
64571 +int reiser4_status_finish(void)
64572 +{
64573 + struct super_block *sb = reiser4_get_current_sb();
64574 +
64575 + __free_pages(get_super_private(sb)->status_page, 0);
64576 + get_super_private(sb)->status_page = NULL;
64577 + bio_put(get_super_private(sb)->status_bio);
64578 + get_super_private(sb)->status_bio = NULL;
64579 + return 0;
64580 +}
64581 diff -urN linux-2.6.33.orig/fs/reiser4/status_flags.h linux-2.6.33/fs/reiser4/status_flags.h
64582 --- linux-2.6.33.orig/fs/reiser4/status_flags.h 1970-01-01 01:00:00.000000000 +0100
64583 +++ linux-2.6.33/fs/reiser4/status_flags.h 2010-03-04 19:33:22.000000000 +0100
64584 @@ -0,0 +1,47 @@
64585 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64586 + * reiser4/README */
64587 +
64588 +/* Here we declare the structures and flags that store the reiser4 status on
64589 +   disk. The status helps us find out whether the filesystem is valid or
64590 +   whether it contains critical, or not so critical, errors */
64591 +
64592 +#if !defined(__REISER4_STATUS_FLAGS_H__)
64593 +#define __REISER4_STATUS_FLAGS_H__
64594 +
64595 +#include "dformat.h"
64596 +/* These are major status flags */
64597 +#define REISER4_STATUS_OK 0
64598 +#define REISER4_STATUS_CORRUPTED 0x1
64599 +#define REISER4_STATUS_DAMAGED 0x2
64600 +#define REISER4_STATUS_DESTROYED 0x4
64601 +#define REISER4_STATUS_IOERROR 0x8
64602 +
64603 +/* Return values for reiser4_status_query() */
64604 +#define REISER4_STATUS_MOUNT_OK 0
64605 +#define REISER4_STATUS_MOUNT_WARN 1
64606 +#define REISER4_STATUS_MOUNT_RO 2
64607 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
64608 +
64609 +#define REISER4_TEXTERROR_LEN 256
64610 +
64611 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
64612 +/* We probably need to keep its size under the sector size, which is 512 bytes */
64613 +struct reiser4_status {
64614 + char magic[16];
64615 + d64 status; /* Current FS state */
64616 +	d64 extended_status;	/* Any additional info that might make sense
64617 +				 * in addition to "status". E.g. the last
64618 +				 * sector where an io error happened, if the
64619 +				 * status is "io error encountered" */
64620 +	d64 stacktrace[10];	/* Last ten function calls made (addresses) */
64621 + char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if
64622 + * appropriate, otherwise filled
64623 + * with zeroes */
64624 +};
64625 +
64626 +int reiser4_status_init(reiser4_block_nr block);
64627 +int reiser4_status_query(u64 *status, u64 *extended);
64628 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
64629 +int reiser4_status_finish(void);
64630 +
64631 +#endif
64632 diff -urN linux-2.6.33.orig/fs/reiser4/super.c linux-2.6.33/fs/reiser4/super.c
64633 --- linux-2.6.33.orig/fs/reiser4/super.c 1970-01-01 01:00:00.000000000 +0100
64634 +++ linux-2.6.33/fs/reiser4/super.c 2010-03-04 19:33:22.000000000 +0100
64635 @@ -0,0 +1,306 @@
64636 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64637 + * reiser4/README */
64638 +
64639 +/* Super-block manipulations. */
64640 +
64641 +#include "debug.h"
64642 +#include "dformat.h"
64643 +#include "key.h"
64644 +#include "plugin/security/perm.h"
64645 +#include "plugin/space/space_allocator.h"
64646 +#include "plugin/plugin.h"
64647 +#include "tree.h"
64648 +#include "vfs_ops.h"
64649 +#include "super.h"
64650 +#include "reiser4.h"
64651 +
64652 +#include <linux/types.h> /* for __u?? */
64653 +#include <linux/fs.h> /* for struct super_block */
64654 +
64655 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
64656 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
64657 +static __u64 reserved_for_root(const struct super_block *super);
64658 +
64659 +/* Return reiser4-specific part of super block */
64660 +reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super)
64661 +{
64662 + return (reiser4_super_info_data *) super->s_fs_info;
64663 +}
64664 +
64665 +/* Return reiser4 fstype: value that is returned in ->f_type field by statfs()
64666 + */
64667 +long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
64668 +{
64669 + assert("nikita-448", super != NULL);
64670 + assert("nikita-449", is_reiser4_super(super));
64671 + return (long)REISER4_SUPER_MAGIC;
64672 +}
64673 +
64674 +/* functions to read/modify fields of reiser4_super_info_data */
64675 +
64676 +/* get number of blocks in file system */
64677 +__u64 reiser4_block_count(const struct super_block *super /* super block
64678 + queried */ )
64679 +{
64680 + assert("vs-494", super != NULL);
64681 + assert("vs-495", is_reiser4_super(super));
64682 + return get_super_private(super)->block_count;
64683 +}
64684 +
64685 +#if REISER4_DEBUG
64686 +/*
64687 + * number of blocks in the current file system
64688 + */
64689 +__u64 reiser4_current_block_count(void)
64690 +{
64691 + return get_current_super_private()->block_count;
64692 +}
64693 +#endif /* REISER4_DEBUG */
64694 +
64695 +/* set number of block in filesystem */
64696 +void reiser4_set_block_count(const struct super_block *super, __u64 nr)
64697 +{
64698 + assert("vs-501", super != NULL);
64699 + assert("vs-502", is_reiser4_super(super));
64700 + get_super_private(super)->block_count = nr;
64701 +	/*
64702 +	 * For the proper calculation of the reserved space counter (5% of the
64703 +	 * device block count) we would need a 64 bit division, which is
64704 +	 * missing in Linux on the i386 platform. Because we do not need a
64705 +	 * precise calculation here, we can replace the div64 operation by
64706 +	 * this combination of multiplication and shift: 51 / (2^10) == .0498.
64707 +	 * FIXME: this is a bug. It comes up only for very small filesystems,
64708 +	 * which probably are never used. Nevertheless, it is a bug. The number
64709 +	 * of reserved blocks must be no less than the maximal number of blocks
64710 +	 * that can get grabbed with BA_RESERVED.
64711 +	 */
64712 + get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
64713 +}
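
To check the approximation in the comment above with concrete numbers: for nr = 1,000,000 blocks, (nr * 51) >> 10 = 51,000,000 / 1,024 = 49,804 reserved blocks, i.e. about 4.98% against the exact 5% figure of 50,000. A standalone sketch (not part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned long long nr = 1000000ULL;		/* device size in blocks */
	unsigned long long approx = (nr * 51) >> 10;	/* the patch's formula */
	unsigned long long exact = nr * 5 / 100;	/* true 5% */

	/* prints: approx=49804 exact=50000 (about 4.98% vs 5%) */
	printf("approx=%llu exact=%llu\n", approx, exact);
	return 0;
}
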
64714 +
64715 +/* amount of blocks used (allocated for data) in file system */
64716 +__u64 reiser4_data_blocks(const struct super_block *super /* super block
64717 + queried */ )
64718 +{
64719 + assert("nikita-452", super != NULL);
64720 + assert("nikita-453", is_reiser4_super(super));
64721 + return get_super_private(super)->blocks_used;
64722 +}
64723 +
64724 +/* set number of block used in filesystem */
64725 +void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
64726 +{
64727 + assert("vs-503", super != NULL);
64728 + assert("vs-504", is_reiser4_super(super));
64729 + get_super_private(super)->blocks_used = nr;
64730 +}
64731 +
64732 +/* amount of free blocks in file system */
64733 +__u64 reiser4_free_blocks(const struct super_block *super /* super block
64734 + queried */ )
64735 +{
64736 + assert("nikita-454", super != NULL);
64737 + assert("nikita-455", is_reiser4_super(super));
64738 + return get_super_private(super)->blocks_free;
64739 +}
64740 +
64741 +/* set number of blocks free in filesystem */
64742 +void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
64743 +{
64744 + assert("vs-505", super != NULL);
64745 + assert("vs-506", is_reiser4_super(super));
64746 + get_super_private(super)->blocks_free = nr;
64747 +}
64748 +
64749 +/* get mkfs unique identifier */
64750 +__u32 reiser4_mkfs_id(const struct super_block *super /* super block
64751 + queried */ )
64752 +{
64753 + assert("vpf-221", super != NULL);
64754 + assert("vpf-222", is_reiser4_super(super));
64755 + return get_super_private(super)->mkfs_id;
64756 +}
64757 +
64758 +/* amount of committed free blocks in the file system */
64759 +__u64 reiser4_free_committed_blocks(const struct super_block *super)
64760 +{
64761 + assert("vs-497", super != NULL);
64762 + assert("vs-498", is_reiser4_super(super));
64763 + return get_super_private(super)->blocks_free_committed;
64764 +}
64765 +
64766 +/* amount of blocks in the file system reserved for @uid and @gid */
64767 +long reiser4_reserved_blocks(const struct super_block *super /* super block
64768 + queried */ ,
64769 + uid_t uid /* user id */ ,
64770 + gid_t gid/* group id */)
64771 +{
64772 + long reserved;
64773 +
64774 + assert("nikita-456", super != NULL);
64775 + assert("nikita-457", is_reiser4_super(super));
64776 +
64777 + reserved = 0;
64778 + if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
64779 + reserved += reserved_for_gid(super, gid);
64780 + if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
64781 + reserved += reserved_for_uid(super, uid);
64782 + if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
64783 + reserved += reserved_for_root(super);
64784 + return reserved;
64785 +}
64786 +
64787 +/* get/set value of/to grabbed blocks counter */
64788 +__u64 reiser4_grabbed_blocks(const struct super_block * super)
64789 +{
64790 + assert("zam-512", super != NULL);
64791 + assert("zam-513", is_reiser4_super(super));
64792 +
64793 + return get_super_private(super)->blocks_grabbed;
64794 +}
64795 +
64796 +__u64 reiser4_flush_reserved(const struct super_block *super)
64797 +{
64798 + assert("vpf-285", super != NULL);
64799 + assert("vpf-286", is_reiser4_super(super));
64800 +
64801 + return get_super_private(super)->blocks_flush_reserved;
64802 +}
64803 +
64804 +/* get/set value of/to counter of fake allocated formatted blocks */
64805 +__u64 reiser4_fake_allocated(const struct super_block *super)
64806 +{
64807 + assert("zam-516", super != NULL);
64808 + assert("zam-517", is_reiser4_super(super));
64809 +
64810 + return get_super_private(super)->blocks_fake_allocated;
64811 +}
64812 +
64813 +/* get/set value of/to counter of fake allocated unformatted blocks */
64814 +__u64 reiser4_fake_allocated_unformatted(const struct super_block *super)
64815 +{
64816 + assert("zam-516", super != NULL);
64817 + assert("zam-517", is_reiser4_super(super));
64818 +
64819 + return get_super_private(super)->blocks_fake_allocated_unformatted;
64820 +}
64821 +
64822 +/* get/set value of/to counter of clustered blocks */
64823 +__u64 reiser4_clustered_blocks(const struct super_block *super)
64824 +{
64825 + assert("edward-601", super != NULL);
64826 + assert("edward-602", is_reiser4_super(super));
64827 +
64828 + return get_super_private(super)->blocks_clustered;
64829 +}
64830 +
64831 +/* space allocator used by this file system */
64832 +reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
64833 + *super)
64834 +{
64835 + assert("nikita-1965", super != NULL);
64836 + assert("nikita-1966", is_reiser4_super(super));
64837 + return &get_super_private(super)->space_allocator;
64838 +}
64839 +
64840 +/* return fake inode used to bind formatted nodes in the page cache */
64841 +struct inode *reiser4_get_super_fake(const struct super_block *super)
64842 +{
64843 + assert("nikita-1757", super != NULL);
64844 + return get_super_private(super)->fake;
64845 +}
64846 +
64847 +/* return fake inode used to bind copied on capture nodes in the page cache */
64848 +struct inode *reiser4_get_cc_fake(const struct super_block *super)
64849 +{
64850 + assert("nikita-1757", super != NULL);
64851 + return get_super_private(super)->cc;
64852 +}
64853 +
64854 +/* return fake inode used to bind bitmaps and journal heads */
64855 +struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
64856 +{
64857 + assert("nikita-17571", super != NULL);
64858 + return get_super_private(super)->bitmap;
64859 +}
64860 +
64861 +/* tree used by this file system */
64862 +reiser4_tree *reiser4_get_tree(const struct super_block *super)
64863 +{
64864 + assert("nikita-460", super != NULL);
64865 + assert("nikita-461", is_reiser4_super(super));
64866 + return &get_super_private(super)->tree;
64867 +}
64868 +
64869 +/* Check that @super is (looks like) reiser4 super block. This is mainly for
64870 + use in assertions. */
64871 +int is_reiser4_super(const struct super_block *super)
64872 +{
64873 + return
64874 + super != NULL &&
64875 + get_super_private(super) != NULL &&
64876 + super->s_op == &(get_super_private(super)->ops.super);
64877 +}
64878 +
64879 +int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
64880 +{
64881 + return test_bit((int)f, &get_super_private(super)->fs_flags);
64882 +}
64883 +
64884 +/* amount of blocks reserved for given group in file system */
64885 +static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG,
64886 + gid_t gid UNUSED_ARG/* group id */)
64887 +{
64888 + return 0;
64889 +}
64890 +
64891 +/* amount of blocks reserved for given user in file system */
64892 +static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG,
64893 + uid_t uid UNUSED_ARG/* user id */)
64894 +{
64895 + return 0;
64896 +}
64897 +
64898 +/* amount of blocks reserved for super user in file system */
64899 +static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG)
64900 +{
64901 + return 0;
64902 +}
64903 +
64904 +/*
64905 + * true if block number @blk makes sense for the file system at @super.
64906 + */
64907 +int
64908 +reiser4_blocknr_is_sane_for(const struct super_block *super,
64909 + const reiser4_block_nr * blk)
64910 +{
64911 + reiser4_super_info_data *sbinfo;
64912 +
64913 + assert("nikita-2957", super != NULL);
64914 + assert("nikita-2958", blk != NULL);
64915 +
64916 + if (reiser4_blocknr_is_fake(blk))
64917 + return 1;
64918 +
64919 + sbinfo = get_super_private(super);
64920 + return *blk < sbinfo->block_count;
64921 +}
64922 +
64923 +#if REISER4_DEBUG
64924 +/*
64925 + * true, if block number @blk makes sense for the current file system
64926 + */
64927 +int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
64928 +{
64929 + return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
64930 +}
64931 +#endif /* REISER4_DEBUG */
64932 +
64933 +/* Make Linus happy.
64934 + Local variables:
64935 + c-indentation-style: "K&R"
64936 + mode-name: "LC"
64937 + c-basic-offset: 8
64938 + tab-width: 8
64939 + fill-column: 120
64940 + End:
64941 +*/
64942 diff -urN linux-2.6.33.orig/fs/reiser4/super.h linux-2.6.33/fs/reiser4/super.h
64943 --- linux-2.6.33.orig/fs/reiser4/super.h 1970-01-01 01:00:00.000000000 +0100
64944 +++ linux-2.6.33/fs/reiser4/super.h 2010-03-04 19:33:22.000000000 +0100
64945 @@ -0,0 +1,466 @@
64946 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64947 + * reiser4/README */
64948 +
64949 +/* Super-block functions. See super.c for details. */
64950 +
64951 +#if !defined(__REISER4_SUPER_H__)
64952 +#define __REISER4_SUPER_H__
64953 +
64954 +#include <linux/exportfs.h>
64955 +
64956 +#include "tree.h"
64957 +#include "entd.h"
64958 +#include "wander.h"
64959 +#include "fsdata.h"
64960 +#include "plugin/object.h"
64961 +#include "plugin/space/space_allocator.h"
64962 +
64963 +/*
64964 + * Flush algorithms parameters.
64965 + */
64966 +struct flush_params {
64967 + unsigned relocate_threshold;
64968 + unsigned relocate_distance;
64969 + unsigned written_threshold;
64970 + unsigned scan_maxnodes;
64971 +};
64972 +
64973 +typedef enum {
64974 + /*
64975 + * True if this file system doesn't support hard-links (multiple names)
64976 + * for directories: this is default UNIX behavior.
64977 + *
64978 +	 * If hard-links on directories are not allowed, the file system is an Acyclic
64979 + * Directed Graph (modulo dot, and dotdot, of course).
64980 + *
64981 + * This is used by reiser4_link().
64982 + */
64983 + REISER4_ADG = 0,
64984 + /*
64985 + * set if all nodes in internal tree have the same node layout plugin.
64986 +	 * set if all nodes in internal tree have the same node layout plugin.
64987 +	 * If so, znode_guess_plugin() will return tree->node_plugin instead
64988 +	 * of guessing the plugin by the plugin id stored in the node.
64989 +	 */
64989 +	REISER4_ONE_NODE_PLUGIN = 1,
64990 + /* if set, bsd gid assignment is supported. */
64991 + REISER4_BSD_GID = 2,
64992 + /* [mac]_time are 32 bit in inode */
64993 + REISER4_32_BIT_TIMES = 3,
64994 +	/* don't load all bitmap blocks at mount time */
64995 + REISER4_DONT_LOAD_BITMAP = 5,
64996 + /* enforce atomicity during write(2) */
64997 + REISER4_ATOMIC_WRITE = 6,
64998 + /* don't use write barriers in the log writer code. */
64999 + REISER4_NO_WRITE_BARRIER = 7
65000 +} reiser4_fs_flag;
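/*
 * Illustrative sketch (editorial, not part of the original patch): fs_flags is
 * a plain bitmask, so option parsing can set a flag with the generic bit
 * operations and later code tests it via reiser4_is_set(), e.g.:
 *
 *	set_bit((int)REISER4_BSD_GID, &get_super_private(super)->fs_flags);
 *	...
 *	if (reiser4_is_set(super, REISER4_BSD_GID))
 *		apply_bsd_gid_rule(inode);
 *
 * apply_bsd_gid_rule() is a hypothetical helper, used here only to show the
 * query pattern.
 */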
65001 +
65002 +/*
65003 + * VFS related operation vectors.
65004 + */
65005 +struct object_ops {
65006 + struct super_operations super;
65007 + struct dentry_operations dentry;
65008 + struct export_operations export;
65009 +};
65010 +
65011 +/* reiser4-specific part of super block
65012 +
65013 + Locking
65014 +
65015 + Fields immutable after mount:
65016 +
65017 + ->oid*
65018 + ->space*
65019 + ->default_[ug]id
65020 + ->mkfs_id
65021 + ->trace_flags
65022 + ->debug_flags
65023 + ->fs_flags
65024 + ->df_plug
65025 + ->optimal_io_size
65026 + ->plug
65027 + ->flush
65028 + ->u (bad name)
65029 + ->txnmgr
65030 + ->ra_params
65031 + ->fsuid
65032 + ->journal_header
65033 + ->journal_footer
65034 +
65035 + Fields protected by ->lnode_guard
65036 +
65037 + ->lnode_htable
65038 +
65039 + Fields protected by per-super block spin lock
65040 +
65041 + ->block_count
65042 + ->blocks_used
65043 + ->blocks_free
65044 + ->blocks_free_committed
65045 + ->blocks_grabbed
65046 + ->blocks_fake_allocated_unformatted
65047 + ->blocks_fake_allocated
65048 + ->blocks_flush_reserved
65049 + ->eflushed
65050 + ->blocknr_hint_default
65051 +
65052 + After journal replay during mount,
65053 +
65054 + ->last_committed_tx
65055 +
65056 + is protected by ->tmgr.commit_mutex
65057 +
65058 + Invariants involving this data-type:
65059 +
65060 + [sb-block-counts]
65061 + [sb-grabbed]
65062 + [sb-fake-allocated]
65063 +*/
65064 +struct reiser4_super_info_data {
65065 + /*
65066 + * guard spinlock which protects reiser4 super block fields (currently
65067 + * blocks_free, blocks_free_committed)
65068 + */
65069 + spinlock_t guard;
65070 +
65071 + /* next oid that will be returned by oid_allocate() */
65072 + oid_t next_to_use;
65073 + /* total number of used oids */
65074 + oid_t oids_in_use;
65075 +
65076 + /* space manager plugin */
65077 + reiser4_space_allocator space_allocator;
65078 +
65079 + /* reiser4 internal tree */
65080 + reiser4_tree tree;
65081 +
65082 + /*
65083 + * default user id used for light-weight files without their own
65084 + * stat-data.
65085 + */
65086 + uid_t default_uid;
65087 +
65088 + /*
65089 + * default group id used for light-weight files without their own
65090 + * stat-data.
65091 + */
65092 + gid_t default_gid;
65093 +
65094 + /* mkfs identifier generated at mkfs time. */
65095 + __u32 mkfs_id;
65096 + /* amount of blocks in a file system */
65097 + __u64 block_count;
65098 +
65099 + /* inviolable reserve */
65100 + __u64 blocks_reserved;
65101 +
65102 + /* amount of blocks used by file system data and meta-data. */
65103 + __u64 blocks_used;
65104 +
65105 + /*
65106 + * amount of free blocks. This is "working" free blocks counter. It is
65107 + * like "working" bitmap, please see block_alloc.c for description.
65108 + */
65109 + __u64 blocks_free;
65110 +
65111 + /*
65112 + * free block count for fs committed state. This is "commit" version of
65113 + * free block counter.
65114 + */
65115 + __u64 blocks_free_committed;
65116 +
65117 + /*
65118 + * number of blocks reserved for further allocation, for all
65119 + * threads.
65120 + */
65121 + __u64 blocks_grabbed;
65122 +
65123 + /* number of fake allocated unformatted blocks in tree. */
65124 + __u64 blocks_fake_allocated_unformatted;
65125 +
65126 + /* number of fake allocated formatted blocks in tree. */
65127 + __u64 blocks_fake_allocated;
65128 +
65129 + /* number of blocks reserved for flush operations. */
65130 + __u64 blocks_flush_reserved;
65131 +
65132 + /* number of blocks reserved for cluster operations. */
65133 + __u64 blocks_clustered;
65134 +
65135 + /* unique file-system identifier */
65136 + __u32 fsuid;
65137 +
65138 +	/* On-disk format version. If it does not equal the disk_format
65139 +	   plugin version, some format updates (e.g. enlarging the plugin
65140 +	   set) may take place on mount. */
65141 + int version;
65142 +
65143 + /* file-system wide flags. See reiser4_fs_flag enum */
65144 + unsigned long fs_flags;
65145 +
65146 + /* transaction manager */
65147 + txn_mgr tmgr;
65148 +
65149 + /* ent thread */
65150 + entd_context entd;
65151 +
65152 + /* fake inode used to bind formatted nodes */
65153 + struct inode *fake;
65154 + /* inode used to bind bitmaps (and journal heads) */
65155 + struct inode *bitmap;
65156 + /* inode used to bind copied on capture nodes */
65157 + struct inode *cc;
65158 +
65159 + /* disk layout plugin */
65160 + disk_format_plugin *df_plug;
65161 +
65162 + /* disk layout specific part of reiser4 super info data */
65163 + union {
65164 + format40_super_info format40;
65165 + } u;
65166 +
65167 + /* value we return in st_blksize on stat(2) */
65168 + unsigned long optimal_io_size;
65169 +
65170 + /* parameters for the flush algorithm */
65171 + struct flush_params flush;
65172 +
65173 + /* pointers to jnodes for journal header and footer */
65174 + jnode *journal_header;
65175 + jnode *journal_footer;
65176 +
65177 + journal_location jloc;
65178 +
65179 + /* head block number of last committed transaction */
65180 + __u64 last_committed_tx;
65181 +
65182 + /*
65183 +	 * we remember last written location for use as a hint for new block
65184 + * allocation
65185 + */
65186 + __u64 blocknr_hint_default;
65187 +
65188 +	/* committed number of files (oid allocator state variable) */
65189 + __u64 nr_files_committed;
65190 +
65191 + struct formatted_ra_params ra_params;
65192 +
65193 + /*
65194 +	 * A mutex for serializing cut-tree operations when out of free space:
65195 +	 * only one cut_tree thread at a time is allowed to grab space from the
65196 +	 * reserved area (5% of disk space)
65197 + */
65198 + struct mutex delete_mutex;
65199 + /* task owning ->delete_mutex */
65200 + struct task_struct *delete_mutex_owner;
65201 +
65202 +	/* Diskmap's block number */
65203 + __u64 diskmap_block;
65204 +
65205 + /* What to do in case of error */
65206 + int onerror;
65207 +
65208 + /* operations for objects on this file system */
65209 + struct object_ops ops;
65210 +
65211 + /*
65212 + * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
65213 + * more details
65214 + */
65215 + struct d_cursor_info d_info;
65216 +
65217 +#ifdef CONFIG_REISER4_BADBLOCKS
65218 + /* Alternative master superblock offset (in bytes) */
65219 + unsigned long altsuper;
65220 +#endif
65221 + struct repacker *repacker;
65222 + struct page *status_page;
65223 + struct bio *status_bio;
65224 +
65225 +#if REISER4_DEBUG
65226 + /*
65227 + * minimum used blocks value (includes super blocks, bitmap blocks and
65228 + * other fs reserved areas), depends on fs format and fs size.
65229 + */
65230 + __u64 min_blocks_used;
65231 +
65232 + /*
65233 + * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
65234 + * are kept on a list anchored at sbinfo->all_jnodes. This list is
65235 + * protected by sbinfo->all_guard spin lock. This lock should be taken
65236 + * with _irq modifier, because it is also modified from interrupt
65237 + * contexts (by RCU).
65238 + */
65239 + spinlock_t all_guard;
65240 + /* list of all jnodes */
65241 + struct list_head all_jnodes;
65242 +#endif
65243 + struct dentry *debugfs_root;
65244 +};
65245 +
65246 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
65247 + super_block * super);
65248 +
65249 +/* Return reiser4-specific part of super block */
65250 +static inline reiser4_super_info_data *get_super_private(const struct
65251 + super_block * super)
65252 +{
65253 + assert("nikita-447", super != NULL);
65254 +
65255 + return (reiser4_super_info_data *) super->s_fs_info;
65256 +}
65257 +
65258 +/* get ent context for the @super */
65259 +static inline entd_context *get_entd_context(struct super_block *super)
65260 +{
65261 + return &get_super_private(super)->entd;
65262 +}
65263 +
65264 +/* "Current" super-block: main super block used during current system
65265 + call. Reference to this super block is stored in reiser4_context. */
65266 +static inline struct super_block *reiser4_get_current_sb(void)
65267 +{
65268 + return get_current_context()->super;
65269 +}
65270 +
65271 +/* Reiser4-specific part of "current" super-block: main super block used
65272 + during current system call. Reference to this super block is stored in
65273 + reiser4_context. */
65274 +static inline reiser4_super_info_data *get_current_super_private(void)
65275 +{
65276 + return get_super_private(reiser4_get_current_sb());
65277 +}
65278 +
65279 +static inline struct formatted_ra_params *get_current_super_ra_params(void)
65280 +{
65281 + return &(get_current_super_private()->ra_params);
65282 +}
65283 +
65284 +/*
65285 + * true, if file system on @super is read-only
65286 + */
65287 +static inline int rofs_super(struct super_block *super)
65288 +{
65289 + return super->s_flags & MS_RDONLY;
65290 +}
65291 +
65292 +/*
65293 + * true, if @tree represents read-only file system
65294 + */
65295 +static inline int rofs_tree(reiser4_tree * tree)
65296 +{
65297 + return rofs_super(tree->super);
65298 +}
65299 +
65300 +/*
65301 + * true, if file system where @inode lives on, is read-only
65302 + */
65303 +static inline int rofs_inode(struct inode *inode)
65304 +{
65305 + return rofs_super(inode->i_sb);
65306 +}
65307 +
65308 +/*
65309 + * true, if file system where @node lives on, is read-only
65310 + */
65311 +static inline int rofs_jnode(jnode * node)
65312 +{
65313 + return rofs_tree(jnode_get_tree(node));
65314 +}
65315 +
65316 +extern __u64 reiser4_current_block_count(void);
65317 +
65318 +extern void build_object_ops(struct super_block *super, struct object_ops *ops);
65319 +
65320 +#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
65321 +
65322 +static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
65323 +{
65324 + spin_lock(&(sbinfo->guard));
65325 +}
65326 +
65327 +static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
65328 +{
65329 + assert_spin_locked(&(sbinfo->guard));
65330 + spin_unlock(&(sbinfo->guard));
65331 +}
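/*
 * Illustrative sketch (editorial, not part of the original patch): the block
 * counters listed above as "protected by per-super block spin lock" are
 * updated under sbinfo->guard using the two helpers just defined:
 */
static inline void example_move_free_to_used(reiser4_super_info_data *sbinfo,
					     __u64 nr)
{
	spin_lock_reiser4_super(sbinfo);
	/* hypothetical transfer of @nr blocks from the free to the used pool */
	sbinfo->blocks_free -= nr;
	sbinfo->blocks_used += nr;
	spin_unlock_reiser4_super(sbinfo);
}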
65332 +
65333 +extern __u64 reiser4_flush_reserved(const struct super_block *);
65334 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
65335 +extern long reiser4_statfs_type(const struct super_block *super);
65336 +extern __u64 reiser4_block_count(const struct super_block *super);
65337 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
65338 +extern __u64 reiser4_data_blocks(const struct super_block *super);
65339 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
65340 +extern __u64 reiser4_free_blocks(const struct super_block *super);
65341 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
65342 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
65343 +
65344 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
65345 +
65346 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
65347 +extern __u64 reiser4_fake_allocated(const struct super_block *);
65348 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
65349 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
65350 +
65351 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
65352 + gid_t gid);
65353 +
65354 +extern reiser4_space_allocator *
65355 +reiser4_get_space_allocator(const struct super_block *super);
65356 +extern reiser4_oid_allocator *
65357 +reiser4_get_oid_allocator(const struct super_block *super);
65358 +extern struct inode *reiser4_get_super_fake(const struct super_block *super);
65359 +extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
65360 +extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
65361 +extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
65362 +extern int is_reiser4_super(const struct super_block *super);
65363 +
65364 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
65365 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
65366 + const reiser4_block_nr * blk);
65367 +extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
65368 +extern int reiser4_done_super(struct super_block *s);
65369 +
65370 +/* step of fill super */
65371 +extern int reiser4_init_fs_info(struct super_block *);
65372 +extern void reiser4_done_fs_info(struct super_block *);
65373 +extern int reiser4_init_super_data(struct super_block *, char *opt_string);
65374 +extern int reiser4_init_read_super(struct super_block *, int silent);
65375 +extern int reiser4_init_root_inode(struct super_block *);
65376 +extern reiser4_plugin *get_default_plugin(pset_member memb);
65377 +
65378 +/* Maximal possible object id. */
65379 +#define ABSOLUTE_MAX_OID ((oid_t)~0)
65380 +
65381 +#define OIDS_RESERVED (1 << 16)
65382 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
65383 +oid_t oid_allocate(struct super_block *);
65384 +int oid_release(struct super_block *, oid_t);
65385 +oid_t oid_next(const struct super_block *);
65386 +void oid_count_allocated(void);
65387 +void oid_count_released(void);
65388 +long oids_used(const struct super_block *);
65389 +
65390 +#if REISER4_DEBUG
65391 +void print_fs_info(const char *prefix, const struct super_block *);
65392 +#endif
65393 +
65394 +extern void destroy_reiser4_cache(struct kmem_cache **);
65395 +
65396 +extern struct super_operations reiser4_super_operations;
65397 +extern struct export_operations reiser4_export_operations;
65398 +extern struct dentry_operations reiser4_dentry_operations;
65399 +
65400 +/* __REISER4_SUPER_H__ */
65401 +#endif
65402 +
65403 +/*
65404 + * Local variables:
65405 + * c-indentation-style: "K&R"
65406 + * mode-name: "LC"
65407 + * c-basic-offset: 8
65408 + * tab-width: 8
65409 + * fill-column: 120
65410 + * End:
65411 + */
65412 diff -urN linux-2.6.33.orig/fs/reiser4/super_ops.c linux-2.6.33/fs/reiser4/super_ops.c
65413 --- linux-2.6.33.orig/fs/reiser4/super_ops.c 1970-01-01 01:00:00.000000000 +0100
65414 +++ linux-2.6.33/fs/reiser4/super_ops.c 2010-03-04 19:33:22.000000000 +0100
65415 @@ -0,0 +1,736 @@
65416 +/* Copyright 2005 by Hans Reiser, licensing governed by
65417 + * reiser4/README */
65418 +
65419 +#include "inode.h"
65420 +#include "page_cache.h"
65421 +#include "ktxnmgrd.h"
65422 +#include "flush.h"
65423 +#include "safe_link.h"
65424 +
65425 +#include <linux/vfs.h>
65426 +#include <linux/writeback.h>
65427 +#include <linux/mount.h>
65428 +#include <linux/seq_file.h>
65429 +#include <linux/debugfs.h>
65430 +
65431 +/* slab cache for inodes */
65432 +static struct kmem_cache *inode_cache;
65433 +
65434 +static struct dentry *reiser4_debugfs_root = NULL;
65435 +
65436 +/**
65437 + * init_once - constructor for reiser4 inodes
65438 + * @obj: inode to be initialized
65439 + *
65440 + * Initialization function to be called when a new object is allocated by
65441 + * the reiser4 inode cache. It is set on inode cache creation as the slab
65442 + * constructor.
65443 + */
65444 +static void init_once(void *obj)
65445 +{
65446 + struct reiser4_inode_object *info;
65447 +
65448 + info = obj;
65449 +
65450 + /* initialize vfs inode */
65451 + inode_init_once(&info->vfs_inode);
65452 +
65453 + /*
65454 +	 * initialize reiser4 specific part of inode.
65455 + * NOTE-NIKITA add here initializations for locks, list heads,
65456 + * etc. that will be added to our private inode part.
65457 + */
65458 + INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
65459 + init_rwsem(&info->p.conv_sem);
65460 + /* init semaphore which is used during inode loading */
65461 + loading_init_once(&info->p);
65462 + INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
65463 + GFP_ATOMIC);
65464 +#if REISER4_DEBUG
65465 + info->p.nr_jnodes = 0;
65466 +#endif
65467 +}
65468 +
65469 +/**
65470 + * init_inodes - create inode cache
65471 + *
65472 + * Initializes slab cache of inodes. It is part of reiser4 module initialization.
65473 + */
65474 +static int init_inodes(void)
65475 +{
65476 + inode_cache = kmem_cache_create("reiser4_inode",
65477 + sizeof(struct reiser4_inode_object),
65478 + 0,
65479 + SLAB_HWCACHE_ALIGN |
65480 + SLAB_RECLAIM_ACCOUNT, init_once);
65481 + if (inode_cache == NULL)
65482 + return RETERR(-ENOMEM);
65483 + return 0;
65484 +}
65485 +
65486 +/**
65487 + * done_inodes - delete inode cache
65488 + *
65489 + * This is called on reiser4 module unloading or system shutdown.
65490 + */
65491 +static void done_inodes(void)
65492 +{
65493 + destroy_reiser4_cache(&inode_cache);
65494 +}
65495 +
65496 +/**
65497 + * reiser4_alloc_inode - alloc_inode of super operations
65498 + * @super: super block new inode is allocated for
65499 + *
65500 + * Allocates new inode, initializes reiser4 specific part of it.
65501 + */
65502 +static struct inode *reiser4_alloc_inode(struct super_block *super)
65503 +{
65504 + struct reiser4_inode_object *obj;
65505 +
65506 + assert("nikita-1696", super != NULL);
65507 + obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
65508 + if (obj != NULL) {
65509 + reiser4_inode *info;
65510 +
65511 + info = &obj->p;
65512 +
65513 + info->pset = plugin_set_get_empty();
65514 + info->hset = plugin_set_get_empty();
65515 + info->extmask = 0;
65516 + info->locality_id = 0ull;
65517 + info->plugin_mask = 0;
65518 + info->heir_mask = 0;
65519 +#if !REISER4_INO_IS_OID
65520 + info->oid_hi = 0;
65521 +#endif
65522 + reiser4_seal_init(&info->sd_seal, NULL, NULL);
65523 + coord_init_invalid(&info->sd_coord, NULL);
65524 + info->flags = 0;
65525 + spin_lock_init(&info->guard);
65526 + /* this deals with info's loading semaphore */
65527 + loading_alloc(info);
65528 + info->vroot = UBER_TREE_ADDR;
65529 + return &obj->vfs_inode;
65530 + } else
65531 + return NULL;
65532 +}
65533 +
65534 +/**
65535 + * reiser4_destroy_inode - destroy_inode of super operations
65536 + * @inode: inode being destroyed
65537 + *
65538 + * Puts reiser4 specific portion of inode, frees memory occupied by inode.
65539 + */
65540 +static void reiser4_destroy_inode(struct inode *inode)
65541 +{
65542 + reiser4_inode *info;
65543 +
65544 + info = reiser4_inode_data(inode);
65545 +
65546 + assert("vs-1220", inode_has_no_jnodes(info));
65547 +
65548 + if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
65549 + file_plugin *fplug = inode_file_plugin(inode);
65550 + if (fplug->destroy_inode != NULL)
65551 + fplug->destroy_inode(inode);
65552 + }
65553 + reiser4_dispose_cursors(inode);
65554 + if (info->pset)
65555 + plugin_set_put(info->pset);
65556 + if (info->hset)
65557 + plugin_set_put(info->hset);
65558 +
65559 + /*
65560 +	 * cannot add similar assertion about ->i_list as prune_icache returns
65561 +	 * inodes into the slab with dangling ->list.{next,prev}. This is safe,
65562 +	 * because they are re-initialized in new_inode().
65563 + */
65564 + assert("nikita-2895", list_empty(&inode->i_dentry));
65565 + assert("nikita-2896", hlist_unhashed(&inode->i_hash));
65566 + assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
65567 +
65568 + /* this deals with info's loading semaphore */
65569 + loading_destroy(info);
65570 +
65571 + kmem_cache_free(inode_cache,
65572 + container_of(info, struct reiser4_inode_object, p));
65573 +}
65574 +
65575 +/**
65576 + * reiser4_dirty_inode - dirty_inode of super operations
65577 + * @inode: inode being dirtied
65578 + *
65579 + * Updates stat data.
65580 + */
65581 +static void reiser4_dirty_inode(struct inode *inode)
65582 +{
65583 + int result;
65584 +
65585 + if (!is_in_reiser4_context())
65586 + return;
65587 + assert("", !IS_RDONLY(inode));
65588 + assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
65589 + get_current_context()->grabbed_blocks));
65590 +
65591 + result = reiser4_update_sd(inode);
65592 + if (result)
65593 + warning("", "failed to dirty inode for %llu: %d",
65594 + get_inode_oid(inode), result);
65595 +}
65596 +
65597 +/**
65598 + * reiser4_delete_inode - delete_inode of super operations
65599 + * @inode: inode to delete
65600 + *
65601 + * Calls file plugin's delete_object method to delete object items from
65602 + * filesystem tree and calls clear_inode.
65603 + */
65604 +static void reiser4_delete_inode(struct inode *inode)
65605 +{
65606 + reiser4_context *ctx;
65607 + file_plugin *fplug;
65608 +
65609 + ctx = reiser4_init_context(inode->i_sb);
65610 + if (IS_ERR(ctx)) {
65611 + warning("vs-15", "failed to init context");
65612 + return;
65613 + }
65614 +
65615 + if (is_inode_loaded(inode)) {
65616 + fplug = inode_file_plugin(inode);
65617 + if (fplug != NULL && fplug->delete_object != NULL)
65618 + fplug->delete_object(inode);
65619 + }
65620 +
65621 + truncate_inode_pages(&inode->i_data, 0);
65622 + inode->i_blocks = 0;
65623 + clear_inode(inode);
65624 + reiser4_exit_context(ctx);
65625 +}
65626 +
65627 +/**
65628 + * reiser4_put_super - put_super of super operations
65629 + * @super: super block to free
65630 + *
65631 + * Stops daemons and releases resources; in short, unmounts.
65632 + */
65633 +static void reiser4_put_super(struct super_block *super)
65634 +{
65635 + reiser4_super_info_data *sbinfo;
65636 + reiser4_context *ctx;
65637 +
65638 + sbinfo = get_super_private(super);
65639 + assert("vs-1699", sbinfo);
65640 +
65641 + debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
65642 + debugfs_remove(sbinfo->tmgr.debugfs_id_count);
65643 + debugfs_remove(sbinfo->debugfs_root);
65644 +
65645 + ctx = reiser4_init_context(super);
65646 + if (IS_ERR(ctx)) {
65647 + warning("vs-17", "failed to init context");
65648 + return;
65649 + }
65650 +
65651 + /* have disk format plugin to free its resources */
65652 + if (get_super_private(super)->df_plug->release)
65653 + get_super_private(super)->df_plug->release(super);
65654 +
65655 + reiser4_done_formatted_fake(super);
65656 +
65657 + /* stop daemons: ktxnmgr and entd */
65658 + reiser4_done_entd(super);
65659 + reiser4_done_ktxnmgrd(super);
65660 + reiser4_done_txnmgr(&sbinfo->tmgr);
65661 +
65662 + reiser4_done_fs_info(super);
65663 + reiser4_exit_context(ctx);
65664 +}
65665 +
65666 +/**
65667 + * reiser4_write_super - write_super of super operations
65668 + * @super: super block to write
65669 + *
65670 + * Captures znode associated with super block, commits all transactions.
65671 + */
65672 +static void reiser4_write_super(struct super_block *super)
65673 +{
65674 + int ret;
65675 + reiser4_context *ctx;
65676 +
65677 + assert("vs-1700", !rofs_super(super));
65678 +
65679 + ctx = reiser4_init_context(super);
65680 + if (IS_ERR(ctx)) {
65681 + warning("vs-16", "failed to init context");
65682 + return;
65683 + }
65684 +
65685 + ret = reiser4_capture_super_block(super);
65686 + if (ret != 0)
65687 + warning("vs-1701",
65688 + "reiser4_capture_super_block failed in write_super: %d",
65689 + ret);
65690 + ret = txnmgr_force_commit_all(super, 0);
65691 + if (ret != 0)
65692 + warning("jmacd-77113",
65693 + "txn_force failed in write_super: %d", ret);
65694 +
65695 + super->s_dirt = 0;
65696 +
65697 + reiser4_exit_context(ctx);
65698 +}
65699 +
65700 +/**
65701 + * reiser4_statfs - statfs of super operations
65702 + * @dentry: dentry of the queried file system
65703 + * @statfs: buffer to fill with statistics
65704 + *
65705 + * Returns information about filesystem.
65706 + */
65707 +static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
65708 +{
65709 + sector_t total;
65710 + sector_t reserved;
65711 + sector_t free;
65712 + sector_t forroot;
65713 + sector_t deleted;
65714 + reiser4_context *ctx;
65715 + struct super_block *super = dentry->d_sb;
65716 +
65717 + assert("nikita-408", super != NULL);
65718 + assert("nikita-409", statfs != NULL);
65719 +
65720 + ctx = reiser4_init_context(super);
65721 + if (IS_ERR(ctx))
65722 + return PTR_ERR(ctx);
65723 +
65724 + statfs->f_type = reiser4_statfs_type(super);
65725 + statfs->f_bsize = super->s_blocksize;
65726 +
65727 + /*
65728 + * 5% of total block space is reserved. This is needed for flush and
65729 + * for truncates (so that we are able to perform truncate/unlink even
65730 + * on the otherwise completely full file system). If this reservation
65731 + * is hidden from statfs(2), users will mistakenly guess that they
65732 + * have enough free space to complete some operation, which is
65733 + * frustrating.
65734 + *
65735 + * Another possible solution is to subtract ->blocks_reserved from
65736 + * ->f_bfree, but changing available space seems less intrusive than
65737 +	 * letting the user see 5% of disk space used right after
65738 + * mkfs.
65739 + */
65740 + total = reiser4_block_count(super);
65741 + reserved = get_super_private(super)->blocks_reserved;
65742 + deleted = txnmgr_count_deleted_blocks();
65743 + free = reiser4_free_blocks(super) + deleted;
65744 + forroot = reiser4_reserved_blocks(super, 0, 0);
65745 +
65746 + /*
65747 + * These counters may be in inconsistent state because we take the
65748 + * values without keeping any global spinlock. Here we do a sanity
65749 + * check that free block counter does not exceed the number of all
65750 + * blocks.
65751 + */
65752 + if (free > total)
65753 + free = total;
65754 + statfs->f_blocks = total - reserved;
65755 + /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
65756 + if (free > reserved)
65757 + free -= reserved;
65758 + else
65759 + free = 0;
65760 + statfs->f_bfree = free;
65761 +
65762 + if (free > forroot)
65763 + free -= forroot;
65764 + else
65765 + free = 0;
65766 + statfs->f_bavail = free;
65767 +
65768 + statfs->f_files = 0;
65769 + statfs->f_ffree = 0;
65770 +
65771 + /* maximal acceptable name length depends on directory plugin. */
65772 + assert("nikita-3351", super->s_root->d_inode != NULL);
65773 + statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
65774 + reiser4_exit_context(ctx);
65775 + return 0;
65776 +}
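/*
 * Worked example (editorial, not part of the original patch) of the statfs
 * arithmetic above: with total = 1000, reserved = 50 (the 5% reserve),
 * deleted = 0, free = 400 and forroot = 0, statfs(2) reports
 *
 *	f_blocks = 1000 - 50 = 950
 *	f_bfree  =  400 - 50 = 350
 *	f_bavail =  350 -  0 = 350
 *
 * i.e. the reserve is hidden from both the total and the free counts.
 */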
65777 +
65778 +/**
65779 + * reiser4_clear_inode - clear_inode of super operation
65780 + * @inode: inode about to destroy
65781 + *
65782 + * Does sanity checks: inode being destroyed should have all jnodes detached.
65783 + */
65784 +static void reiser4_clear_inode(struct inode *inode)
65785 +{
65786 +#if REISER4_DEBUG
65787 + reiser4_inode *r4_inode;
65788 +
65789 + r4_inode = reiser4_inode_data(inode);
65790 + if (!inode_has_no_jnodes(r4_inode))
65791 + warning("vs-1732", "reiser4 inode has %ld jnodes\n",
65792 + r4_inode->nr_jnodes);
65793 +#endif
65794 +}
65795 +
65796 +/**
65797 + * reiser4_writeback_inodes - writeback_inodes of super operations
65798 + * @super:
65799 + * @wb:
65800 + * @wbc:
65801 + *
65802 + * This method is called by background and non-background writeback. Reiser4's
65803 + * implementation uses generic_writeback_sb_inodes to call reiser4_writepages
65804 + * for each of dirty inodes. reiser4_writepages handles pages dirtied via shared
65805 + * mapping - dirty pages get into atoms. Writeout is called to flush some atoms.
65806 + */
65807 +static int reiser4_writeback_inodes(struct super_block *super,
65808 + struct bdi_writeback *wb,
65809 + struct writeback_control *wbc)
65810 +{
65811 + int ret;
65812 + long to_write;
65813 + reiser4_context *ctx;
65814 +
65815 + if (wbc->for_kupdate)
65816 + /* reiser4 has its own means of periodical write-out */
65817 + goto skip;
65818 + assert("vs-49", wbc->older_than_this == NULL);
65819 +
65820 + spin_unlock(&inode_lock);
65821 + ctx = reiser4_init_context(super);
65822 + if (IS_ERR(ctx)) {
65823 + warning("vs-13", "failed to init context");
65824 + spin_lock(&inode_lock);
65825 + goto skip;
65826 + }
65827 + to_write = wbc->nr_to_write;
65828 + /*
65829 + * call reiser4_writepages for each of dirty inodes to turn
65830 + * dirty pages into transactions if they were not yet.
65831 + */
65832 + spin_lock(&inode_lock);
65833 + ret = generic_writeback_sb_inodes(super, wb, wbc);
65834 + spin_unlock(&inode_lock);
65835 +
65836 + wbc->nr_to_write = to_write;
65837 +
65838 + /* flush goes here */
65839 + reiser4_writeout(super, wbc);
65840 +
65841 + /* avoid recursive calls to ->writeback_inodes */
65842 + context_set_commit_async(ctx);
65843 + reiser4_exit_context(ctx);
65844 + spin_lock(&inode_lock);
65845 +
65846 + return wbc->nr_to_write <= 0 ? 1 : ret;
65847 + skip:
65848 + writeback_skip_sb_inodes(super, wb);
65849 + return 0;
65850 +}
65851 +
65852 +/**
65853 + * reiser4_show_options - show_options of super operations
65854 + * @m: file where to write information
65855 + * @mnt: mount structure
65856 + *
65857 + * Makes reiser4 mount options visible in /proc/mounts.
65858 + */
65859 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
65860 +{
65861 + struct super_block *super;
65862 + reiser4_super_info_data *sbinfo;
65863 +
65864 + super = mnt->mnt_sb;
65865 + sbinfo = get_super_private(super);
65866 +
65867 + seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
65868 + seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
65869 + seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
65870 + seq_printf(m, ",atom_max_flushers=0x%x",
65871 + sbinfo->tmgr.atom_max_flushers);
65872 + seq_printf(m, ",cbk_cache_slots=0x%x",
65873 + sbinfo->tree.cbk_cache.nr_slots);
65874 +
65875 + return 0;
65876 +}
65877 +
65878 +struct super_operations reiser4_super_operations = {
65879 + .alloc_inode = reiser4_alloc_inode,
65880 + .destroy_inode = reiser4_destroy_inode,
65881 + .dirty_inode = reiser4_dirty_inode,
65882 + .delete_inode = reiser4_delete_inode,
65883 + .put_super = reiser4_put_super,
65884 + .write_super = reiser4_write_super,
65885 + .statfs = reiser4_statfs,
65886 + .clear_inode = reiser4_clear_inode,
65887 + .writeback_inodes = reiser4_writeback_inodes,
65888 + .show_options = reiser4_show_options
65889 +};
65890 +
65891 +/**
65892 + * fill_super - initialize super block on mount
65893 + * @super: super block to fill
65894 + * @data: reiser4 specific mount option
65895 + * @silent:
65896 + *
65897 + * This is to be called by reiser4_get_sb. Mounts the filesystem.
65898 + */
65899 +static int fill_super(struct super_block *super, void *data, int silent)
65900 +{
65901 + reiser4_context ctx;
65902 + int result;
65903 + reiser4_super_info_data *sbinfo;
65904 +
65905 + assert("zam-989", super != NULL);
65906 +
65907 + super->s_op = NULL;
65908 + init_stack_context(&ctx, super);
65909 +
65910 + /* allocate reiser4 specific super block */
65911 + if ((result = reiser4_init_fs_info(super)) != 0)
65912 + goto failed_init_sinfo;
65913 +
65914 + sbinfo = get_super_private(super);
65915 + /* initialize various reiser4 parameters, parse mount options */
65916 + if ((result = reiser4_init_super_data(super, data)) != 0)
65917 + goto failed_init_super_data;
65918 +
65919 + /* read reiser4 master super block, initialize disk format plugin */
65920 + if ((result = reiser4_init_read_super(super, silent)) != 0)
65921 + goto failed_init_read_super;
65922 +
65923 + /* initialize transaction manager */
65924 + reiser4_init_txnmgr(&sbinfo->tmgr);
65925 +
65926 + /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */
65927 + if ((result = reiser4_init_ktxnmgrd(super)) != 0)
65928 + goto failed_init_ktxnmgrd;
65929 +
65930 + /* initialize entd context and start kernel thread entd */
65931 + if ((result = reiser4_init_entd(super)) != 0)
65932 + goto failed_init_entd;
65933 +
65934 + /* initialize address spaces for formatted nodes and bitmaps */
65935 + if ((result = reiser4_init_formatted_fake(super)) != 0)
65936 + goto failed_init_formatted_fake;
65937 +
65938 + /* initialize disk format plugin */
65939 + if ((result = get_super_private(super)->df_plug->init_format(super,
65940 + data)) != 0)
65941 + goto failed_init_disk_format;
65942 +
65943 + /*
65944 + * There are some 'committed' versions of reiser4 super block counters,
65945 + * which correspond to reiser4 on-disk state. These counters are
65946 + * initialized here
65947 + */
65948 + sbinfo->blocks_free_committed = sbinfo->blocks_free;
65949 + sbinfo->nr_files_committed = oids_used(super);
65950 +
65951 + /* get inode of root directory */
65952 + if ((result = reiser4_init_root_inode(super)) != 0)
65953 + goto failed_init_root_inode;
65954 +
65955 + if ((result = get_super_private(super)->df_plug->version_update(super)) != 0)
65956 + goto failed_update_format_version;
65957 +
65958 + process_safelinks(super);
65959 + reiser4_exit_context(&ctx);
65960 +
65961 + sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
65962 + reiser4_debugfs_root);
65963 + if (sbinfo->debugfs_root) {
65964 + sbinfo->tmgr.debugfs_atom_count =
65965 + debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
65966 + sbinfo->debugfs_root,
65967 + &sbinfo->tmgr.atom_count);
65968 + sbinfo->tmgr.debugfs_id_count =
65969 + debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
65970 + sbinfo->debugfs_root,
65971 + &sbinfo->tmgr.id_count);
65972 + }
65973 + return 0;
65974 +
65975 + failed_update_format_version:
65976 + failed_init_root_inode:
65977 + if (sbinfo->df_plug->release)
65978 + sbinfo->df_plug->release(super);
65979 + failed_init_disk_format:
65980 + reiser4_done_formatted_fake(super);
65981 + failed_init_formatted_fake:
65982 + reiser4_done_entd(super);
65983 + failed_init_entd:
65984 + reiser4_done_ktxnmgrd(super);
65985 + failed_init_ktxnmgrd:
65986 + reiser4_done_txnmgr(&sbinfo->tmgr);
65987 + failed_init_read_super:
65988 + failed_init_super_data:
65989 + reiser4_done_fs_info(super);
65990 + failed_init_sinfo:
65991 + reiser4_exit_context(&ctx);
65992 + return result;
65993 +}
65994 +
65995 +/**
65996 + * reiser4_get_sb - get_sb of file_system_type operations
65997 + * @fs_type:
65998 + * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
65999 + * @dev_name: block device file name
66000 + * @data: specific mount options
66001 + *
66002 + * Reiser4 mount entry.
66003 + */
66004 +static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
66005 + const char *dev_name, void *data, struct vfsmount *mnt)
66006 +{
66007 + return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
66008 +}
66009 +
66010 +/* structure describing the reiser4 filesystem implementation */
66011 +static struct file_system_type reiser4_fs_type = {
66012 + .owner = THIS_MODULE,
66013 + .name = "reiser4",
66014 + .fs_flags = FS_REQUIRES_DEV,
66015 + .get_sb = reiser4_get_sb,
66016 + .kill_sb = kill_block_super,
66017 + .next = NULL
66018 +};
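/*
 * Illustrative usage (editorial, not part of the original patch): once the
 * module is loaded and the type registered, the filesystem is created and
 * mounted like any other block-device filesystem, e.g.:
 *
 *	# mkfs.reiser4 /dev/sdb1	(mkfs.reiser4 ships with reiser4progs)
 *	# mount -t reiser4 /dev/sdb1 /mnt
 *
 * /dev/sdb1 and /mnt are placeholder names.
 */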
66019 +
66020 +void destroy_reiser4_cache(struct kmem_cache **cachep)
66021 +{
66022 + BUG_ON(*cachep == NULL);
66023 + kmem_cache_destroy(*cachep);
66024 + *cachep = NULL;
66025 +}
66026 +
66027 +/**
66028 + * init_reiser4 - reiser4 initialization entry point
66029 + *
66030 + * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
66031 + * on kernel initialization or during reiser4 module load.
66032 + */
66033 +static int __init init_reiser4(void)
66034 +{
66035 + int result;
66036 +
66037 + printk(KERN_INFO
66038 + "Loading Reiser4. "
66039 + "See www.namesys.com for a description of Reiser4.\n");
66040 +
66041 + /* initialize slab cache of inodes */
66042 + if ((result = init_inodes()) != 0)
66043 + goto failed_inode_cache;
66044 +
66045 + /* initialize cache of znodes */
66046 + if ((result = init_znodes()) != 0)
66047 + goto failed_init_znodes;
66048 +
66049 + /* initialize all plugins */
66050 + if ((result = init_plugins()) != 0)
66051 + goto failed_init_plugins;
66052 +
66053 + /* initialize cache of plugin_set-s and plugin_set's hash table */
66054 + if ((result = init_plugin_set()) != 0)
66055 + goto failed_init_plugin_set;
66056 +
66057 + /* initialize caches of txn_atom-s and txn_handle-s */
66058 + if ((result = init_txnmgr_static()) != 0)
66059 + goto failed_init_txnmgr_static;
66060 +
66061 + /* initialize cache of jnodes */
66062 + if ((result = init_jnodes()) != 0)
66063 + goto failed_init_jnodes;
66064 +
66065 + /* initialize cache of flush queues */
66066 + if ((result = reiser4_init_fqs()) != 0)
66067 + goto failed_init_fqs;
66068 +
66069 + /* initialize cache of structures attached to dentry->d_fsdata */
66070 + if ((result = reiser4_init_dentry_fsdata()) != 0)
66071 + goto failed_init_dentry_fsdata;
66072 +
66073 + /* initialize cache of structures attached to file->private_data */
66074 + if ((result = reiser4_init_file_fsdata()) != 0)
66075 + goto failed_init_file_fsdata;
66076 +
66077 + /*
66078 + * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
66079 + * more details
66080 + */
66081 + if ((result = reiser4_init_d_cursor()) != 0)
66082 + goto failed_init_d_cursor;
66083 +
66084 + if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
66085 + reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
66086 + return 0;
66087 + }
66088 +
66089 + reiser4_done_d_cursor();
66090 + failed_init_d_cursor:
66091 + reiser4_done_file_fsdata();
66092 + failed_init_file_fsdata:
66093 + reiser4_done_dentry_fsdata();
66094 + failed_init_dentry_fsdata:
66095 + reiser4_done_fqs();
66096 + failed_init_fqs:
66097 + done_jnodes();
66098 + failed_init_jnodes:
66099 + done_txnmgr_static();
66100 + failed_init_txnmgr_static:
66101 + done_plugin_set();
66102 + failed_init_plugin_set:
66103 + failed_init_plugins:
66104 + done_znodes();
66105 + failed_init_znodes:
66106 + done_inodes();
66107 + failed_inode_cache:
66108 + return result;
66109 +}
66110 +
66111 +/**
66112 + * done_reiser4 - reiser4 exit entry point
66113 + *
66114 + * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown
66115 + * or at module unload.
66116 + */
66117 +static void __exit done_reiser4(void)
66118 +{
66119 + int result;
66120 +
66121 + debugfs_remove(reiser4_debugfs_root);
66122 + result = unregister_filesystem(&reiser4_fs_type);
66123 + BUG_ON(result != 0);
66124 + reiser4_done_d_cursor();
66125 + reiser4_done_file_fsdata();
66126 + reiser4_done_dentry_fsdata();
66127 + reiser4_done_fqs();
66128 + done_jnodes();
66129 + done_txnmgr_static();
66130 + done_plugin_set();
66131 + done_znodes();
66132 + destroy_reiser4_cache(&inode_cache);
66133 +}
66134 +
66135 +module_init(init_reiser4);
66136 +module_exit(done_reiser4);
66137 +
66138 +MODULE_DESCRIPTION("Reiser4 filesystem");
66139 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
66140 +
66141 +MODULE_LICENSE("GPL");
66142 +
66143 +/*
66144 + * Local variables:
66145 + * c-indentation-style: "K&R"
66146 + * mode-name: "LC"
66147 + * c-basic-offset: 8
66148 + * tab-width: 8
66149 + * fill-column: 79
66150 + * End:
66151 + */
66152 diff -urN linux-2.6.33.orig/fs/reiser4/tap.c linux-2.6.33/fs/reiser4/tap.c
66153 --- linux-2.6.33.orig/fs/reiser4/tap.c 1970-01-01 01:00:00.000000000 +0100
66154 +++ linux-2.6.33/fs/reiser4/tap.c 2010-03-04 19:33:22.000000000 +0100
66155 @@ -0,0 +1,376 @@
66156 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66157 + * reiser4/README */
66158 +
66159 +/*
66160 + Tree Access Pointer (tap).
66161 +
66162 +   tap is a data structure combining a coord and a lock handle (mostly). It is
66163 +   useful when one has to scan tree nodes (for example, in readdir or flush),
66164 +   for tap functions allow one to move a tap in either direction, transparently
66165 +   crossing unit/item/node borders.
66166 +
66167 + Tap doesn't provide automatic synchronization of its fields as it is
66168 +   supposed to be a per-thread object.
66169 +*/
66170 +
66171 +#include "forward.h"
66172 +#include "debug.h"
66173 +#include "coord.h"
66174 +#include "tree.h"
66175 +#include "context.h"
66176 +#include "tap.h"
66177 +#include "znode.h"
66178 +#include "tree_walk.h"
66179 +
66180 +#if REISER4_DEBUG
66181 +static int tap_invariant(const tap_t *tap);
66182 +static void tap_check(const tap_t *tap);
66183 +#else
66184 +#define tap_check(tap) noop
66185 +#endif
66186 +
66187 +/** load node tap is pointing to, if not loaded already */
66188 +int reiser4_tap_load(tap_t *tap)
66189 +{
66190 + tap_check(tap);
66191 + if (tap->loaded == 0) {
66192 + int result;
66193 +
66194 + result = zload_ra(tap->coord->node, &tap->ra_info);
66195 + if (result != 0)
66196 + return result;
66197 + coord_clear_iplug(tap->coord);
66198 + }
66199 + ++tap->loaded;
66200 + tap_check(tap);
66201 + return 0;
66202 +}
66203 +
66204 +/** release node tap is pointing to. Dual to tap_load() */
66205 +void reiser4_tap_relse(tap_t *tap)
66206 +{
66207 + tap_check(tap);
66208 + if (tap->loaded > 0) {
66209 + --tap->loaded;
66210 + if (tap->loaded == 0)
66211 + zrelse(tap->coord->node);
66212 + }
66213 + tap_check(tap);
66214 +}
66215 +
66216 +/**
66217 + * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
66218 + * @mode
66219 + */
66220 +void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh,
66221 + znode_lock_mode mode)
66222 +{
66223 + tap->coord = coord;
66224 + tap->lh = lh;
66225 + tap->mode = mode;
66226 + tap->loaded = 0;
66227 + INIT_LIST_HEAD(&tap->linkage);
66228 + reiser4_init_ra_info(&tap->ra_info);
66229 +}
66230 +
66231 +/** add @tap to the per-thread list of all taps */
66232 +void reiser4_tap_monitor(tap_t *tap)
66233 +{
66234 + assert("nikita-2623", tap != NULL);
66235 + tap_check(tap);
66236 + list_add(&tap->linkage, reiser4_taps_list());
66237 + tap_check(tap);
66238 +}
66239 +
66240 +/* duplicate @src into @dst. Copy lock handle. @dst is not initially
66241 + * loaded. */
66242 +void reiser4_tap_copy(tap_t *dst, tap_t *src)
66243 +{
66244 + assert("nikita-3193", src != NULL);
66245 + assert("nikita-3194", dst != NULL);
66246 +
66247 + *dst->coord = *src->coord;
66248 + if (src->lh->node)
66249 + copy_lh(dst->lh, src->lh);
66250 + dst->mode = src->mode;
66251 + dst->loaded = 0;
66252 + INIT_LIST_HEAD(&dst->linkage);
66253 + dst->ra_info = src->ra_info;
66254 +}
66255 +
66256 +/** finish with @tap */
66257 +void reiser4_tap_done(tap_t *tap)
66258 +{
66259 + assert("nikita-2565", tap != NULL);
66260 + tap_check(tap);
66261 + if (tap->loaded > 0)
66262 + zrelse(tap->coord->node);
66263 + done_lh(tap->lh);
66264 + tap->loaded = 0;
66265 + list_del_init(&tap->linkage);
66266 + tap->coord->node = NULL;
66267 +}
66268 +
66269 +/**
66270 + * move @tap to the new node, locked with @target. Load @target, if @tap was
66271 + * already loaded.
66272 + */
66273 +int reiser4_tap_move(tap_t *tap, lock_handle * target)
66274 +{
66275 + int result = 0;
66276 +
66277 + assert("nikita-2567", tap != NULL);
66278 + assert("nikita-2568", target != NULL);
66279 + assert("nikita-2570", target->node != NULL);
66280 + assert("nikita-2569", tap->coord->node == tap->lh->node);
66281 +
66282 + tap_check(tap);
66283 + if (tap->loaded > 0)
66284 + result = zload_ra(target->node, &tap->ra_info);
66285 +
66286 + if (result == 0) {
66287 + if (tap->loaded > 0)
66288 + zrelse(tap->coord->node);
66289 + done_lh(tap->lh);
66290 + copy_lh(tap->lh, target);
66291 + tap->coord->node = target->node;
66292 + coord_clear_iplug(tap->coord);
66293 + }
66294 + tap_check(tap);
66295 + return result;
66296 +}
66297 +
66298 +/**
66299 + * move @tap to @target. Acquire lock on @target, if @tap was already
66300 + * loaded.
66301 + */
66302 +static int tap_to(tap_t *tap, znode * target)
66303 +{
66304 + int result;
66305 +
66306 + assert("nikita-2624", tap != NULL);
66307 + assert("nikita-2625", target != NULL);
66308 +
66309 + tap_check(tap);
66310 + result = 0;
66311 + if (tap->coord->node != target) {
66312 + lock_handle here;
66313 +
66314 + init_lh(&here);
66315 + result = longterm_lock_znode(&here, target,
66316 + tap->mode, ZNODE_LOCK_HIPRI);
66317 + if (result == 0) {
66318 + result = reiser4_tap_move(tap, &here);
66319 + done_lh(&here);
66320 + }
66321 + }
66322 + tap_check(tap);
66323 + return result;
66324 +}
66325 +
66326 +/**
66327 + * move @tap to given @target, loading and locking @target->node if
66328 + * necessary
66329 + */
66330 +int tap_to_coord(tap_t *tap, coord_t *target)
66331 +{
66332 + int result;
66333 +
66334 + tap_check(tap);
66335 + result = tap_to(tap, target->node);
66336 + if (result == 0)
66337 + coord_dup(tap->coord, target);
66338 + tap_check(tap);
66339 + return result;
66340 +}
66341 +
66342 +/** return list of all taps */
66343 +struct list_head *reiser4_taps_list(void)
66344 +{
66345 + return &get_current_context()->taps;
66346 +}
66347 +
66348 +/** helper function for go_{next,prev}_{item,unit,node}() */
66349 +int go_dir_el(tap_t *tap, sideof dir, int units_p)
66350 +{
66351 + coord_t dup;
66352 + coord_t *coord;
66353 + int result;
66354 +
66355 + int (*coord_dir) (coord_t *);
66356 + int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
66357 + void (*coord_init) (coord_t *, const znode *);
66358 + ON_DEBUG(int (*coord_check) (const coord_t *));
66359 +
66360 + assert("nikita-2556", tap != NULL);
66361 + assert("nikita-2557", tap->coord != NULL);
66362 + assert("nikita-2558", tap->lh != NULL);
66363 + assert("nikita-2559", tap->coord->node != NULL);
66364 +
66365 + tap_check(tap);
66366 + if (dir == LEFT_SIDE) {
66367 + coord_dir = units_p ? coord_prev_unit : coord_prev_item;
66368 + get_dir_neighbor = reiser4_get_left_neighbor;
66369 + coord_init = coord_init_last_unit;
66370 + } else {
66371 + coord_dir = units_p ? coord_next_unit : coord_next_item;
66372 + get_dir_neighbor = reiser4_get_right_neighbor;
66373 + coord_init = coord_init_first_unit;
66374 + }
66375 + ON_DEBUG(coord_check =
66376 + units_p ? coord_is_existing_unit : coord_is_existing_item);
66377 + assert("nikita-2560", coord_check(tap->coord));
66378 +
66379 + coord = tap->coord;
66380 + coord_dup(&dup, coord);
66381 + if (coord_dir(&dup) != 0) {
66382 + do {
66383 +			/* move to the neighboring node in direction @dir */
66384 + lock_handle dup;
66385 +
66386 + init_lh(&dup);
66387 + result =
66388 + get_dir_neighbor(&dup, coord->node, (int)tap->mode,
66389 + GN_CAN_USE_UPPER_LEVELS);
66390 + if (result == 0) {
66391 + result = reiser4_tap_move(tap, &dup);
66392 + if (result == 0)
66393 + coord_init(tap->coord, dup.node);
66394 + done_lh(&dup);
66395 + }
66396 + /* skip empty nodes */
66397 + } while ((result == 0) && node_is_empty(coord->node));
66398 + } else {
66399 + result = 0;
66400 + coord_dup(coord, &dup);
66401 + }
66402 + assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
66403 + tap_check(tap);
66404 + return result;
66405 +}
66406 +
66407 +/**
66408 + * move @tap to the next unit, transparently crossing item and node
66409 + * boundaries
66410 + */
66411 +int go_next_unit(tap_t *tap)
66412 +{
66413 + return go_dir_el(tap, RIGHT_SIDE, 1);
66414 +}
66415 +
66416 +/**
66417 + * move @tap to the previous unit, transparently crossing item and node
66418 + * boundaries
66419 + */
66420 +int go_prev_unit(tap_t *tap)
66421 +{
66422 + return go_dir_el(tap, LEFT_SIDE, 1);
66423 +}
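/*
 * Illustrative sketch (editorial, not part of the original patch): a typical
 * tap life cycle when stepping one unit rightward from an existing coord/lock
 * pair. example_step_right() is a hypothetical caller.
 */
static int example_step_right(coord_t *coord, lock_handle *lh)
{
	tap_t tap;
	int result;

	reiser4_tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
	result = reiser4_tap_load(&tap);
	if (result == 0) {
		/* crosses item and node boundaries transparently */
		result = go_next_unit(&tap);
		reiser4_tap_relse(&tap);
	}
	reiser4_tap_done(&tap);
	return result;
}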
66424 +
66425 +/**
66426 + * @shift times apply @actor to the @tap. This is used to move @tap by
66427 + * @shift units (or items, or nodes) in either direction.
66428 + */
66429 +static int rewind_to(tap_t *tap, go_actor_t actor, int shift)
66430 +{
66431 + int result;
66432 +
66433 + assert("nikita-2555", shift >= 0);
66434 + assert("nikita-2562", tap->coord->node == tap->lh->node);
66435 +
66436 + tap_check(tap);
66437 + result = reiser4_tap_load(tap);
66438 + if (result != 0)
66439 + return result;
66440 +
66441 + for (; shift > 0; --shift) {
66442 + result = actor(tap);
66443 + assert("nikita-2563", tap->coord->node == tap->lh->node);
66444 + if (result != 0)
66445 + break;
66446 + }
66447 + reiser4_tap_relse(tap);
66448 + tap_check(tap);
66449 + return result;
66450 +}
66451 +
66452 +/** move @tap @shift units rightward */
66453 +int rewind_right(tap_t *tap, int shift)
66454 +{
66455 + return rewind_to(tap, go_next_unit, shift);
66456 +}
66457 +
66458 +/** move @tap @shift units leftward */
66459 +int rewind_left(tap_t *tap, int shift)
66460 +{
66461 + return rewind_to(tap, go_prev_unit, shift);
66462 +}
66463 +
66464 +#if REISER4_DEBUG
66465 +/** debugging function: print @tap content in human readable form */
66466 +static void print_tap(const char *prefix, const tap_t *tap)
66467 +{
66468 + if (tap == NULL) {
66469 + printk("%s: null tap\n", prefix);
66470 + return;
66471 + }
66472 + printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
66473 +	       tap->loaded, !(&tap->linkage == tap->linkage.next &&
66474 +			      &tap->linkage == tap->linkage.prev),
66475 + tap->lh->node,
66476 + lock_mode_name(tap->mode));
66477 + print_coord("\tcoord", tap->coord, 0);
66478 +}
66479 +
66480 +/** check [tap-sane] invariant */
66481 +static int tap_invariant(const tap_t *tap)
66482 +{
66483 + /* [tap-sane] invariant */
66484 +
66485 + if (tap == NULL)
66486 + return 1;
66487 + /* tap->mode is one of
66488 + *
66489 + * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
66490 + */
66491 + if (tap->mode != ZNODE_NO_LOCK &&
66492 + tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
66493 + return 2;
66494 + /* tap->coord != NULL, and */
66495 + if (tap->coord == NULL)
66496 + return 3;
66497 + /* tap->lh != NULL, and */
66498 + if (tap->lh == NULL)
66499 + return 4;
66500 + /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
66501 + if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
66502 + return 5;
66503 + /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
66504 + if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
66505 + return 6;
66506 + return 0;
66507 +}
66508 +
66509 +/** debugging function: check internal @tap consistency */
66510 +static void tap_check(const tap_t *tap)
66511 +{
66512 + int result;
66513 +
66514 + result = tap_invariant(tap);
66515 + if (result != 0) {
66516 + print_tap("broken", tap);
66517 + reiser4_panic("nikita-2831", "tap broken: %i\n", result);
66518 + }
66519 +}
66520 +#endif
66521 +
66522 +/* Make Linus happy.
66523 + Local variables:
66524 + c-indentation-style: "K&R"
66525 + mode-name: "LC"
66526 + c-basic-offset: 8
66527 + tab-width: 8
66528 + fill-column: 120
66529 + scroll-step: 1
66530 + End:
66531 +*/
66532 diff -urN linux-2.6.33.orig/fs/reiser4/tap.h linux-2.6.33/fs/reiser4/tap.h
66533 --- linux-2.6.33.orig/fs/reiser4/tap.h 1970-01-01 01:00:00.000000000 +0100
66534 +++ linux-2.6.33/fs/reiser4/tap.h 2010-03-04 19:33:22.000000000 +0100
66535 @@ -0,0 +1,70 @@
66536 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
66537 +
66538 +/* Tree Access Pointers. See tap.c for more details. */
66539 +
66540 +#if !defined(__REISER4_TAP_H__)
66541 +#define __REISER4_TAP_H__
66542 +
66543 +#include "forward.h"
66544 +#include "readahead.h"
66545 +
66546 +/**
66547 + tree_access_pointer aka tap. Data structure combining coord_t and lock
66548 + handle.
66549 + Invariants involving this data-type, see doc/lock-ordering for details:
66550 +
66551 + [tap-sane]
66552 + */
66553 +struct tree_access_pointer {
66554 + /* coord tap is at */
66555 + coord_t *coord;
66556 + /* lock handle on ->coord->node */
66557 + lock_handle *lh;
66558 + /* mode of lock acquired by this tap */
66559 + znode_lock_mode mode;
66560 + /* incremented by reiser4_tap_load().
66561 + Decremented by reiser4_tap_relse(). */
66562 + int loaded;
66563 + /* list of taps */
66564 + struct list_head linkage;
66565 + /* read-ahead hint */
66566 + ra_info_t ra_info;
66567 +};
66568 +
66569 +typedef int (*go_actor_t) (tap_t *tap);
66570 +
66571 +extern int reiser4_tap_load(tap_t *tap);
66572 +extern void reiser4_tap_relse(tap_t *tap);
66573 +extern void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh,
66574 + znode_lock_mode mode);
66575 +extern void reiser4_tap_monitor(tap_t *tap);
66576 +extern void reiser4_tap_copy(tap_t *dst, tap_t *src);
66577 +extern void reiser4_tap_done(tap_t *tap);
66578 +extern int reiser4_tap_move(tap_t *tap, lock_handle * target);
66579 +extern int tap_to_coord(tap_t *tap, coord_t *target);
66580 +
66581 +extern int go_dir_el(tap_t *tap, sideof dir, int units_p);
66582 +extern int go_next_unit(tap_t *tap);
66583 +extern int go_prev_unit(tap_t *tap);
66584 +extern int rewind_right(tap_t *tap, int shift);
66585 +extern int rewind_left(tap_t *tap, int shift);
66586 +
66587 +extern struct list_head *reiser4_taps_list(void);
66588 +
66589 +#define for_all_taps(tap) \
66590 + for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
66591 + reiser4_taps_list() != &tap->linkage; \
66592 + tap = list_entry(tap->linkage.next, tap_t, linkage))
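+
+/*
+ * Editorial usage sketch, not part of the original patch: a typical tap
+ * life cycle, assuming @coord and @lh were prepared by the caller and
+ * with error handling abbreviated. go_next_unit() moves the tap one unit
+ * to the right.
+ *
+ *	tap_t tap;
+ *
+ *	reiser4_tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
+ *	result = reiser4_tap_load(&tap);
+ *	if (result == 0) {
+ *		result = go_next_unit(&tap);
+ *		reiser4_tap_relse(&tap);
+ *	}
+ *	reiser4_tap_done(&tap);
+ */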
66593 +
66594 +/* __REISER4_TAP_H__ */
66595 +#endif
66596 +/* Make Linus happy.
66597 + Local variables:
66598 + c-indentation-style: "K&R"
66599 + mode-name: "LC"
66600 + c-basic-offset: 8
66601 + tab-width: 8
66602 + fill-column: 120
66603 + scroll-step: 1
66604 + End:
66605 +*/
66606 diff -urN linux-2.6.33.orig/fs/reiser4/tree.c linux-2.6.33/fs/reiser4/tree.c
66607 --- linux-2.6.33.orig/fs/reiser4/tree.c 1970-01-01 01:00:00.000000000 +0100
66608 +++ linux-2.6.33/fs/reiser4/tree.c 2010-03-04 19:33:22.000000000 +0100
66609 @@ -0,0 +1,1878 @@
66610 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66611 + * reiser4/README */
66612 +
66613 +/*
66614 + * KEYS IN A TREE.
66615 + *
66616 + * The tree consists of nodes located on the disk. Node in the tree is either
66617 + * formatted or unformatted. Formatted node is one that has structure
66618 + * understood by the tree balancing and traversal code. Formatted nodes are
66619 + * further classified into leaf and internal nodes. Latter distinctions is
66620 + * (almost) of only historical importance: general structure of leaves and
66621 + * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
66622 + * that are part of bodies of ordinary files and attributes.
66623 + *
66624 + * Each node in the tree spans some interval in the key space. Key ranges for
66625 + * all nodes in the tree are disjoint. Actually, this only holds in some weak
66626 + * sense, because of the non-unique keys: intersection of key ranges for
66627 + * different nodes is either empty, or consists of exactly one key.
66628 + *
66629 + * A formatted node consists of a sequence of items. Each item spans some
66630 + * interval in the key space. Key ranges for all items in a tree are disjoint,
66631 + * modulo non-unique keys again. Items within a node are ordered by the
66632 + * smallest key in each item.
66633 + *
66634 + * A particular type of item can be further split into units. A unit is a
66635 + * piece of an item that can be cut from it and moved into another item of the
66636 + * same type. Units are used by the balancing code to repack data during balancing.
66637 + *
66638 + * A unit can be further split into smaller entities (for example, an extent unit
66639 + * represents several pages, and it is natural for extent code to operate on
66640 + * particular pages and even bytes within one unit), but this is of no
66641 + * relevance to the generic balancing and lookup code.
66642 + *
66643 + * Although an item is said to "span" a range or interval of keys, it is not
66644 + * necessary that the item contains a piece of data addressable by each and every
66645 + * key in this range. For example, compound directory item, consisting of
66646 + * units corresponding to directory entries and keyed by hashes of file names,
66647 + * looks more like having a "discrete spectrum": only some disjoint keys inside
66648 + * range occupied by this item really address data.
66649 + *
66650 + * Nonetheless, each item always has a well-defined least (minimal) key, which
66651 + * is recorded in the item header, stored in the node this item is in. Also, the item
66652 + * plugin can optionally define method ->max_key_inside() returning maximal
66653 + * key that can _possibly_ be located within this item. This method is used
66654 + * (mainly) to determine when a given piece of data should be merged into an
66655 + * existing item, instead of creating a new one. Because of this, even though
66656 + * ->max_key_inside() can be larger than any key actually located in the item,
66657 + * intervals
66658 + *
66659 + * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
66660 + *
66661 + * are still disjoint for all items within the _same_ node.
66662 + *
66663 + * In memory, a node is represented by a znode. It plays several roles:
66664 + *
66665 + * . something locks are taken on
66666 + *
66667 + * . something tracked by transaction manager (this is going to change)
66668 + *
66669 + * . something used to access node data
66670 + *
66671 + * . something used to maintain tree structure in memory: sibling and
66672 + * parental linkage.
66673 + *
66674 + * . something used to organize nodes into "slums"
66675 + *
66676 + * For more on znodes, see znode.[ch].
66677 + *
66678 + * DELIMITING KEYS
66679 + *
66680 + * To simplify balancing, allow some flexibility in locking and speed up
66681 + * important coord cache optimization, we keep delimiting keys of nodes in
66682 + * memory. Depending on disk format (implemented by appropriate node plugin)
66683 + * node on disk can record both left and right delimiting key, only one of
66684 + * them, or none. Still, our balancing and tree traversal code keeps both
66685 + * delimiting keys for every node that is in memory, stored in the znode. When
66686 + * a node is first brought into memory during tree traversal, its left
66687 + * delimiting key is taken from its parent, and its right delimiting key is
66688 + * either the next key in its parent, or the right delimiting key of the
66689 + * parent if the node is the rightmost child of the parent.
66690 + *
66691 + * Physical consistency of delimiting key is protected by special dk
66692 + * read-write lock. That is, delimiting keys can only be inspected or
66693 + * modified under this lock. But dk lock is only sufficient for fast
66694 + * "pessimistic" check, because to simplify code and to decrease lock
66695 + * contention, balancing (carry) only updates delimiting keys right before
66696 + * unlocking all locked nodes on the given tree level. For example,
66697 + * coord-by-key cache scans LRU list of recently accessed znodes. For each
66698 + * node it first does a fast check under the dk spin lock. If the key looked
66699 + * for is not between the delimiting keys of this node, the next node is
66700 + * inspected and so on. If the key is inside the key range, a long-term lock
66701 + * is taken on the node and the key range is rechecked.
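 * 
 * Editorial sketch, not part of the original patch, of the fast check just
 * described (the helpers are the ones used elsewhere in this file):
 * 
 *	read_lock_dk(tree);
 *	hit = keyle(znode_get_ld_key(node), key) &&
 *	      keylt(key, znode_get_rd_key(node));
 *	read_unlock_dk(tree);
 *	if (hit)
 *		take a long-term lock on the node and recheck;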
66702 + *
66703 + * COORDINATES
66704 + *
66705 + * To find something in the tree, you supply a key, and the key is resolved
66706 + * by coord_by_key() into a coord (coordinate) that is valid as long as the
66707 + * node the coord points to remains locked. As mentioned above trees
66708 + * consist of nodes that consist of items that consist of units. A unit is
66709 + * the smallest and indivisible piece of tree as far as balancing and tree
66710 + * search are concerned. Each node, item, and unit can be addressed by
66711 + * giving its level in the tree and the key occupied by this entity. A node
66712 + * knows what the key ranges are of the items within it, and how to find its
66713 + * items and invoke their item handlers, but it does not know how to access
66714 + * individual units within its items except through the item handlers.
66715 + * coord is a structure containing a pointer to the node, the ordinal number
66716 + * of the item within this node (a sort of item offset), and the ordinal
66717 + * number of the unit within this item.
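 * 
 * Editorial summary, not part of the original patch, of the fields just
 * described (the names are the ones used throughout this file):
 * 
 *	coord.node	the znode the coord points into
 *	coord.item_pos	ordinal number of the item within that node
 *	coord.unit_pos	ordinal number of the unit within that item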
66718 + *
66719 + * TREE LOOKUP
66720 + *
66721 + * There are two types of access to the tree: lookup and modification.
66722 + *
66723 + * Lookup is a search for the key in the tree. Search can look for either
66724 + * exactly the key given to it, or for the largest key that is not greater
66725 + * than the key given to it. This distinction is determined by "bias"
66726 + * parameter of search routine (coord_by_key()). coord_by_key() either
66727 + * returns error (key is not in the tree, or some kind of external error
66728 + * occurred), or successfully resolves key into coord.
66729 + *
66730 + * This resolution is done by traversing tree top-to-bottom from root level
66731 + * to the desired level. On levels above twig level (level one above the
66732 + * leaf level) nodes consist exclusively of internal items. Internal item is
66733 + * nothing more than pointer to the tree node on the child level. On twig
66734 + * level nodes consist of internal items intermixed with extent
66735 + * items. Internal items form the normal search tree structure used by the
66736 + * traversal to descend through the tree.
66737 + *
66738 + * TREE LOOKUP OPTIMIZATIONS
66739 + *
66740 + * The tree lookup described above is expensive even if all nodes traversed
66741 + * are already in memory: for each node a binary search within it has to be
66742 + * performed, and binary searches are CPU-consuming and tend to destroy CPU
66743 + * caches.
66744 + *
66745 + * Several optimizations are used to work around this:
66746 + *
66747 + * . cbk_cache (look-aside cache for tree traversals, see search.c for
66748 + * details)
66749 + *
66750 + * . seals (see seal.[ch])
66751 + *
66752 + * . vroot (see search.c)
66753 + *
66754 + * General search-by-key is layered as follows:
66755 + *
66756 + * [check seal, if any] --ok--> done
66757 + * |
66758 + * failed
66759 + * |
66760 + * V
66761 + * [vroot defined] --no--> node = tree_root
66762 + * | |
66763 + * yes |
66764 + * | |
66765 + * V |
66766 + * node = vroot |
66767 + * | |
66768 + * | |
66769 + * | |
66770 + * V V
66771 + * [check cbk_cache for key] --ok--> done
66772 + * |
66773 + * failed
66774 + * |
66775 + * V
66776 + * [start tree traversal from node]
66777 + *
66778 + */
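+
+/*
+ * Editorial sketch, not part of the original patch: the diagram above
+ * corresponds roughly to the following control flow (the real
+ * implementation is coord_by_key() and its helpers in search.c):
+ *
+ *	if (a seal is supplied and is still valid)
+ *		return the coord remembered in the seal;
+ *	node = (vroot is defined) ? vroot : tree_root;
+ *	if (cbk_cache lookup of the key starting from node succeeds)
+ *		return the cached coord;
+ *	start tree traversal from node;
+ */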
66779 +
66780 +#include "forward.h"
66781 +#include "debug.h"
66782 +#include "dformat.h"
66783 +#include "key.h"
66784 +#include "coord.h"
66785 +#include "plugin/item/static_stat.h"
66786 +#include "plugin/item/item.h"
66787 +#include "plugin/node/node.h"
66788 +#include "plugin/plugin.h"
66789 +#include "txnmgr.h"
66790 +#include "jnode.h"
66791 +#include "znode.h"
66792 +#include "block_alloc.h"
66793 +#include "tree_walk.h"
66794 +#include "carry.h"
66795 +#include "carry_ops.h"
66796 +#include "tap.h"
66797 +#include "tree.h"
66798 +#include "vfs_ops.h"
66799 +#include "page_cache.h"
66800 +#include "super.h"
66801 +#include "reiser4.h"
66802 +#include "inode.h"
66803 +
66804 +#include <linux/fs.h> /* for struct super_block */
66805 +#include <linux/spinlock.h>
66806 +
66807 +/* Disk address (block number) never ever used for any real tree node. This is
66808 + used as the block number of the "uber" znode.
66809 +
66810 + Invalid block addresses are 0 by tradition.
66811 +
66812 +*/
66813 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
66814 +
66815 +#define CUT_TREE_MIN_ITERATIONS 64
66816 +
66817 +static int find_child_by_addr(znode * parent, znode * child, coord_t *result);
66818 +
66819 +/* return node plugin of coord->node */
66820 +node_plugin *node_plugin_by_coord(const coord_t *coord)
66821 +{
66822 + assert("vs-1", coord != NULL);
66823 + assert("vs-2", coord->node != NULL);
66824 +
66825 + return coord->node->nplug;
66826 +}
66827 +
66828 +/* insert item into tree. Fields of @coord are updated so that they can be
66829 + * used by a subsequent insert operation. */
66830 +insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
66831 + * into */ ,
66832 + const reiser4_key * key /* key of new item */ ,
66833 + reiser4_item_data * data /* parameters for item
66834 + * creation */ ,
66835 + coord_t *coord /* resulting insertion coord */ ,
66836 + lock_handle * lh /* resulting lock
66837 + * handle */ ,
66838 + tree_level stop_level /* level where to insert */ ,
66839 + __u32 flags/* insertion flags */)
66840 +{
66841 + int result;
66842 +
66843 + assert("nikita-358", tree != NULL);
66844 + assert("nikita-360", coord != NULL);
66845 +
66846 + result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
66847 + FIND_EXACT, stop_level, stop_level,
66848 + flags | CBK_FOR_INSERT, NULL/*ra_info */);
66849 + switch (result) {
66850 + default:
66851 + break;
66852 + case CBK_COORD_FOUND:
66853 + result = IBK_ALREADY_EXISTS;
66854 + break;
66855 + case CBK_COORD_NOTFOUND:
66856 + assert("nikita-2017", coord->node != NULL);
66857 + result = insert_by_coord(coord, data, key, lh, 0/*flags */);
66858 + break;
66859 + }
66860 + return result;
66861 +}
66862 +
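+/*
+ * Editorial usage sketch, not part of the original patch: a typical
+ * insert_by_key() call inserting a new item at the leaf level with
+ * default flags, @key and @data prepared by the caller.
+ *
+ *	coord_t coord;
+ *	lock_handle lh;
+ *	insert_result result;
+ *
+ *	init_lh(&lh);
+ *	result = insert_by_key(tree, &key, &data, &coord, &lh,
+ *			       LEAF_LEVEL, 0);
+ *	done_lh(&lh);
+ */
+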
66863 +/* insert item by calling carry. Helper function called if short-cut
66864 + insertion failed */
66865 +static insert_result insert_with_carry_by_coord(coord_t *coord,
66866 + /* coord where to insert */
66867 + lock_handle * lh,
66868 + /* lock handle of insertion node */
66869 + reiser4_item_data * data,
66870 + /* parameters of new item */
66871 + const reiser4_key * key,
66872 + /* key of new item */
66873 + carry_opcode cop,
66874 + /* carry operation to perform */
66875 + cop_insert_flag flags
66876 + /* carry flags */ )
66877 +{
66878 + int result;
66879 + carry_pool *pool;
66880 + carry_level *lowest_level;
66881 + carry_insert_data *cdata;
66882 + carry_op *op;
66883 +
66884 + assert("umka-314", coord != NULL);
66885 +
66886 + /* allocate carry_pool and 3 carry_level-s */
66887 + pool =
66888 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66889 + sizeof(*cdata));
66890 + if (IS_ERR(pool))
66891 + return PTR_ERR(pool);
66892 + lowest_level = (carry_level *) (pool + 1);
66893 + init_carry_level(lowest_level, pool);
66894 +
66895 + op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
66896 + if (IS_ERR(op) || (op == NULL)) {
66897 + done_carry_pool(pool);
66898 + return RETERR(op ? PTR_ERR(op) : -EIO);
66899 + }
66900 + cdata = (carry_insert_data *) (lowest_level + 3);
66901 + cdata->coord = coord;
66902 + cdata->data = data;
66903 + cdata->key = key;
66904 + op->u.insert.d = cdata;
66905 + if (flags == 0)
66906 + flags = znode_get_tree(coord->node)->carry.insert_flags;
66907 + op->u.insert.flags = flags;
66908 + op->u.insert.type = COPT_ITEM_DATA;
66909 + op->u.insert.child = NULL;
66910 + if (lh != NULL) {
66911 + assert("nikita-3245", lh->node == coord->node);
66912 + lowest_level->track_type = CARRY_TRACK_CHANGE;
66913 + lowest_level->tracked = lh;
66914 + }
66915 +
66916 + result = reiser4_carry(lowest_level, NULL);
66917 + done_carry_pool(pool);
66918 +
66919 + return result;
66920 +}
66921 +
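+/*
+ * Editorial note, not part of the original patch: the sequence above -
+ * init_carry_pool(), reiser4_post_carry(), filling in the carry_op,
+ * reiser4_carry(), done_carry_pool() - is the generic pattern for queueing
+ * a single carry operation. paste_with_carry(), reiser4_insert_flow(),
+ * cut_node_content() and kill_node_content() below follow the same shape.
+ */
+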
66922 +/* form carry queue to perform paste of @data with @key at @coord, and launch
66923 + its execution by calling carry().
66924 +
66925 + Instruct carry to update @lh if, after balancing, the insertion coord moves
66926 + into a different block.
66927 +
66928 +*/
66929 +static int paste_with_carry(coord_t *coord, /* coord of paste */
66930 + lock_handle * lh, /* lock handle of node
66931 + * where item is
66932 + * pasted */
66933 + reiser4_item_data * data, /* parameters of new
66934 + * item */
66935 + const reiser4_key * key, /* key of new item */
66936 + unsigned flags/* paste flags */)
66937 +{
66938 + int result;
66939 + carry_pool *pool;
66940 + carry_level *lowest_level;
66941 + carry_insert_data *cdata;
66942 + carry_op *op;
66943 +
66944 + assert("umka-315", coord != NULL);
66945 + assert("umka-316", key != NULL);
66946 +
66947 + pool =
66948 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66949 + sizeof(*cdata));
66950 + if (IS_ERR(pool))
66951 + return PTR_ERR(pool);
66952 + lowest_level = (carry_level *) (pool + 1);
66953 + init_carry_level(lowest_level, pool);
66954 +
66955 + op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
66956 + if (IS_ERR(op) || (op == NULL)) {
66957 + done_carry_pool(pool);
66958 + return RETERR(op ? PTR_ERR(op) : -EIO);
66959 + }
66960 + cdata = (carry_insert_data *) (lowest_level + 3);
66961 + cdata->coord = coord;
66962 + cdata->data = data;
66963 + cdata->key = key;
66964 + op->u.paste.d = cdata;
66965 + if (flags == 0)
66966 + flags = znode_get_tree(coord->node)->carry.paste_flags;
66967 + op->u.paste.flags = flags;
66968 + op->u.paste.type = COPT_ITEM_DATA;
66969 + if (lh != NULL) {
66970 + lowest_level->track_type = CARRY_TRACK_CHANGE;
66971 + lowest_level->tracked = lh;
66972 + }
66973 +
66974 + result = reiser4_carry(lowest_level, NULL);
66975 + done_carry_pool(pool);
66976 +
66977 + return result;
66978 +}
66979 +
66980 +/* insert item at the given coord.
66981 +
66982 + First try to skip carry by directly calling ->create_item() method of node
66983 + plugin. If this is impossible (there is not enough free space in the node,
66984 + or the new item would become the leftmost item in the node), call
66985 + insert_with_carry_by_coord() which will do a full carry().
66986 +
66987 +*/
66988 +insert_result insert_by_coord(coord_t *coord /* coord where to
66989 + * insert. coord->node has
66990 + * to be write locked by
66991 + * caller */ ,
66992 + reiser4_item_data * data /* data to be
66993 + * inserted */ ,
66994 + const reiser4_key * key /* key of new item */ ,
66995 + lock_handle * lh /* lock handle of write
66996 + * lock on node */ ,
66997 + __u32 flags/* insertion flags */)
66998 +{
66999 + unsigned item_size;
67000 + int result;
67001 + znode *node;
67002 +
67003 + assert("vs-247", coord != NULL);
67004 + assert("vs-248", data != NULL);
67005 + assert("vs-249", data->length >= 0);
67006 + assert("nikita-1191", znode_is_write_locked(coord->node));
67007 +
67008 + node = coord->node;
67009 + coord_clear_iplug(coord);
67010 + result = zload(node);
67011 + if (result != 0)
67012 + return result;
67013 +
67014 + item_size = space_needed(node, NULL, data, 1);
67015 + if (item_size > znode_free_space(node) &&
67016 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
67017 + && (flags & COPI_DONT_ALLOCATE)) {
67018 + /* we are forced to use free space of coord->node and new item
67019 + does not fit into it.
67020 +
67021 + Currently we get here only when we allocate and copy units
67022 + of extent item from a node to its left neighbor during
67023 + "squalloc"-ing. If @node (this is left neighbor) does not
67024 + have enough free space - we do not want to attempt any
67025 + shifting and allocations because we are in squeezing and
67026 + everything to the left of @node is tightly packed.
67027 + */
67028 + result = -E_NODE_FULL;
67029 + } else if ((item_size <= znode_free_space(node)) &&
67030 + !coord_is_before_leftmost(coord) &&
67031 + (node_plugin_by_node(node)->fast_insert != NULL)
67032 + && node_plugin_by_node(node)->fast_insert(coord)) {
67033 + /* shortcut insertion without carry() overhead.
67034 +
67035 + Only possible if:
67036 +
67037 + - there is enough free space
67038 +
67039 + - insertion is not into the leftmost position in a node
67040 + (otherwise it would require updating of delimiting key in a
67041 + parent)
67042 +
67043 + - node plugin agrees with this
67044 +
67045 + */
67046 + result =
67047 + node_plugin_by_node(node)->create_item(coord, key, data,
67048 + NULL);
67049 + znode_make_dirty(node);
67050 + } else {
67051 + /* otherwise do full-fledged carry(). */
67052 + result =
67053 + insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
67054 + flags);
67055 + }
67056 + zrelse(node);
67057 + return result;
67058 +}
67059 +
67060 +/* @coord is set to leaf level and @data is to be inserted to twig level */
67061 +insert_result
67062 +insert_extent_by_coord(coord_t *coord, /* coord where to insert.
67063 + * coord->node has to be write
67064 + * locked by caller */
67065 + reiser4_item_data *data,/* data to be inserted */
67066 + const reiser4_key *key, /* key of new item */
67067 + lock_handle *lh /* lock handle of write lock
67068 + on node */)
67069 +{
67070 + assert("vs-405", coord != NULL);
67071 + assert("vs-406", data != NULL);
67072 + assert("vs-407", data->length > 0);
67073 + assert("vs-408", znode_is_write_locked(coord->node));
67074 + assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
67075 +
67076 + return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
67077 + 0 /*flags */ );
67078 +}
67079 +
67080 +/* Insert into the item at the given coord.
67081 +
67082 + First try to skip carry by directly calling ->paste() method of item
67083 + plugin. If this is impossible (there is not enough free space in the node,
67084 + or we are pasting into the leftmost position in the node), call
67085 + paste_with_carry() which will do a full carry().
67086 +
67087 +*/
67088 +/* paste_into_item */
67089 +int insert_into_item(coord_t * coord /* coord of pasting */ ,
67090 + lock_handle * lh /* lock handle on node involved */ ,
67091 + const reiser4_key * key /* key of unit being pasted */ ,
67092 + reiser4_item_data * data /* parameters for new unit */ ,
67093 + unsigned flags /* insert/paste flags */ )
67094 +{
67095 + int result;
67096 + int size_change;
67097 + node_plugin *nplug;
67098 + item_plugin *iplug;
67099 +
67100 + assert("umka-317", coord != NULL);
67101 + assert("umka-318", key != NULL);
67102 +
67103 + iplug = item_plugin_by_coord(coord);
67104 + nplug = node_plugin_by_coord(coord);
67105 +
67106 + assert("nikita-1480", iplug == data->iplug);
67107 +
67108 + size_change = space_needed(coord->node, coord, data, 0);
67109 + if (size_change > (int)znode_free_space(coord->node) &&
67110 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
67111 + && (flags & COPI_DONT_ALLOCATE)) {
67112 + /* we are forced to use free space of coord->node and new data
67113 + does not fit into it. */
67114 + return -E_NODE_FULL;
67115 + }
67116 +
67117 + /* shortcut paste without carry() overhead.
67118 +
67119 + Only possible if:
67120 +
67121 + - there is enough free space
67122 +
67123 + - paste is not into the leftmost unit in a node (otherwise
67124 + it would require updating of delimiting key in a parent)
67125 +
67126 + - node plugin agrees with this
67127 +
67128 + - item plugin agrees with us
67129 + */
67130 + if (size_change <= (int)znode_free_space(coord->node) &&
67131 + (coord->item_pos != 0 ||
67132 + coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
67133 + coord->unit_pos != 0 && nplug->fast_paste != NULL &&
67134 + nplug->fast_paste(coord) &&
67135 + iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
67136 + if (size_change > 0)
67137 + nplug->change_item_size(coord, size_change);
67138 + /* NOTE-NIKITA: huh? where @key is used? */
67139 + result = iplug->b.paste(coord, data, NULL);
67140 + if (size_change < 0)
67141 + nplug->change_item_size(coord, size_change);
67142 + znode_make_dirty(coord->node);
67143 + } else
67144 + /* otherwise do full-fledged carry(). */
67145 + result = paste_with_carry(coord, lh, data, key, flags);
67146 + return result;
67147 +}
67148 +
67149 +/* this either appends or truncates item @coord */
67150 +int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
67151 + reiser4_item_data * data /* parameters of resize */ ,
67152 + reiser4_key * key /* key of new unit */ ,
67153 + lock_handle * lh /* lock handle of node
67154 + * being modified */ ,
67155 + cop_insert_flag flags /* carry flags */ )
67156 +{
67157 + int result;
67158 + znode *node;
67159 +
67160 + assert("nikita-362", coord != NULL);
67161 + assert("nikita-363", data != NULL);
67162 + assert("vs-245", data->length != 0);
67163 +
67164 + node = coord->node;
67165 + coord_clear_iplug(coord);
67166 + result = zload(node);
67167 + if (result != 0)
67168 + return result;
67169 +
67170 + if (data->length < 0)
67171 + result = node_plugin_by_coord(coord)->shrink_item(coord,
67172 + -data->length);
67173 + else
67174 + result = insert_into_item(coord, lh, key, data, flags);
67175 +
67176 + zrelse(node);
67177 + return result;
67178 +}
67179 +
67180 +/* insert flow @f */
67181 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
67182 +{
67183 + int result;
67184 + carry_pool *pool;
67185 + carry_level *lowest_level;
67186 + reiser4_item_data *data;
67187 + carry_op *op;
67188 +
67189 + pool =
67190 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67191 + sizeof(*data));
67192 + if (IS_ERR(pool))
67193 + return PTR_ERR(pool);
67194 + lowest_level = (carry_level *) (pool + 1);
67195 + init_carry_level(lowest_level, pool);
67196 +
67197 + op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
67198 + 0 /* operate directly on coord -> node */ );
67199 + if (IS_ERR(op) || (op == NULL)) {
67200 + done_carry_pool(pool);
67201 + return RETERR(op ? PTR_ERR(op) : -EIO);
67202 + }
67203 +
67204 + /* these are permanent during insert_flow */
67205 + data = (reiser4_item_data *) (lowest_level + 3);
67206 + data->user = 1;
67207 + data->iplug = item_plugin_by_id(FORMATTING_ID);
67208 + data->arg = NULL;
67209 + /* data.length and data.data will be set before calling paste or
67210 + insert */
67211 + data->length = 0;
67212 + data->data = NULL;
67213 +
67214 + op->u.insert_flow.flags = 0;
67215 + op->u.insert_flow.insert_point = coord;
67216 + op->u.insert_flow.flow = f;
67217 + op->u.insert_flow.data = data;
67218 + op->u.insert_flow.new_nodes = 0;
67219 +
67220 + lowest_level->track_type = CARRY_TRACK_CHANGE;
67221 + lowest_level->tracked = lh;
67222 +
67223 + result = reiser4_carry(lowest_level, NULL);
67224 + done_carry_pool(pool);
67225 +
67226 + return result;
67227 +}
67228 +
67229 +/* Given a coord in parent node, obtain a znode for the corresponding child */
67230 +znode *child_znode(const coord_t * parent_coord /* coord of pointer to
67231 + * child */ ,
67232 + znode * parent /* parent of child */ ,
67233 + int incore_p /* if !0 only return child if already in
67234 + * memory */ ,
67235 + int setup_dkeys_p /* if !0 update delimiting keys of
67236 + * child */ )
67237 +{
67238 + znode *child;
67239 +
67240 + assert("nikita-1374", parent_coord != NULL);
67241 + assert("nikita-1482", parent != NULL);
67242 +#if REISER4_DEBUG
67243 + if (setup_dkeys_p)
67244 + assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
67245 +#endif
67246 + assert("nikita-2947", znode_is_any_locked(parent));
67247 +
67248 + if (znode_get_level(parent) <= LEAF_LEVEL) {
67249 + /* trying to get child of leaf node */
67250 + warning("nikita-1217", "Child of maize?");
67251 + return ERR_PTR(RETERR(-EIO));
67252 + }
67253 + if (item_is_internal(parent_coord)) {
67254 + reiser4_block_nr addr;
67255 + item_plugin *iplug;
67256 + reiser4_tree *tree;
67257 +
67258 + iplug = item_plugin_by_coord(parent_coord);
67259 + assert("vs-512", iplug->s.internal.down_link);
67260 + iplug->s.internal.down_link(parent_coord, NULL, &addr);
67261 +
67262 + tree = znode_get_tree(parent);
67263 + if (incore_p)
67264 + child = zlook(tree, &addr);
67265 + else
67266 + child =
67267 + zget(tree, &addr, parent,
67268 + znode_get_level(parent) - 1,
67269 + reiser4_ctx_gfp_mask_get());
67270 + if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
67271 + set_child_delimiting_keys(parent, parent_coord, child);
67272 + } else {
67273 + warning("nikita-1483", "Internal item expected");
67274 + child = ERR_PTR(RETERR(-EIO));
67275 + }
67276 + return child;
67277 +}
67278 +
67279 +/* remove znode from transaction */
67280 +static void uncapture_znode(znode * node)
67281 +{
67282 + struct page *page;
67283 +
67284 + assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
67285 +
67286 + if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
67287 + int ret;
67288 +
67289 + /* An already allocated block goes right to the atom's delete set. */
67290 + ret =
67291 + reiser4_dealloc_block(znode_get_block(node), 0,
67292 + BA_DEFER | BA_FORMATTED);
67293 + if (ret)
67294 + warning("zam-942",
67295 + "can't add a block number (%llu) to atom's delete set\n",
67296 + (unsigned long long)(*znode_get_block(node)));
67297 +
67298 + spin_lock_znode(node);
67299 + /* Here we return flush reserved block which was reserved at the
67300 + * moment when this allocated node was marked dirty and still
67301 + * not used by flush in node relocation procedure. */
67302 + if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
67303 + txn_atom *atom;
67304 +
67305 + atom = jnode_get_atom(ZJNODE(node));
67306 + assert("zam-939", atom != NULL);
67307 + spin_unlock_znode(node);
67308 + flush_reserved2grabbed(atom, (__u64) 1);
67309 + spin_unlock_atom(atom);
67310 + } else
67311 + spin_unlock_znode(node);
67312 + } else {
67313 + /* znode has an assigned block which is counted as "fake
67314 + allocated". Return it back to "free blocks". */
67315 + fake_allocated2free((__u64) 1, BA_FORMATTED);
67316 + }
67317 +
67318 + /*
67319 + * uncapture page from transaction. There is a possibility of a race
67320 + * with ->releasepage(): reiser4_releasepage() detaches page from this
67321 + * jnode and we have nothing to uncapture. To avoid this, get
67322 + * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
67323 + * will deal with released page itself.
67324 + */
67325 + spin_lock_znode(node);
67326 + page = znode_page(node);
67327 + if (likely(page != NULL)) {
67328 + /*
67329 + * reiser4_uncapture_page() can only be called when we are sure
67330 + * that znode is pinned in memory, which we are, because
67331 + * forget_znode() is only called from longterm_unlock_znode().
67332 + */
67333 + page_cache_get(page);
67334 + spin_unlock_znode(node);
67335 + lock_page(page);
67336 + reiser4_uncapture_page(page);
67337 + unlock_page(page);
67338 + page_cache_release(page);
67339 + } else {
67340 + txn_atom *atom;
67341 +
67342 + /* handle "flush queued" znodes */
67343 + while (1) {
67344 + atom = jnode_get_atom(ZJNODE(node));
67345 + assert("zam-943", atom != NULL);
67346 +
67347 + if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
67348 + || !atom->nr_running_queues)
67349 + break;
67350 +
67351 + spin_unlock_znode(node);
67352 + reiser4_atom_wait_event(atom);
67353 + spin_lock_znode(node);
67354 + }
67355 +
67356 + reiser4_uncapture_block(ZJNODE(node));
67357 + spin_unlock_atom(atom);
67358 + zput(node);
67359 + }
67360 +}
67361 +
67362 +/* This is called from longterm_unlock_znode() when last lock is released from
67363 + the node that has been removed from the tree. At this point node is removed
67364 + from sibling list and its lock is invalidated. */
67365 +void forget_znode(lock_handle * handle)
67366 +{
67367 + znode *node;
67368 + reiser4_tree *tree;
67369 +
67370 + assert("umka-319", handle != NULL);
67371 +
67372 + node = handle->node;
67373 + tree = znode_get_tree(node);
67374 +
67375 + assert("vs-164", znode_is_write_locked(node));
67376 + assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
67377 + assert_rw_locked(&(node->lock.guard));
67378 +
67379 + /* We assume that this node was detached from its parent before
67380 + * unlocking, it gives no way to reach this node from parent through a
67381 + * down link. The node should have no children and, thereby, can't be
67382 + * reached from them by their parent pointers. The only way to obtain a
67383 + * reference to the node is to use sibling pointers from its left and
67384 + * right neighbors. In the next several lines we remove the node from
67385 + * the sibling list. */
67386 +
67387 + write_lock_tree(tree);
67388 + sibling_list_remove(node);
67389 + znode_remove(node, tree);
67390 + write_unlock_tree(tree);
67391 +
67392 + /* Here we set JNODE_DYING and cancel all pending lock requests. It
67393 + * forces all lock requestor threads to repeat iterations of getting
67394 + * lock on a child, neighbor or parent node. But, those threads can't
67395 + * come to this node again, because this node is no longer a child,
67396 + * neighbor or parent of any other node. This order of znode
67397 + * invalidation does not allow other threads to waste CPU time in a busy
67398 + * loop, trying to lock a dying object. The exception is in the flush
67399 + * code when we take node directly from atom's capture list.*/
67400 + reiser4_invalidate_lock(handle);
67401 + uncapture_znode(node);
67402 +}
67403 +
67404 +/* Check that internal item at @pointer really contains pointer to @child. */
67405 +int check_tree_pointer(const coord_t * pointer /* would-be pointer to
67406 + * @child */ ,
67407 + const znode * child /* child znode */ )
67408 +{
67409 + assert("nikita-1016", pointer != NULL);
67410 + assert("nikita-1017", child != NULL);
67411 + assert("nikita-1018", pointer->node != NULL);
67412 +
67413 + assert("nikita-1325", znode_is_any_locked(pointer->node));
67414 +
67415 + assert("nikita-2985",
67416 + znode_get_level(pointer->node) == znode_get_level(child) + 1);
67417 +
67418 + coord_clear_iplug((coord_t *) pointer);
67419 +
67420 + if (coord_is_existing_unit(pointer)) {
67421 + item_plugin *iplug;
67422 + reiser4_block_nr addr;
67423 +
67424 + if (item_is_internal(pointer)) {
67425 + iplug = item_plugin_by_coord(pointer);
67426 + assert("vs-513", iplug->s.internal.down_link);
67427 + iplug->s.internal.down_link(pointer, NULL, &addr);
67428 + /* check that cached value is correct */
67429 + if (disk_addr_eq(&addr, znode_get_block(child))) {
67430 + return NS_FOUND;
67431 + }
67432 + }
67433 + }
67434 + /* warning ("jmacd-1002", "tree pointer incorrect"); */
67435 + return NS_NOT_FOUND;
67436 +}
67437 +
67438 +/* find coord of pointer to new @child in @parent.
67439 +
67440 + Find the &coord_t in the @parent where the pointer to a given @child
67441 + will be.
67442 +
67443 +*/
67444 +int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
67445 + znode *
67446 + child UNUSED_ARG /* child znode, passed locked */ ,
67447 + znode * left /* left brother of new node */ ,
67448 + coord_t * result /* where result is stored in */ )
67449 +{
67450 + int ret;
67451 +
67452 + assert("nikita-1486", parent != NULL);
67453 + assert("nikita-1487", child != NULL);
67454 + assert("nikita-1488", result != NULL);
67455 +
67456 + ret = find_child_ptr(parent, left, result);
67457 + if (ret != NS_FOUND) {
67458 + warning("nikita-1489", "Cannot find brother position: %i", ret);
67459 + return RETERR(-EIO);
67460 + } else {
67461 + result->between = AFTER_UNIT;
67462 + return RETERR(NS_NOT_FOUND);
67463 + }
67464 +}
67465 +
67466 +/* find coord of pointer to @child in @parent.
67467 +
67468 + Find the &coord_t in the @parent where the pointer to a given @child is.
67469 +
67470 +*/
67471 +int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
67472 + znode * child /* child znode, passed locked */ ,
67473 + coord_t * result /* where result is stored in */ )
67474 +{
67475 + int lookup_res;
67476 + node_plugin *nplug;
67477 + /* left delimiting key of a child */
67478 + reiser4_key ld;
67479 + reiser4_tree *tree;
67480 +
67481 + assert("nikita-934", parent != NULL);
67482 + assert("nikita-935", child != NULL);
67483 + assert("nikita-936", result != NULL);
67484 + assert("zam-356", znode_is_loaded(parent));
67485 +
67486 + coord_init_zero(result);
67487 + result->node = parent;
67488 +
67489 + nplug = parent->nplug;
67490 + assert("nikita-939", nplug != NULL);
67491 +
67492 + tree = znode_get_tree(parent);
67493 + /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
67494 + * not aliased to ->in_parent of some znode. Otherwise,
67495 + * parent_coord_to_coord() below would modify data protected by tree
67496 + * lock. */
67497 + read_lock_tree(tree);
67498 + /* fast path. Try to use cached value. Lock tree to keep
67499 + node->pos_in_parent and pos->*_blocknr consistent. */
67500 + if (child->in_parent.item_pos + 1 != 0) {
67501 + parent_coord_to_coord(&child->in_parent, result);
67502 + if (check_tree_pointer(result, child) == NS_FOUND) {
67503 + read_unlock_tree(tree);
67504 + return NS_FOUND;
67505 + }
67506 +
67507 + child->in_parent.item_pos = (unsigned short)~0;
67508 + }
67509 + read_unlock_tree(tree);
67510 +
67511 + /* if the above failed, find some key from @child. We are looking for
67512 + the least key in the child. */
67513 + read_lock_dk(tree);
67514 + ld = *znode_get_ld_key(child);
67515 + read_unlock_dk(tree);
67516 + /*
67517 + * now, lookup parent with key just found. Note, that left delimiting
67518 + * key doesn't identify node uniquely, because (in extremely rare
67519 + * case) two nodes can have equal left delimiting keys, if one of them
67520 + * is completely filled with directory entries that all happened to be
67521 + * hash collision. But, we check block number in check_tree_pointer()
67522 + * and, so, are safe.
67523 + */
67524 + lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
67525 + /* update cached pos_in_node */
67526 + if (lookup_res == NS_FOUND) {
67527 + write_lock_tree(tree);
67528 + coord_to_parent_coord(result, &child->in_parent);
67529 + write_unlock_tree(tree);
67530 + lookup_res = check_tree_pointer(result, child);
67531 + }
67532 + if (lookup_res == NS_NOT_FOUND)
67533 + lookup_res = find_child_by_addr(parent, child, result);
67534 + return lookup_res;
67535 +}
67536 +
67537 +/* find coord of pointer to @child in @parent by scanning
67538 +
67539 + Find the &coord_t in the @parent where pointer to a given @child
67540 + is in by scanning all internal items in @parent and comparing block
67541 + numbers in them with that of @child.
67542 +
67543 +*/
67544 +static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
67545 + znode * child /* child znode, passed locked */ ,
67546 + coord_t * result /* where result is stored in */ )
67547 +{
67548 + int ret;
67549 +
67550 + assert("nikita-1320", parent != NULL);
67551 + assert("nikita-1321", child != NULL);
67552 + assert("nikita-1322", result != NULL);
67553 +
67554 + ret = NS_NOT_FOUND;
67555 +
67556 + for_all_units(result, parent) {
67557 + if (check_tree_pointer(result, child) == NS_FOUND) {
67558 + write_lock_tree(znode_get_tree(parent));
67559 + coord_to_parent_coord(result, &child->in_parent);
67560 + write_unlock_tree(znode_get_tree(parent));
67561 + ret = NS_FOUND;
67562 + break;
67563 + }
67564 + }
67565 + return ret;
67566 +}
67567 +
67568 +/* true, if @addr is an "unallocated block number", which is just an address
67569 + with the highest bit set. */
67570 +int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
67571 + * check */ )
67572 +{
67573 + assert("nikita-1766", addr != NULL);
67574 + cassert(sizeof(reiser4_block_nr) == 8);
67575 + return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
67576 + REISER4_UNALLOCATED_STATUS_VALUE;
67577 +}
67578 +
67579 +/* returns true if removing bytes in the given key range [from_key, to_key]
67580 + causes removal of the whole item @from */
67581 +static int
67582 +item_removed_completely(coord_t * from, const reiser4_key * from_key,
67583 + const reiser4_key * to_key)
67584 +{
67585 + item_plugin *iplug;
67586 + reiser4_key key_in_item;
67587 +
67588 + assert("umka-325", from != NULL);
67589 + assert("", item_is_extent(from));
67590 +
67591 + /* check first key just in case */
67592 + item_key_by_coord(from, &key_in_item);
67593 + if (keygt(from_key, &key_in_item))
67594 + return 0;
67595 +
67596 + /* check last key */
67597 + iplug = item_plugin_by_coord(from);
67598 + assert("vs-611", iplug && iplug->s.file.append_key);
67599 +
67600 + iplug->s.file.append_key(from, &key_in_item);
67601 + set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
67602 +
67603 + if (keylt(to_key, &key_in_item))
67604 + /* last byte is not removed */
67605 + return 0;
67606 + return 1;
67607 +}
67608 +
67609 +/* helper function for prepare_twig_kill(): @left and @right are formatted
67610 + * neighbors of extent item being completely removed. Load and lock neighbors
67611 + * and store lock handles into @cdata for later use by kill_hook_extent() */
67612 +static int
67613 +prepare_children(znode * left, znode * right, carry_kill_data * kdata)
67614 +{
67615 + int result;
67616 + int left_loaded;
67617 + int right_loaded;
67618 +
67619 + result = 0;
67620 + left_loaded = right_loaded = 0;
67621 +
67622 + if (left != NULL) {
67623 + result = zload(left);
67624 + if (result == 0) {
67625 + left_loaded = 1;
67626 + result = longterm_lock_znode(kdata->left, left,
67627 + ZNODE_READ_LOCK,
67628 + ZNODE_LOCK_LOPRI);
67629 + }
67630 + }
67631 + if (result == 0 && right != NULL) {
67632 + result = zload(right);
67633 + if (result == 0) {
67634 + right_loaded = 1;
67635 + result = longterm_lock_znode(kdata->right, right,
67636 + ZNODE_READ_LOCK,
67637 + ZNODE_LOCK_HIPRI |
67638 + ZNODE_LOCK_NONBLOCK);
67639 + }
67640 + }
67641 + if (result != 0) {
67642 + done_lh(kdata->left);
67643 + done_lh(kdata->right);
67644 + if (left_loaded != 0)
67645 + zrelse(left);
67646 + if (right_loaded != 0)
67647 + zrelse(right);
67648 + }
67649 + return result;
67650 +}
67651 +
67652 +static void done_children(carry_kill_data * kdata)
67653 +{
67654 + if (kdata->left != NULL && kdata->left->node != NULL) {
67655 + zrelse(kdata->left->node);
67656 + done_lh(kdata->left);
67657 + }
67658 + if (kdata->right != NULL && kdata->right->node != NULL) {
67659 + zrelse(kdata->right->node);
67660 + done_lh(kdata->right);
67661 + }
67662 +}
67663 +
67664 + part of cut_node. It is called when cut_node is used to remove or cut part
67665 + of an extent item. When the head of that item is removed - we have to
67666 + update the right delimiting key of the left neighbor of the extent. When
67667 + the item is removed completely - we have to set a sibling link between the
67668 + left and right neighbors of the removed extent. This may return -E_DEADLOCK
67669 + because of trying to get the left neighbor locked. So, the caller should repeat the attempt.
67670 +*/
67671 +/* Audited by: umka (2002.06.16) */
67672 +static int
67673 +prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
67674 +{
67675 + int result;
67676 + reiser4_key key;
67677 + lock_handle left_lh;
67678 + lock_handle right_lh;
67679 + coord_t left_coord;
67680 + coord_t *from;
67681 + znode *left_child;
67682 + znode *right_child;
67683 + reiser4_tree *tree;
67684 + int left_zloaded_here, right_zloaded_here;
67685 +
67686 + from = kdata->params.from;
67687 + assert("umka-326", from != NULL);
67688 + assert("umka-327", kdata->params.to != NULL);
67689 +
67690 + /* for one extent item only yet */
67691 + assert("vs-591", item_is_extent(from));
67692 + assert("vs-592", from->item_pos == kdata->params.to->item_pos);
67693 +
67694 + if ((kdata->params.from_key
67695 + && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
67696 + || from->unit_pos != 0) {
67697 + /* head of item @from is not removed, there is nothing to
67698 + worry about */
67699 + return 0;
67700 + }
67701 +
67702 + result = 0;
67703 + left_zloaded_here = 0;
67704 + right_zloaded_here = 0;
67705 +
67706 + left_child = right_child = NULL;
67707 +
67708 + coord_dup(&left_coord, from);
67709 + init_lh(&left_lh);
67710 + init_lh(&right_lh);
67711 + if (coord_prev_unit(&left_coord)) {
67712 + /* @from is leftmost item in its node */
67713 + if (!locked_left_neighbor) {
67714 + result =
67715 + reiser4_get_left_neighbor(&left_lh, from->node,
67716 + ZNODE_READ_LOCK,
67717 + GN_CAN_USE_UPPER_LEVELS);
67718 + switch (result) {
67719 + case 0:
67720 + break;
67721 + case -E_NO_NEIGHBOR:
67722 + /* there is no formatted node to the left of
67723 + from->node */
67724 + warning("vs-605",
67725 + "extent item has smallest key in "
67726 + "the tree and it is about to be removed");
67727 + return 0;
67728 + case -E_DEADLOCK:
67729 + /* need to restart */
67730 + default:
67731 + return result;
67732 + }
67733 +
67734 + /* we have acquired left neighbor of from->node */
67735 + result = zload(left_lh.node);
67736 + if (result)
67737 + goto done;
67738 +
67739 + locked_left_neighbor = left_lh.node;
67740 + } else {
67741 + /* squalloc_right_twig_cut should have supplied locked
67742 + * left neighbor */
67743 + assert("vs-834",
67744 + znode_is_write_locked(locked_left_neighbor));
67745 + result = zload(locked_left_neighbor);
67746 + if (result)
67747 + return result;
67748 + }
67749 +
67750 + left_zloaded_here = 1;
67751 + coord_init_last_unit(&left_coord, locked_left_neighbor);
67752 + }
67753 +
67754 + if (!item_is_internal(&left_coord)) {
67755 + /* what else but extent can be on twig level */
67756 + assert("vs-606", item_is_extent(&left_coord));
67757 +
67758 + /* there is no left formatted child */
67759 + if (left_zloaded_here)
67760 + zrelse(locked_left_neighbor);
67761 + done_lh(&left_lh);
67762 + return 0;
67763 + }
67764 +
67765 + tree = znode_get_tree(left_coord.node);
67766 + left_child = child_znode(&left_coord, left_coord.node, 1, 0);
67767 +
67768 + if (IS_ERR(left_child)) {
67769 + result = PTR_ERR(left_child);
67770 + goto done;
67771 + }
67772 +
67773 + /* left child is acquired, calculate new right delimiting key for it
67774 + and get right child if it is necessary */
67775 + if (item_removed_completely
67776 + (from, kdata->params.from_key, kdata->params.to_key)) {
67777 + /* try to get right child of removed item */
67778 + coord_t right_coord;
67779 +
67780 + assert("vs-607",
67781 + kdata->params.to->unit_pos ==
67782 + coord_last_unit_pos(kdata->params.to));
67783 + coord_dup(&right_coord, kdata->params.to);
67784 + if (coord_next_unit(&right_coord)) {
67785 + /* @to is rightmost unit in the node */
67786 + result =
67787 + reiser4_get_right_neighbor(&right_lh, from->node,
67788 + ZNODE_READ_LOCK,
67789 + GN_CAN_USE_UPPER_LEVELS);
67790 + switch (result) {
67791 + case 0:
67792 + result = zload(right_lh.node);
67793 + if (result)
67794 + goto done;
67795 +
67796 + right_zloaded_here = 1;
67797 + coord_init_first_unit(&right_coord,
67798 + right_lh.node);
67799 + item_key_by_coord(&right_coord, &key);
67800 + break;
67801 +
67802 + case -E_NO_NEIGHBOR:
67803 + /* there is no formatted node to the right of
67804 + from->node */
67805 + read_lock_dk(tree);
67806 + key = *znode_get_rd_key(from->node);
67807 + read_unlock_dk(tree);
67808 + right_coord.node = NULL;
67809 + result = 0;
67810 + break;
67811 + default:
67812 + /* real error */
67813 + goto done;
67814 + }
67815 + } else {
67816 + /* there is an item to the right of @from - take its key */
67817 + item_key_by_coord(&right_coord, &key);
67818 + }
67819 +
67820 + /* try to get right child of @from */
67821 + if (right_coord.node && /* there is right neighbor of @from */
67822 + item_is_internal(&right_coord)) { /* it is internal item */
67823 + right_child = child_znode(&right_coord,
67824 + right_coord.node, 1, 0);
67825 +
67826 + if (IS_ERR(right_child)) {
67827 + result = PTR_ERR(right_child);
67828 + goto done;
67829 + }
67830 +
67831 + }
67832 + /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
67833 + update of right delimiting key of left_child */
67834 + result = prepare_children(left_child, right_child, kdata);
67835 + } else {
67836 + /* head of item @to is removed. left_child has to get right delimiting key update. Prepare it for that */
67837 + result = prepare_children(left_child, NULL, kdata);
67838 + }
67839 +
67840 + done:
67841 + if (right_child)
67842 + zput(right_child);
67843 + if (right_zloaded_here)
67844 + zrelse(right_lh.node);
67845 + done_lh(&right_lh);
67846 +
67847 + if (left_child)
67848 + zput(left_child);
67849 + if (left_zloaded_here)
67850 + zrelse(locked_left_neighbor);
67851 + done_lh(&left_lh);
67852 + return result;
67853 +}
67854 +
67855 +/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
67856 + are to be cut completely */
67857 +/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
67858 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
67859 + const reiser4_key * to_key, /* last key to be removed */
67860 + reiser4_key *
67861 + smallest_removed /* smallest key actually removed */ )
67862 +{
67863 + int result;
67864 + carry_pool *pool;
67865 + carry_level *lowest_level;
67866 + carry_cut_data *cut_data;
67867 + carry_op *op;
67868 +
67869 + assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
67870 +
67871 + pool =
67872 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67873 + sizeof(*cut_data));
67874 + if (IS_ERR(pool))
67875 + return PTR_ERR(pool);
67876 + lowest_level = (carry_level *) (pool + 1);
67877 + init_carry_level(lowest_level, pool);
67878 +
67879 + op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67880 + assert("vs-1509", op != 0);
67881 + if (IS_ERR(op)) {
67882 + done_carry_pool(pool);
67883 + return PTR_ERR(op);
67884 + }
67885 +
67886 + cut_data = (carry_cut_data *) (lowest_level + 3);
67887 + cut_data->params.from = from;
67888 + cut_data->params.to = to;
67889 + cut_data->params.from_key = from_key;
67890 + cut_data->params.to_key = to_key;
67891 + cut_data->params.smallest_removed = smallest_removed;
67892 +
67893 + op->u.cut_or_kill.is_cut = 1;
67894 + op->u.cut_or_kill.u.cut = cut_data;
67895 +
67896 + result = reiser4_carry(lowest_level, NULL);
67897 + done_carry_pool(pool);
67898 +
67899 + return result;
67900 +}
67901 +
67902 +/* kill part of the node (unlike cut_node_content(), item kill hooks are invoked)
67903 +
67904 + Cut part or whole content of node.
67905 +
67906 + cut data between @from and @to of @from->node and call carry() to make
67907 + corresponding changes in the tree. @from->node may become empty. If so -
67908 + pointer to it will be removed. Neighboring nodes are not changed. Smallest
67909 + removed key is stored in @smallest_removed
67910 +
67911 +*/
67912 +int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
67913 + coord_t * to, /* coord of the last unit/item that will be eliminated */
67914 + const reiser4_key * from_key, /* first key to be removed */
67915 + const reiser4_key * to_key, /* last key to be removed */
67916 + reiser4_key * smallest_removed, /* smallest key actually removed */
67917 + znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
67918 + * locked (in squalloc_right_twig_cut, namely) */
67919 + struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
67920 + invalidate pages together with item pointing to them */
67921 + int truncate)
67922 +{ /* this call is made for file truncate */
67923 + int result;
67924 + carry_pool *pool;
67925 + carry_level *lowest_level;
67926 + carry_kill_data *kdata;
67927 + lock_handle *left_child;
67928 + lock_handle *right_child;
67929 + carry_op *op;
67930 +
67931 + assert("umka-328", from != NULL);
67932 + assert("vs-316", !node_is_empty(from->node));
67933 + assert("nikita-1812", coord_is_existing_unit(from)
67934 + && coord_is_existing_unit(to));
67935 +
67936 + /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
67937 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67938 + sizeof(carry_kill_data) +
67939 + 2 * sizeof(lock_handle) +
67940 + 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
67941 + if (IS_ERR(pool))
67942 + return PTR_ERR(pool);
67943 +
67944 + lowest_level = (carry_level *) (pool + 1);
67945 + init_carry_level(lowest_level, pool);
67946 +
67947 + kdata = (carry_kill_data *) (lowest_level + 3);
67948 + left_child = (lock_handle *) (kdata + 1);
67949 + right_child = left_child + 1;
67950 +
67951 + init_lh(left_child);
67952 + init_lh(right_child);
67953 +
67954 + kdata->params.from = from;
67955 + kdata->params.to = to;
67956 + kdata->params.from_key = from_key;
67957 + kdata->params.to_key = to_key;
67958 + kdata->params.smallest_removed = smallest_removed;
67959 + kdata->params.truncate = truncate;
67960 + kdata->flags = 0;
67961 + kdata->inode = inode;
67962 + kdata->left = left_child;
67963 + kdata->right = right_child;
67964 + /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
67965 + kdata->buf = (char *)(right_child + 1);
67966 +
67967 + if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
67968 + /* left child of extent item may have to get updated right
67969 + delimiting key and to get linked with right child of extent
67970 + @from if it will be removed completely */
67971 + result = prepare_twig_kill(kdata, locked_left_neighbor);
67972 + if (result) {
67973 + done_children(kdata);
67974 + done_carry_pool(pool);
67975 + return result;
67976 + }
67977 + }
67978 +
67979 + op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67980 + if (IS_ERR(op) || (op == NULL)) {
67981 + done_children(kdata);
67982 + done_carry_pool(pool);
67983 + return RETERR(op ? PTR_ERR(op) : -EIO);
67984 + }
67985 +
67986 + op->u.cut_or_kill.is_cut = 0;
67987 + op->u.cut_or_kill.u.kill = kdata;
67988 +
67989 + result = reiser4_carry(lowest_level, NULL);
67990 +
67991 + done_children(kdata);
67992 + done_carry_pool(pool);
67993 + return result;
67994 +}
67995 +
67996 +void
67997 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
67998 +{
67999 + if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
68000 + pgoff_t start_pg, end_pg;
68001 +
68002 + start_pg = start >> PAGE_CACHE_SHIFT;
68003 + end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
68004 +
68005 + if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
68006 + /*
68007 + * kill up to the page boundary.
68008 + */
68009 + assert("vs-123456", start_pg == end_pg);
68010 + reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
68011 + truncate);
68012 + } else if (start_pg != end_pg) {
68013 + /*
68014 + * page boundary is within killed portion of node.
68015 + */
68016 + assert("vs-654321", end_pg - start_pg == 1);
68017 + reiser4_invalidate_pages(inode->i_mapping, end_pg,
68018 + end_pg - start_pg, 1);
68019 + }
68020 + }
68021 + inode_sub_bytes(inode, end - start);
68022 +}
68023 +
68024 +/**
68025 + * Delete whole @node from the reiser4 tree without loading it.
68026 + *
68027 + * @left: locked left neighbor,
68028 + * @node: node to be deleted,
68029 + * @smallest_removed: leftmost key of deleted node,
68030 + * @object: inode pointer, if we truncate a file body.
68031 + * @truncate: true if called for file truncate.
68032 + *
68033 + * @return: 0 on success, error code otherwise.
68034 + *
68035 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
68036 + * contains the right value of the smallest removed key from the previous
68037 + * cut_worker() iteration. This is needed for proper accounting of
68038 + * "i_blocks" and "i_bytes" fields of the @object.
68039 + */
68040 +int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
68041 + struct inode *object, int truncate)
68042 +{
68043 + lock_handle parent_lock;
68044 + coord_t cut_from;
68045 + coord_t cut_to;
68046 + reiser4_tree *tree;
68047 + int ret;
68048 +
68049 + assert("zam-937", node != NULL);
68050 + assert("zam-933", znode_is_write_locked(node));
68051 + assert("zam-999", smallest_removed != NULL);
68052 +
68053 + init_lh(&parent_lock);
68054 +
68055 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
68056 + if (ret)
68057 + return ret;
68058 +
68059 + assert("zam-934", !znode_above_root(parent_lock.node));
68060 +
68061 + ret = zload(parent_lock.node);
68062 + if (ret)
68063 + goto failed_nozrelse;
68064 +
68065 + ret = find_child_ptr(parent_lock.node, node, &cut_from);
68066 + if (ret)
68067 + goto failed;
68068 +
68069 + /* decrement child counter and set parent pointer to NULL before
68070 + deleting the internal item from the parent node because of checks in
68071 + internal_kill_item_hook (we can delete the last item from the parent
68072 + node, the parent node is going to be deleted and its c_count should
68073 + be zero). */
68074 +
68075 + tree = znode_get_tree(node);
68076 + write_lock_tree(tree);
68077 + init_parent_coord(&node->in_parent, NULL);
68078 + --parent_lock.node->c_count;
68079 + write_unlock_tree(tree);
68080 +
68081 + assert("zam-989", item_is_internal(&cut_from));
68082 +
68083 + /* @node should be deleted after unlocking. */
68084 + ZF_SET(node, JNODE_HEARD_BANSHEE);
68085 +
68086 + /* remove a pointer from the parent node to the node being deleted. */
68087 + coord_dup(&cut_to, &cut_from);
68088 + /* FIXME: shouldn't this be kill_node_content */
68089 + ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
68090 + if (ret)
68091 + /* FIXME(Zam): Should we re-connect the node to its parent if
68092 + * cut_node fails? */
68093 + goto failed;
68094 +
68095 + {
68096 + reiser4_tree *tree = current_tree;
68097 + __u64 start_offset = 0, end_offset = 0;
68098 +
68099 + read_lock_tree(tree);
68100 + write_lock_dk(tree);
68101 + if (object) {
68102 + /* We use @smallest_removed and the left delimiting key of
68103 + * the current node for @object->i_blocks, i_bytes
68104 + * calculation. We assume that the items after the
68105 + * *@smallest_removed key have been deleted from the
68106 + * file body. */
68107 + start_offset = get_key_offset(znode_get_ld_key(node));
68108 + end_offset = get_key_offset(smallest_removed);
68109 + }
68110 +
68111 + assert("zam-1021", znode_is_connected(node));
68112 + if (node->left)
68113 + znode_set_rd_key(node->left, znode_get_rd_key(node));
68114 +
68115 + *smallest_removed = *znode_get_ld_key(node);
68116 +
68117 + write_unlock_dk(tree);
68118 + read_unlock_tree(tree);
68119 +
68120 + if (object) {
68121 + /* we used to perform actions which are to be performed on items on their removal from the
68122 + tree in a special item method: kill_hook. Here, for optimization reasons, we avoid reading
68123 + the node containing the item we remove and so cannot call the item's kill hook. Instead we
68124 + call a function which does exactly what the tail kill hook does, on the assumption that the
68125 + node we avoid reading contains only one item and that item is a tail. */
68126 + fake_kill_hook_tail(object, start_offset, end_offset,
68127 + truncate);
68128 + }
68129 + }
68130 + failed:
68131 + zrelse(parent_lock.node);
68132 + failed_nozrelse:
68133 + done_lh(&parent_lock);
68134 +
68135 + return ret;
68136 +}
68137 +
68138 +static int can_delete(const reiser4_key *key, znode *node)
68139 +{
68140 + int result;
68141 +
68142 + read_lock_dk(current_tree);
68143 + result = keyle(key, znode_get_ld_key(node));
68144 + read_unlock_dk(current_tree);
68145 + return result;
68146 +}
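+/*
+ * In other words: @key is the left edge of the range being cut; if it is
+ * not greater than the left delimiting key of @node, then every key in
+ * @node falls inside the cut range and the node can be removed wholesale
+ * by reiser4_delete_node(). Illustrative numbers: from_key == 100 with
+ * ld_key(node) == 150 allows whole-node deletion, ld_key(node) == 50
+ * does not.
+ */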
68147 +
68148 +/**
68149 + * This subroutine is not optimal, but the implementation seems to
68150 + * be easier.
68151 + *
68152 + * @tap: the point deletion process begins from,
68153 + * @from_key: the beginning of the deleted key range,
68154 + * @to_key: the end of the deleted key range,
68155 + * @smallest_removed: the smallest removed key,
68156 + * @truncate: true if called for file truncate.
68157 + * @progress: set to true if progress in deleting file items was made;
68158 + * the @smallest_removed value is valid in that case.
68159 + *
68160 + * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
68161 + * reiser4_cut_tree operation was interrupted to allow an atom commit.
68162 + */
68163 +int
68164 +cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
68165 + const reiser4_key * to_key,
68166 + reiser4_key * smallest_removed, struct inode *object,
68167 + int truncate, int *progress)
68168 +{
68169 + lock_handle next_node_lock;
68170 + coord_t left_coord;
68171 + int result;
68172 +
68173 + assert("zam-931", tap->coord->node != NULL);
68174 + assert("zam-932", znode_is_write_locked(tap->coord->node));
68175 +
68176 + *progress = 0;
68177 + init_lh(&next_node_lock);
68178 +
68179 + while (1) {
68180 + znode *node; /* node from which items are cut */
68181 + node_plugin *nplug; /* node plugin for @node */
68182 +
68183 + node = tap->coord->node;
68184 +
68185 + /* Move next_node_lock to the next node on the left. */
68186 + result =
68187 + reiser4_get_left_neighbor(&next_node_lock, node,
68188 + ZNODE_WRITE_LOCK,
68189 + GN_CAN_USE_UPPER_LEVELS);
68190 + if (result != 0 && result != -E_NO_NEIGHBOR)
68191 + break;
68192 + /* Check whether we can delete the node as a whole. */
68193 + if (*progress && znode_get_level(node) == LEAF_LEVEL &&
68194 + can_delete(from_key, node)) {
68195 + result = reiser4_delete_node(node, smallest_removed,
68196 + object, truncate);
68197 + } else {
68198 + result = reiser4_tap_load(tap);
68199 + if (result)
68200 + return result;
68201 +
68202 + /* Prepare the second (right) point for cut_node() */
68203 + if (*progress)
68204 + coord_init_last_unit(tap->coord, node);
68205 +
68206 + else if (item_plugin_by_coord(tap->coord)->b.lookup ==
68207 + NULL)
68208 + /* set rightmost unit for the items without lookup method */
68209 + tap->coord->unit_pos =
68210 + coord_last_unit_pos(tap->coord);
68211 +
68212 + nplug = node->nplug;
68213 +
68214 + assert("vs-686", nplug);
68215 + assert("vs-687", nplug->lookup);
68216 +
68217 + /* left_coord is leftmost unit cut from @node */
68218 + result = nplug->lookup(node, from_key,
68219 + FIND_MAX_NOT_MORE_THAN,
68220 + &left_coord);
68221 +
68222 + if (IS_CBKERR(result))
68223 + break;
68224 +
68225 + /* adjust coordinates so that they are set to existing units */
68226 + if (coord_set_to_right(&left_coord)
68227 + || coord_set_to_left(tap->coord)) {
68228 + result = 0;
68229 + break;
68230 + }
68231 +
68232 + if (coord_compare(&left_coord, tap->coord) ==
68233 + COORD_CMP_ON_RIGHT) {
68234 + /* keys from @from_key to @to_key are not in the tree */
68235 + result = 0;
68236 + break;
68237 + }
68238 +
68239 + if (left_coord.item_pos != tap->coord->item_pos) {
68240 + /* do not allow cutting more than one item. This was added to solve the problem of
68241 + truncating partially converted files. If a file is partially converted, a twig node may
68242 + exist that contains both an internal item (or items) pointing to leaf nodes with
68243 + formatting items, and an extent item. We do not want to kill internal items sitting in a
68244 + twig node here, because cut_tree_worker assumes killing them from the leaf level */
68245 + coord_dup(&left_coord, tap->coord);
68246 + assert("vs-1652",
68247 + coord_is_existing_unit(&left_coord));
68248 + left_coord.unit_pos = 0;
68249 + }
68250 +
68251 + /* cut data from one node */
68252 + /* *smallest_removed = *reiser4_min_key(); */
68253 + result =
68254 + kill_node_content(&left_coord, tap->coord, from_key,
68255 + to_key, smallest_removed,
68256 + next_node_lock.node, object,
68257 + truncate);
68258 + reiser4_tap_relse(tap);
68259 + }
68260 + if (result)
68261 + break;
68262 +
68263 + ++(*progress);
68264 +
68265 + /* Check whether all items with keys >= from_key were removed
68266 + * from the tree. */
68267 + if (keyle(smallest_removed, from_key))
68268 + /* result = 0; */
68269 + break;
68270 +
68271 + if (next_node_lock.node == NULL)
68272 + break;
68273 +
68274 + result = reiser4_tap_move(tap, &next_node_lock);
68275 + done_lh(&next_node_lock);
68276 + if (result)
68277 + break;
68278 +
68279 + /* Break long reiser4_cut_tree operation (deletion of a large
68280 + file) if atom requires commit. */
68281 + if (*progress > CUT_TREE_MIN_ITERATIONS
68282 + && current_atom_should_commit()) {
68283 + result = -E_REPEAT;
68284 + break;
68285 + }
68286 + }
68287 + done_lh(&next_node_lock);
68288 + /* assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key())); */
68289 + return result;
68290 +}
68291 +
68292 +/* there is a fundamental problem with optimizing deletes: VFS does it
68293 + one file at a time. Another problem is that if an item can be
68294 + anything, then deleting items must be done one at a time. It just
68295 + seems cleaner to write this so as to specify a from and a to key, and
68296 + cut everything between them, though. */
68297 +
68298 +/* use this function with care if deleting more than what is part of a single file. */
68299 +/* do not use this when cutting a single item; it is suboptimal for that */
68300 +
68301 +/* You are encouraged to write plugin-specific versions of this. It
68302 + cannot be optimal for all plugins because it works an item at a time,
68303 + and some plugins could sometimes work a node at a time. Regular files,
68304 + however, cannot be optimized to work a node at a time, because
68305 + extents need to free the blocks they point to.
68306 +
68307 + Optimizations compared to v3 code:
68308 +
68309 + It does not balance (that task is left to memory pressure code).
68310 +
68311 + Nodes are deleted only if empty.
68312 +
68313 + Uses extents.
68314 +
68315 + Performs read-ahead of formatted nodes whose contents are part of
68316 + the deletion.
68317 +*/
68318 +
68319 +/**
68320 + * Delete everything from the reiser4 tree between two keys: @from_key and
68321 + * @to_key.
68322 + *
68323 + * @from_key: the beginning of the deleted key range,
68324 + * @to_key: the end of the deleted key range,
68325 + * @smallest_removed: the smallest removed key,
68326 + * @object: owner of cutting items.
68327 + * @truncate: true if called for file truncate.
68328 + * @progress: set to true if progress in deleting file items was made;
68329 + * the @smallest_removed value is valid in that case.
68330 + *
68331 + * @return: 0 on success, error code otherwise; -E_REPEAT means that a long cut_tree
68332 + * operation was interrupted to allow an atom commit.
68333 + */
68334 +
68335 +int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
68336 + const reiser4_key * to_key,
68337 + reiser4_key * smallest_removed_p,
68338 + struct inode *object, int truncate, int *progress)
68339 +{
68340 + lock_handle lock;
68341 + int result;
68342 + tap_t tap;
68343 + coord_t right_coord;
68344 + reiser4_key smallest_removed;
68345 + int (*cut_tree_worker) (tap_t *, const reiser4_key *,
68346 + const reiser4_key *, reiser4_key *,
68347 + struct inode *, int, int *);
68348 + STORE_COUNTERS;
68349 +
68350 + assert("umka-329", tree != NULL);
68351 + assert("umka-330", from_key != NULL);
68352 + assert("umka-331", to_key != NULL);
68353 + assert("zam-936", keyle(from_key, to_key));
68354 +
68355 + if (smallest_removed_p == NULL)
68356 + smallest_removed_p = &smallest_removed;
68357 +
68358 + init_lh(&lock);
68359 +
68360 + do {
68361 + /* Find rightmost item to cut away from the tree. */
68362 + result = reiser4_object_lookup(object, to_key, &right_coord,
68363 + &lock, ZNODE_WRITE_LOCK,
68364 + FIND_MAX_NOT_MORE_THAN,
68365 + TWIG_LEVEL, LEAF_LEVEL,
68366 + CBK_UNIQUE, NULL /*ra_info */);
68367 + if (result != CBK_COORD_FOUND)
68368 + break;
68369 + if (object == NULL
68370 + || inode_file_plugin(object)->cut_tree_worker == NULL)
68371 + cut_tree_worker = cut_tree_worker_common;
68372 + else
68373 + cut_tree_worker =
68374 + inode_file_plugin(object)->cut_tree_worker;
68375 + reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
68376 + result =
68377 + cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
68378 + object, truncate, progress);
68379 + reiser4_tap_done(&tap);
68380 +
68381 + reiser4_preempt_point();
68382 +
68383 + } while (0);
68384 +
68385 + done_lh(&lock);
68386 +
68387 + if (result) {
68388 + switch (result) {
68389 + case -E_NO_NEIGHBOR:
68390 + result = 0;
68391 + break;
68392 + case -E_DEADLOCK:
68393 + result = -E_REPEAT;
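+			/* fall through */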
68394 + case -E_REPEAT:
68395 + case -ENOMEM:
68396 + case -ENOENT:
68397 + break;
68398 + default:
68399 + warning("nikita-2861", "failure: %i", result);
68400 + }
68401 + }
68402 +
68403 + CHECK_COUNTERS;
68404 + return result;
68405 +}
68406 +
68407 +/* repeat reiser4_cut_tree_object() until everything is deleted.
68408 + * unlike cut_file_items(), it does not end the current transaction if
68409 + * -E_REPEAT is returned by reiser4_cut_tree_object(). */
68410 +int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68411 + const reiser4_key * to, struct inode *inode, int truncate)
68412 +{
68413 + int result;
68414 + int progress;
68415 +
68416 + do {
68417 + result = reiser4_cut_tree_object(tree, from, to, NULL,
68418 + inode, truncate, &progress);
68419 + } while (result == -E_REPEAT);
68420 +
68421 + return result;
68422 +}
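+/*
+ * Minimal usage sketch (hypothetical caller; constructing the @from and
+ * @to keys is plugin-specific and elided here; the last argument is the
+ * truncate flag):
+ *
+ *	reiser4_key from, to;
+ *
+ *	... fill [from, to] with the key range being deleted ...
+ *	result = reiser4_cut_tree(current_tree, &from, &to, inode, 1);
+ */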
68423 +
68424 +/* finishing reiser4 initialization */
68425 +int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
68426 + * initialized */ ,
68427 + const reiser4_block_nr * root_block /* address of a root block
68428 + * on a disk */ ,
68429 + tree_level height /* height of a tree */ ,
68430 + node_plugin * nplug /* default node plugin */ )
68431 +{
68432 + int result;
68433 +
68434 + assert("nikita-306", tree != NULL);
68435 + assert("nikita-307", root_block != NULL);
68436 + assert("nikita-308", height > 0);
68437 + assert("nikita-309", nplug != NULL);
68438 + assert("zam-587", tree->super != NULL);
68439 +
68440 + tree->root_block = *root_block;
68441 + tree->height = height;
68442 + tree->estimate_one_insert = calc_estimate_one_insert(height);
68443 + tree->nplug = nplug;
68444 +
68445 + tree->znode_epoch = 1ull;
68446 +
68447 + cbk_cache_init(&tree->cbk_cache);
68448 +
68449 + result = znodes_tree_init(tree);
68450 + if (result == 0)
68451 + result = jnodes_tree_init(tree);
68452 + if (result == 0) {
68453 + tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
68454 + reiser4_ctx_gfp_mask_get());
68455 + if (IS_ERR(tree->uber)) {
68456 + result = PTR_ERR(tree->uber);
68457 + tree->uber = NULL;
68458 + }
68459 + }
68460 + return result;
68461 +}
68462 +
68463 +/* release resources associated with @tree */
68464 +void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
68465 +{
68466 + if (tree == NULL)
68467 + return;
68468 +
68469 + if (tree->uber != NULL) {
68470 + zput(tree->uber);
68471 + tree->uber = NULL;
68472 + }
68473 + znodes_tree_done(tree);
68474 + jnodes_tree_done(tree);
68475 + cbk_cache_done(&tree->cbk_cache);
68476 +}
68477 +
68478 +/* Make Linus happy.
68479 + Local variables:
68480 + c-indentation-style: "K&R"
68481 + mode-name: "LC"
68482 + c-basic-offset: 8
68483 + tab-width: 8
68484 + fill-column: 120
68485 + scroll-step: 1
68486 + End:
68487 +*/
68488 diff -urN linux-2.6.33.orig/fs/reiser4/tree.h linux-2.6.33/fs/reiser4/tree.h
68489 --- linux-2.6.33.orig/fs/reiser4/tree.h 1970-01-01 01:00:00.000000000 +0100
68490 +++ linux-2.6.33/fs/reiser4/tree.h 2010-03-04 19:33:22.000000000 +0100
68491 @@ -0,0 +1,577 @@
68492 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68493 + * reiser4/README */
68494 +
68495 +/* Tree operations. See fs/reiser4/tree.c for comments */
68496 +
68497 +#if !defined( __REISER4_TREE_H__ )
68498 +#define __REISER4_TREE_H__
68499 +
68500 +#include "forward.h"
68501 +#include "debug.h"
68502 +#include "dformat.h"
68503 +#include "plugin/node/node.h"
68504 +#include "plugin/plugin.h"
68505 +#include "znode.h"
68506 +#include "tap.h"
68507 +
68508 +#include <linux/types.h> /* for __u?? */
68509 +#include <linux/fs.h> /* for struct super_block */
68510 +#include <linux/spinlock.h>
68511 +#include <linux/sched.h> /* for struct task_struct */
68512 +
68513 +/* fictive block number never actually used */
68514 +extern const reiser4_block_nr UBER_TREE_ADDR;
68515 +
68516 +/* &cbk_cache_slot - entry in a coord cache.
68517 +
68518 + This is an entry in a coord_by_key (cbk) cache, represented by
68519 + &cbk_cache.
68520 +
68521 +*/
68522 +typedef struct cbk_cache_slot {
68523 + /* cached node */
68524 + znode *node;
68525 + /* linkage to the next cbk cache slot in a LRU order */
68526 + struct list_head lru;
68527 +} cbk_cache_slot;
68528 +
68529 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
68530 +
68531 + cbk_cache is supposed to speed up tree lookups by caching results of recent
68532 + successful lookups (we don't cache negative results as the dentry cache
68533 + does). The cache consists of a relatively small number of entries kept in
68534 + LRU order. Each entry (&cbk_cache_slot) contains a pointer to a znode, from
68535 + which we can obtain the range of keys covered by this znode. Before
68536 + embarking on a real tree traversal we scan the cbk_cache slot by slot and
68537 + for each slot check whether the key we are looking for is between the
68538 + minimal and maximal keys for the node pointed to by this slot. If no match
68539 + is found, a real tree traversal is performed, and if the result is
68540 + successful, an appropriate entry is inserted into the cache, possibly
68541 + pushing the least recently used entry out of it.
68542 +
68543 + The tree spin lock is used to protect the coord cache. If contention for
68544 + this lock proves to be too high, finer-grained locking can be added.
68545 +
68546 + Invariants involving parts of this data-type:
68547 +
68548 + [cbk-cache-invariant]
68549 +*/
68550 +typedef struct cbk_cache {
68551 + /* serializator */
68552 + rwlock_t guard;
68553 + int nr_slots;
68554 + /* head of LRU list of cache slots */
68555 + struct list_head lru;
68556 + /* actual array of slots */
68557 + cbk_cache_slot *slot;
68558 +} cbk_cache;
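+/*
+ * A sketch of how a lookup could consult this cache (simplified: the
+ * real scan also revalidates the node and takes the delimiting-key lock,
+ * both omitted here):
+ *
+ *	cbk_cache_slot *slot;
+ *
+ *	read_lock(&cache->guard);
+ *	list_for_each_entry(slot, &cache->lru, lru) {
+ *		znode *node = slot->node;
+ *
+ *		if (node != NULL &&
+ *		    keyle(znode_get_ld_key(node), key) &&
+ *		    keylt(key, znode_get_rd_key(node))) {
+ *			... candidate found: lock and verify @node ...
+ *			break;
+ *		}
+ *	}
+ *	read_unlock(&cache->guard);
+ */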
68559 +
68560 +/* level_lookup_result - possible outcome of looking up key at some level.
68561 + This is used by coord_by_key when traversing tree downward. */
68562 +typedef enum {
68563 + /* continue to the next level */
68564 + LOOKUP_CONT,
68565 + /* done. Either required item was found, or we can prove it
68566 + doesn't exist, or some error occurred. */
68567 + LOOKUP_DONE,
68568 + /* restart traversal from the root. Infamous "repetition". */
68569 + LOOKUP_REST
68570 +} level_lookup_result;
68571 +
68572 +/* This is the representation of the internal reiser4 tree where all
68573 + file-system data and meta-data are stored. This structure is passed to
68574 + all tree manipulation functions. It's different from the super block
68575 + because we don't want to limit ourselves to a strictly one-to-one
68576 + mapping between super blocks and trees, and because they are logically
68577 + different: there are things in a super block that have no relation to
68578 + the tree (bitmaps, journalling area, mount options, etc.) and there
68579 + are things in a tree that bear no relation to the super block, like
68580 + the tree of znodes.
68581 +
68582 + At this time, there is only one tree
68583 + per filesystem, and this struct is part of the super block. We only
68584 + call the super block the super block for historical reasons (most
68585 + other filesystems call the per-filesystem metadata the super block).
68586 +*/
68587 +
68588 +struct reiser4_tree {
68589 + /* block_nr == 0 is the fake znode. Write-lock it while changing
68590 + the tree height. */
68591 + /* disk address of root node of a tree */
68592 + reiser4_block_nr root_block;
68593 +
68594 + /* level of the root node. If this is 1, tree consists of root
68595 + node only */
68596 + tree_level height;
68597 +
68598 + /*
68599 + * this is cached here to avoid calling plugins through function
68600 + * dereferences all the time.
68601 + */
68602 + __u64 estimate_one_insert;
68603 +
68604 + /* cache of recent tree lookup results */
68605 + cbk_cache cbk_cache;
68606 +
68607 + /* hash table to look up znodes by block number. */
68608 + z_hash_table zhash_table;
68609 + z_hash_table zfake_table;
68610 + /* hash table to look up jnodes by inode and offset. */
68611 + j_hash_table jhash_table;
68612 +
68613 + /* lock protecting:
68614 + - parent pointers,
68615 + - sibling pointers,
68616 + - znode hash table
68617 + - coord cache
68618 + */
68619 + /* NOTE: The "giant" tree lock could be replaced by several spin locks,
68620 + in the hope that they will be less contended. We could use one spin
68621 + lock per znode hash bucket. At the cost of some code complexity,
68622 + sibling pointers could be protected by the spin locks of both znodes.
68623 + However, even if this looks more SMP-scalable, we should test the
68624 + locking change on n-way (n > 4) SMP machines; current 4-way tests do
68625 + not show that the tree lock is contended or a bottleneck (2003.07.25). */
68626 +
68627 + rwlock_t tree_lock;
68628 +
68629 + /* lock protecting delimiting keys */
68630 + rwlock_t dk_lock;
68631 +
68632 + /* spin lock protecting znode_epoch */
68633 + spinlock_t epoch_lock;
68634 + /* version stamp used to mark znode updates. See seal.[ch] for more
68635 + * information. */
68636 + __u64 znode_epoch;
68637 +
68638 + znode *uber;
68639 + node_plugin *nplug;
68640 + struct super_block *super;
68641 + struct {
68642 + /* carry flags used for insertion of new nodes */
68643 + __u32 new_node_flags;
68644 + /* carry flags used for insertion of new extents */
68645 + __u32 new_extent_flags;
68646 + /* carry flags used for paste operations */
68647 + __u32 paste_flags;
68648 + /* carry flags used for insert operations */
68649 + __u32 insert_flags;
68650 + } carry;
68651 +};
68652 +
68653 +extern int reiser4_init_tree(reiser4_tree * tree,
68654 + const reiser4_block_nr * root_block,
68655 + tree_level height, node_plugin * default_plugin);
68656 +extern void reiser4_done_tree(reiser4_tree * tree);
68657 +
68658 +/* cbk flags: options for coord_by_key() */
68659 +typedef enum {
68660 + /* coord_by_key() is called for insertion. This is necessary because
68661 + of extents being located at the twig level. For explanation, see
68662 + comment just above is_next_item_internal().
68663 + */
68664 + CBK_FOR_INSERT = (1 << 0),
68665 + /* coord_by_key() is called with key that is known to be unique */
68666 + CBK_UNIQUE = (1 << 1),
68667 + /* coord_by_key() can trust delimiting keys. This option is not
68668 + user-accessible; coord_by_key() will set it automatically. It will
68669 + only be cleared by a special case in the extents-on-the-twig-level
68670 + handling, where it is necessary to insert an item with a key smaller
68671 + than the leftmost key in a node. This is necessary because of extents being
68672 + located at the twig level. For explanation, see comment just above
68673 + is_next_item_internal().
68674 + */
68675 + CBK_TRUST_DK = (1 << 2),
68676 + CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
68677 + CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
68678 + CBK_DKSET = (1 << 5),
68679 + CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
68680 + CBK_IN_CACHE = (1 << 7), /* node is already in cache */
68681 + CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock instead of a long-term
68682 + * lock */
68683 +} cbk_flags;
68684 +
68685 +/* insertion outcome. IBK = insert by key */
68686 +typedef enum {
68687 + IBK_INSERT_OK = 0,
68688 + IBK_ALREADY_EXISTS = -EEXIST,
68689 + IBK_IO_ERROR = -EIO,
68690 + IBK_NO_SPACE = -E_NODE_FULL,
68691 + IBK_OOM = -ENOMEM
68692 +} insert_result;
68693 +
68694 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
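+/* e.g. IS_CBKERR(-EIO) is true while IS_CBKERR(CBK_COORD_NOTFOUND) is
+   false; callers such as cut_tree_worker_common() use it to tell real
+   lookup errors from a clean miss. */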
68695 +
68696 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
68697 + lock_handle * lh, void *arg);
68698 +extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
68699 + lock_handle * lh,
68700 + tree_iterate_actor_t actor, void *arg,
68701 + znode_lock_mode mode, int through_units_p);
68702 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
68703 + znode_lock_request pri, lock_handle * lh);
68704 +
68705 +/* return node plugin of @node */
68706 +static inline node_plugin *node_plugin_by_node(const znode *
68707 + node /* node to query */ )
68708 +{
68709 + assert("vs-213", node != NULL);
68710 + assert("vs-214", znode_is_loaded(node));
68711 +
68712 + return node->nplug;
68713 +}
68714 +
68715 +/* number of items in @node */
68716 +static inline pos_in_node_t node_num_items(const znode * node)
68717 +{
68718 + assert("nikita-2754", znode_is_loaded(node));
68719 + assert("nikita-2468",
68720 + node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
68721 +
68722 + return node->nr_items;
68723 +}
68724 +
68725 +/* Return the number of items at the present node. Asserts coord->node !=
68726 + NULL. */
68727 +static inline unsigned coord_num_items(const coord_t * coord)
68728 +{
68729 + assert("jmacd-9805", coord->node != NULL);
68730 +
68731 + return node_num_items(coord->node);
68732 +}
68733 +
68734 +/* true if @node is empty */
68735 +static inline int node_is_empty(const znode * node)
68736 +{
68737 + return node_num_items(node) == 0;
68738 +}
68739 +
68740 +typedef enum {
68741 + SHIFTED_SOMETHING = 0,
68742 + SHIFT_NO_SPACE = -E_NODE_FULL,
68743 + SHIFT_IO_ERROR = -EIO,
68744 + SHIFT_OOM = -ENOMEM,
68745 +} shift_result;
68746 +
68747 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
68748 +extern int is_coord_in_node(const coord_t * coord);
68749 +extern int key_in_node(const reiser4_key *, const coord_t *);
68750 +extern void coord_item_move_to(coord_t * coord, int items);
68751 +extern void coord_unit_move_to(coord_t * coord, int units);
68752 +
68753 +/* there are two types of repetitive accesses (ra): intra-syscall
68754 + (local) and inter-syscall (global). Local ra is used when,
68755 + during a single syscall, we add/delete several items and units in the
68756 + same place in a tree. Note that plan-A fragments local ra by
68757 + separating stat-data and file body in key-space. Global ra is
68758 + used when the user makes repetitive modifications in the same place in
68759 + a tree.
68760 +
68761 + Our ra implementation serves the following purposes:
68762 + 1 it affects balancing decisions so that the next operation in a row
68763 + can be performed faster;
68764 + 2 it affects lower-level read-ahead in the page cache;
68765 + 3 it allows us to avoid unnecessary lookups by maintaining some state
68766 + across several operations (this is only for local ra);
68767 + 4 it leaves room for lazy micro-balancing: when we start a sequence of
68768 + operations they are performed without actually doing any intra-node
68769 + shifts, until we finish the sequence or its scope leaves the
68770 + current node; only then do we really pack the node (local ra only).
68771 +*/
68772 +
68773 +/* another thing that can be useful is to keep a per-tree and/or
68774 + per-process cache of recent lookups. This cache can be organised as a
68775 + list of block numbers of formatted nodes sorted by the starting key of
68776 + each node. Balancing should invalidate the appropriate parts of this
68777 + cache.
68778 +*/
68779 +
68780 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
68781 + coord_t * coord, lock_handle * handle,
68782 + znode_lock_mode lock, lookup_bias bias,
68783 + tree_level lock_level, tree_level stop_level,
68784 + __u32 flags, ra_info_t *);
68785 +
68786 +lookup_result reiser4_object_lookup(struct inode *object,
68787 + const reiser4_key * key,
68788 + coord_t * coord,
68789 + lock_handle * lh,
68790 + znode_lock_mode lock_mode,
68791 + lookup_bias bias,
68792 + tree_level lock_level,
68793 + tree_level stop_level,
68794 + __u32 flags, ra_info_t * info);
68795 +
68796 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
68797 + reiser4_item_data * data, coord_t * coord,
68798 + lock_handle * lh,
68799 + tree_level stop_level, __u32 flags);
68800 +insert_result insert_by_coord(coord_t * coord,
68801 + reiser4_item_data * data, const reiser4_key * key,
68802 + lock_handle * lh, __u32);
68803 +insert_result insert_extent_by_coord(coord_t * coord,
68804 + reiser4_item_data * data,
68805 + const reiser4_key * key, lock_handle * lh);
68806 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
68807 + const reiser4_key * to_key,
68808 + reiser4_key * smallest_removed);
68809 +int kill_node_content(coord_t * from, coord_t * to,
68810 + const reiser4_key * from_key, const reiser4_key * to_key,
68811 + reiser4_key * smallest_removed,
68812 + znode * locked_left_neighbor, struct inode *inode,
68813 + int truncate);
68814 +
68815 +int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
68816 + reiser4_key * key, lock_handle * lh, cop_insert_flag);
68817 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
68818 + reiser4_item_data * data, unsigned);
68819 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
68820 +int find_new_child_ptr(znode * parent, znode * child, znode * left,
68821 + coord_t * result);
68822 +
68823 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
68824 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
68825 +
68826 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
68827 +
68828 +extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
68829 + const reiser4_key *, reiser4_key *,
68830 + struct inode *, int, int *);
68831 +extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
68832 + const reiser4_key *, reiser4_key *,
68833 + struct inode *, int, int *);
68834 +extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68835 + const reiser4_key * to, struct inode *, int);
68836 +
68837 +extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
68838 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
68839 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
68840 + znode * left, coord_t * result);
68841 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
68842 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
68843 + znode * child);
68844 +extern znode *child_znode(const coord_t * in_parent, znode * parent,
68845 + int incore_p, int setup_dkeys_p);
68846 +
68847 +extern int cbk_cache_init(cbk_cache * cache);
68848 +extern void cbk_cache_done(cbk_cache * cache);
68849 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
68850 +
68851 +extern char *sprint_address(const reiser4_block_nr * block);
68852 +
68853 +#if REISER4_DEBUG
68854 +extern void print_coord_content(const char *prefix, coord_t * p);
68855 +extern void reiser4_print_address(const char *prefix,
68856 + const reiser4_block_nr * block);
68857 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
68858 + __u32 flags);
68859 +extern void check_dkeys(znode *node);
68860 +#else
68861 +#define print_coord_content(p, c) noop
68862 +#define reiser4_print_address(p, b) noop
68863 +#endif
68864 +
68865 +extern void forget_znode(lock_handle * handle);
68866 +extern int deallocate_znode(znode * node);
68867 +
68868 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
68869 +
68870 +/* struct used internally to pack the numerous arguments of a tree lookup;
68871 + used to avoid passing a lot of arguments to helper functions. */
68872 +typedef struct cbk_handle {
68873 + /* tree we are in */
68874 + reiser4_tree *tree;
68875 + /* key we are going after */
68876 + const reiser4_key *key;
68877 + /* coord we will store result in */
68878 + coord_t *coord;
68879 + /* type of lock to take on target node */
68880 + znode_lock_mode lock_mode;
68881 + /* lookup bias. See comments at the declaration of lookup_bias */
68882 + lookup_bias bias;
68883 + /* lock level: level starting from which tree traversal starts taking
68884 + * write locks. */
68885 + tree_level lock_level;
68886 + /* level where search will stop. Either item will be found between
68887 + lock_level and stop_level, or CBK_COORD_NOTFOUND will be
68888 + returned.
68889 + */
68890 + tree_level stop_level;
68891 + /* level we are currently at */
68892 + tree_level level;
68893 + /* block number of @active node. Tree traversal operates on two
68894 + nodes: active and parent. */
68895 + reiser4_block_nr block;
68896 + /* put here error message to be printed by caller */
68897 + const char *error;
68898 + /* result passed back to caller */
68899 + lookup_result result;
68900 + /* lock handles for active and parent */
68901 + lock_handle *parent_lh;
68902 + lock_handle *active_lh;
68903 + reiser4_key ld_key;
68904 + reiser4_key rd_key;
68905 + /* flags, passed to the cbk routine. Bits of this bitmask are defined
68906 + in tree.h:cbk_flags enum. */
68907 + __u32 flags;
68908 + ra_info_t *ra_info;
68909 + struct inode *object;
68910 +} cbk_handle;
68911 +
68912 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
68913 +
68914 +/* eottl.c */
68915 +extern int handle_eottl(cbk_handle *h, int *outcome);
68916 +
68917 +int lookup_multikey(cbk_handle * handle, int nr_keys);
68918 +int lookup_couple(reiser4_tree * tree,
68919 + const reiser4_key * key1, const reiser4_key * key2,
68920 + coord_t * coord1, coord_t * coord2,
68921 + lock_handle * lh1, lock_handle * lh2,
68922 + znode_lock_mode lock_mode, lookup_bias bias,
68923 + tree_level lock_level, tree_level stop_level, __u32 flags,
68924 + int *result1, int *result2);
68925 +
68926 +static inline void read_lock_tree(reiser4_tree *tree)
68927 +{
68928 + /* check that tree is not locked */
68929 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68930 + LOCK_CNT_NIL(read_locked_tree) &&
68931 + LOCK_CNT_NIL(write_locked_tree)));
68932 + /* check that spinlocks of lower priorities are not held */
68933 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68934 + LOCK_CNT_NIL(rw_locked_dk) &&
68935 + LOCK_CNT_NIL(spin_locked_stack)));
68936 +
68937 + read_lock(&(tree->tree_lock));
68938 +
68939 + LOCK_CNT_INC(read_locked_tree);
68940 + LOCK_CNT_INC(rw_locked_tree);
68941 + LOCK_CNT_INC(spin_locked);
68942 +}
68943 +
68944 +static inline void read_unlock_tree(reiser4_tree *tree)
68945 +{
68946 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
68947 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68948 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68949 +
68950 + LOCK_CNT_DEC(read_locked_tree);
68951 + LOCK_CNT_DEC(rw_locked_tree);
68952 + LOCK_CNT_DEC(spin_locked);
68953 +
68954 + read_unlock(&(tree->tree_lock));
68955 +}
68956 +
68957 +static inline void write_lock_tree(reiser4_tree *tree)
68958 +{
68959 + /* check that tree is not locked */
68960 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68961 + LOCK_CNT_NIL(read_locked_tree) &&
68962 + LOCK_CNT_NIL(write_locked_tree)));
68963 + /* check that spinlocks of lower priorities are not held */
68964 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68965 + LOCK_CNT_NIL(rw_locked_dk) &&
68966 + LOCK_CNT_NIL(spin_locked_stack)));
68967 +
68968 + write_lock(&(tree->tree_lock));
68969 +
68970 + LOCK_CNT_INC(write_locked_tree);
68971 + LOCK_CNT_INC(rw_locked_tree);
68972 + LOCK_CNT_INC(spin_locked);
68973 +}
68974 +
68975 +static inline void write_unlock_tree(reiser4_tree *tree)
68976 +{
68977 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
68978 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68979 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68980 +
68981 + LOCK_CNT_DEC(write_locked_tree);
68982 + LOCK_CNT_DEC(rw_locked_tree);
68983 + LOCK_CNT_DEC(spin_locked);
68984 +
68985 + write_unlock(&(tree->tree_lock));
68986 +}
68987 +
68988 +static inline void read_lock_dk(reiser4_tree *tree)
68989 +{
68990 + /* check that dk is not locked */
68991 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68992 + LOCK_CNT_NIL(read_locked_dk) &&
68993 + LOCK_CNT_NIL(write_locked_dk)));
68994 + /* check that spinlocks of lower priorities are not held */
68995 + assert("", LOCK_CNT_NIL(spin_locked_stack));
68996 +
68997 + read_lock(&((tree)->dk_lock));
68998 +
68999 + LOCK_CNT_INC(read_locked_dk);
69000 + LOCK_CNT_INC(rw_locked_dk);
69001 + LOCK_CNT_INC(spin_locked);
69002 +}
69003 +
69004 +static inline void read_unlock_dk(reiser4_tree *tree)
69005 +{
69006 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
69007 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
69008 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
69009 +
69010 + LOCK_CNT_DEC(read_locked_dk);
69011 + LOCK_CNT_DEC(rw_locked_dk);
69012 + LOCK_CNT_DEC(spin_locked);
69013 +
69014 + read_unlock(&(tree->dk_lock));
69015 +}
69016 +
69017 +static inline void write_lock_dk(reiser4_tree *tree)
69018 +{
69019 + /* check that dk is not locked */
69020 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
69021 + LOCK_CNT_NIL(read_locked_dk) &&
69022 + LOCK_CNT_NIL(write_locked_dk)));
69023 + /* check that spinlocks of lower priorities are not held */
69024 + assert("", LOCK_CNT_NIL(spin_locked_stack));
69025 +
69026 + write_lock(&((tree)->dk_lock));
69027 +
69028 + LOCK_CNT_INC(write_locked_dk);
69029 + LOCK_CNT_INC(rw_locked_dk);
69030 + LOCK_CNT_INC(spin_locked);
69031 +}
69032 +
69033 +static inline void write_unlock_dk(reiser4_tree *tree)
69034 +{
69035 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
69036 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
69037 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
69038 +
69039 + LOCK_CNT_DEC(write_locked_dk);
69040 + LOCK_CNT_DEC(rw_locked_dk);
69041 + LOCK_CNT_DEC(spin_locked);
69042 +
69043 + write_unlock(&(tree->dk_lock));
69044 +}
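+/*
+ * The locking order these helpers assume (and the LOCK_CNT asserts above
+ * enforce) is: tree lock first, then dk lock, then lower-priority spin
+ * locks. A typical critical section, as in reiser4_delete_node(), is:
+ *
+ *	read_lock_tree(tree);
+ *	write_lock_dk(tree);
+ *	... update delimiting keys ...
+ *	write_unlock_dk(tree);
+ *	read_unlock_tree(tree);
+ */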
69045 +
69046 +/* estimate api. Implementation is in estimate.c */
69047 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
69048 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
69049 +reiser4_block_nr estimate_insert_flow(tree_level);
69050 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
69051 +reiser4_block_nr calc_estimate_one_insert(tree_level);
69052 +reiser4_block_nr estimate_dirty_cluster(struct inode *);
69053 +reiser4_block_nr estimate_insert_cluster(struct inode *);
69054 +reiser4_block_nr estimate_update_cluster(struct inode *);
69055 +
69056 +/* __REISER4_TREE_H__ */
69057 +#endif
69058 +
69059 +/* Make Linus happy.
69060 + Local variables:
69061 + c-indentation-style: "K&R"
69062 + mode-name: "LC"
69063 + c-basic-offset: 8
69064 + tab-width: 8
69065 + fill-column: 120
69066 + scroll-step: 1
69067 + End:
69068 +*/
69069 diff -urN linux-2.6.33.orig/fs/reiser4/tree_mod.c linux-2.6.33/fs/reiser4/tree_mod.c
69070 --- linux-2.6.33.orig/fs/reiser4/tree_mod.c 1970-01-01 01:00:00.000000000 +0100
69071 +++ linux-2.6.33/fs/reiser4/tree_mod.c 2010-03-04 19:33:22.000000000 +0100
69072 @@ -0,0 +1,386 @@
69073 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69074 + * reiser4/README */
69075 +
69076 +/*
69077 + * Functions to add/delete new nodes to/from the tree.
69078 + *
69079 + * Functions from this file are used by carry (see carry*) to handle:
69080 + *
69081 + * . insertion of new formatted node into tree
69082 + *
69083 + * . addition of new tree root, increasing tree height
69084 + *
69085 + * . removing tree root, decreasing tree height
69086 + *
69087 + */
69088 +
69089 +#include "forward.h"
69090 +#include "debug.h"
69091 +#include "dformat.h"
69092 +#include "key.h"
69093 +#include "coord.h"
69094 +#include "plugin/plugin.h"
69095 +#include "jnode.h"
69096 +#include "znode.h"
69097 +#include "tree_mod.h"
69098 +#include "block_alloc.h"
69099 +#include "tree_walk.h"
69100 +#include "tree.h"
69101 +#include "super.h"
69102 +
69103 +#include <linux/err.h>
69104 +
69105 +static int add_child_ptr(znode * parent, znode * child);
69106 +/* warning only issued if error is not -E_REPEAT */
69107 +#define ewarning( error, ... ) \
69108 + if( ( error ) != -E_REPEAT ) \
69109 + warning( __VA_ARGS__ )
69110 +
69111 +/* allocate a new node at @level, immediately to the right of @brother. */
69112 +znode * reiser4_new_node(znode * brother /* existing left neighbor
69113 + * of new node */,
69114 + tree_level level /* tree level at which new node is to
69115 + * be allocated */)
69116 +{
69117 + znode *result;
69118 + int retcode;
69119 + reiser4_block_nr blocknr;
69120 +
69121 + assert("nikita-930", brother != NULL);
69122 + assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
69123 +
69124 + retcode = assign_fake_blocknr_formatted(&blocknr);
69125 + if (retcode == 0) {
69126 + result =
69127 + zget(znode_get_tree(brother), &blocknr, NULL, level,
69128 + reiser4_ctx_gfp_mask_get());
69129 + if (IS_ERR(result)) {
69130 + ewarning(PTR_ERR(result), "nikita-929",
69131 + "Cannot allocate znode for carry: %li",
69132 + PTR_ERR(result));
69133 + return result;
69134 + }
69135 + /* cheap test, can be executed even when debugging is off */
69136 + if (!znode_just_created(result)) {
69137 + warning("nikita-2213",
69138 + "Allocated already existing block: %llu",
69139 + (unsigned long long)blocknr);
69140 + zput(result);
69141 + return ERR_PTR(RETERR(-EIO));
69142 + }
69143 +
69144 + assert("nikita-931", result != NULL);
69145 + result->nplug = znode_get_tree(brother)->nplug;
69146 + assert("nikita-933", result->nplug != NULL);
69147 +
69148 + retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
69149 + if (retcode == 0) {
69150 + ZF_SET(result, JNODE_CREATED);
69151 + zrelse(result);
69152 + } else {
69153 + zput(result);
69154 + result = ERR_PTR(retcode);
69155 + }
69156 + } else {
69157 + /* failure to allocate a new node during balancing.
69158 + This should never happen. Ever. Returning -E_REPEAT
69159 + is not a viable solution, because "out of disk space"
69160 + is not a transient error that will go away by itself.
69161 + */
69162 + ewarning(retcode, "nikita-928",
69163 + "Cannot allocate block for carry: %i", retcode);
69164 + result = ERR_PTR(retcode);
69165 + }
69166 + assert("nikita-1071", result != NULL);
69167 + return result;
69168 +}
69169 +
69170 +/* allocate new root and add it to the tree
69171 +
69172 + This helper function is called by add_new_root().
69173 +
69174 +*/
69175 +znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
69176 + znode * fake /* "fake" znode */ )
69177 +{
69178 + reiser4_tree *tree = znode_get_tree(old_root);
69179 + znode *new_root = NULL; /* to shut gcc up */
69180 + int result;
69181 +
69182 + assert("nikita-1069", old_root != NULL);
69183 + assert("umka-262", fake != NULL);
69184 + assert("umka-263", tree != NULL);
69185 +
69186 + /* "fake" znode---one always hanging just above current root. This
69187 + node is locked when new root is created or existing root is
69188 + deleted. Downward tree traversal takes lock on it before taking
69189 + lock on a root node. This avoids race conditions with root
69190 + manipulations.
69191 +
69192 + */
69193 + assert("nikita-1348", znode_above_root(fake));
69194 + assert("nikita-1211", znode_is_root(old_root));
69195 +
69196 + result = 0;
69197 + if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
69198 + warning("nikita-1344", "Tree is too tall: %i", tree->height);
69199 + /* ext2 returns -ENOSPC when it runs out of free inodes with a
69200 + following comment (fs/ext2/ialloc.c:441): Is it really
69201 + ENOSPC?
69202 +
69203 + -EXFULL? -EINVAL?
69204 + */
69205 + result = RETERR(-ENOSPC);
69206 + } else {
69207 + /* Allocate a block for the new root. It's not that
69208 + important where it will be allocated, as the root is
69209 + almost always in memory. Moreover, allocate-on-
69210 + flush may be going on here.
69211 + */
69212 + assert("nikita-1448", znode_is_root(old_root));
69213 + new_root = reiser4_new_node(fake, tree->height + 1);
69214 + if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
69215 + lock_handle rlh;
69216 +
69217 + init_lh(&rlh);
69218 + result =
69219 + longterm_lock_znode(&rlh, new_root,
69220 + ZNODE_WRITE_LOCK,
69221 + ZNODE_LOCK_LOPRI);
69222 + if (result == 0) {
69223 + parent_coord_t *in_parent;
69224 +
69225 + znode_make_dirty(fake);
69226 +
69227 + /* new root is a child of "fake" node */
69228 + write_lock_tree(tree);
69229 +
69230 + ++tree->height;
69231 +
69232 + /* recalculate max balance overhead */
69233 + tree->estimate_one_insert =
69234 + estimate_one_insert_item(tree);
69235 +
69236 + tree->root_block = *znode_get_block(new_root);
69237 + in_parent = &new_root->in_parent;
69238 + init_parent_coord(in_parent, fake);
69239 + /* manually insert the new root into the sibling
69240 + * list. With this, all nodes involved in
69241 + * balancing are connected after balancing is
69242 + * done---a useful invariant to check. */
69243 + sibling_list_insert_nolock(new_root, NULL);
69244 + write_unlock_tree(tree);
69245 +
69246 + /* insert into new root pointer to the
69247 + @old_root. */
69248 + assert("nikita-1110",
69249 + WITH_DATA(new_root,
69250 + node_is_empty(new_root)));
69251 + write_lock_dk(tree);
69252 + znode_set_ld_key(new_root, reiser4_min_key());
69253 + znode_set_rd_key(new_root, reiser4_max_key());
69254 + write_unlock_dk(tree);
69255 + if (REISER4_DEBUG) {
69256 + ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
69257 + ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
69258 + ZF_SET(old_root, JNODE_ORPHAN);
69259 + }
69260 + result = add_child_ptr(new_root, old_root);
69261 + done_lh(&rlh);
69262 + }
69263 + zrelse(new_root);
69264 + }
69265 + }
69266 + if (result != 0)
69267 + new_root = ERR_PTR(result);
69268 + return new_root;
69269 +}
69270 +
69271 +/* build &reiser4_item_data for inserting child pointer
69272 +
69273 + Build &reiser4_item_data that can be later used to insert pointer to @child
69274 + in its parent.
69275 +
69276 +*/
69277 +void build_child_ptr_data(znode * child /* node pointer to which will be
69278 + * inserted */ ,
69279 + reiser4_item_data * data /* where to store result */ )
69280 +{
69281 + assert("nikita-1116", child != NULL);
69282 + assert("nikita-1117", data != NULL);
69283 +
69284 + /*
69285 + * NOTE: use address of child's blocknr as address of data to be
69286 + * inserted. As a result, the data gets into the on-disk structure in
69287 + * CPU byte order; internal's create_hook converts it to little-endian
69288 + * byte order.
69289 + */
69290 + data->data = (char *)znode_get_block(child);
69291 + /* data->data is in kernel space */
69292 + data->user = 0;
69293 + data->length = sizeof(reiser4_block_nr);
69294 + /* FIXME-VS: hardcoded internal item? */
69295 +
69296 + /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
69297 + data->iplug = item_plugin_by_id(NODE_POINTER_ID);
69298 +}
69299 +
69300 +/* add pointer to @child into empty @parent.
69301 +
69302 + This is used when pointer to old root is inserted into new root which is
69303 + empty.
69304 +*/
69305 +static int add_child_ptr(znode * parent, znode * child)
69306 +{
69307 + coord_t coord;
69308 + reiser4_item_data data;
69309 + int result;
69310 + reiser4_key key;
69311 +
69312 + assert("nikita-1111", parent != NULL);
69313 + assert("nikita-1112", child != NULL);
69314 + assert("nikita-1115",
69315 + znode_get_level(parent) == znode_get_level(child) + 1);
69316 +
69317 + result = zload(parent);
69318 + if (result != 0)
69319 + return result;
69320 + assert("nikita-1113", node_is_empty(parent));
69321 + coord_init_first_unit(&coord, parent);
69322 +
69323 + build_child_ptr_data(child, &data);
69324 + data.arg = NULL;
69325 +
69326 + read_lock_dk(znode_get_tree(parent));
69327 + key = *znode_get_ld_key(child);
69328 + read_unlock_dk(znode_get_tree(parent));
69329 +
69330 + result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
69331 + NULL);
69332 + znode_make_dirty(parent);
69333 + zrelse(parent);
69334 + return result;
69335 +}
69336 +
69337 +/* actually remove tree root */
69338 +static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
69339 + * being removed */,
69340 + znode * old_root /* root node that is being
69341 + * removed */ ,
69342 + znode * new_root /* new root---sole child of
69343 + * @old_root */,
69344 + const reiser4_block_nr * new_root_blk /* disk address of
69345 + * @new_root */)
69346 +{
69347 + znode *uber;
69348 + int result;
69349 + lock_handle handle_for_uber;
69350 +
69351 + assert("umka-265", tree != NULL);
69352 + assert("nikita-1198", new_root != NULL);
69353 + assert("nikita-1199",
69354 + znode_get_level(new_root) + 1 == znode_get_level(old_root));
69355 +
69356 + assert("nikita-1201", znode_is_write_locked(old_root));
69357 +
69358 + assert("nikita-1203",
69359 + disk_addr_eq(new_root_blk, znode_get_block(new_root)));
69360 +
69361 + init_lh(&handle_for_uber);
69362 + /* obtain and lock "fake" znode protecting changes in tree height. */
69363 + result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
69364 + &handle_for_uber);
69365 + if (result == 0) {
69366 + uber = handle_for_uber.node;
69367 +
69368 + znode_make_dirty(uber);
69369 +
69370 + /* don't take a long-term lock on @new_root; take the spin lock instead. */
69371 +
69372 + write_lock_tree(tree);
69373 +
69374 + tree->root_block = *new_root_blk;
69375 + --tree->height;
69376 +
69377 + /* recalculate max balance overhead */
69378 + tree->estimate_one_insert = estimate_one_insert_item(tree);
69379 +
69380 + assert("nikita-1202",
69381 + tree->height == znode_get_level(new_root));
69382 +
69383 + /* new root is a child of the "fake" node */
69384 + init_parent_coord(&new_root->in_parent, uber);
69385 + ++uber->c_count;
69386 +
69387 + /* sibling_list_insert_nolock(new_root, NULL); */
69388 + write_unlock_tree(tree);
69389 +
69390 + /* reinitialise old root. */
69391 + result = node_plugin_by_node(old_root)->init(old_root);
69392 + znode_make_dirty(old_root);
69393 + if (result == 0) {
69394 + assert("nikita-1279", node_is_empty(old_root));
69395 + ZF_SET(old_root, JNODE_HEARD_BANSHEE);
69396 + old_root->c_count = 0;
69397 + }
69398 + }
69399 + done_lh(&handle_for_uber);
69400 +
69401 + return result;
69402 +}
69403 +
69404 +/* remove tree root
69405 +
69406 + This function removes the tree root, decreasing the tree height by one. The
69407 + tree root and its only child (which is going to become the new tree root)
69408 + are write-locked on entry.
69409 +
69410 + To remove the tree root we need to take a lock on the special "fake" znode
69411 + that protects changes of the tree height. See comments in
69412 + reiser4_add_tree_root() for more on this.
69413 +
69414 + Also, parent pointers have to be updated in the
69415 + old and new root. To simplify the code, the function is split into two parts:
69416 + the outer reiser4_kill_tree_root() collects all necessary arguments and calls
69417 + reiser4_kill_root() to do the actual job.
69418 +
69419 +*/
69420 +int reiser4_kill_tree_root(znode * old_root /* tree root that we are
69421 + removing*/)
69422 +{
69423 + int result;
69424 + coord_t down_link;
69425 + znode *new_root;
69426 + reiser4_tree *tree;
69427 +
69428 + assert("umka-266", current_tree != NULL);
69429 + assert("nikita-1194", old_root != NULL);
69430 + assert("nikita-1196", znode_is_root(old_root));
69431 + assert("nikita-1200", node_num_items(old_root) == 1);
69432 + assert("nikita-1401", znode_is_write_locked(old_root));
69433 +
69434 + coord_init_first_unit(&down_link, old_root);
69435 +
69436 + tree = znode_get_tree(old_root);
69437 + new_root = child_znode(&down_link, old_root, 0, 1);
69438 + if (!IS_ERR(new_root)) {
69439 + result =
69440 + reiser4_kill_root(tree, old_root, new_root,
69441 + znode_get_block(new_root));
69442 + zput(new_root);
69443 + } else
69444 + result = PTR_ERR(new_root);
69445 +
69446 + return result;
69447 +}
69448 +
69449 +/* Make Linus happy.
69450 + Local variables:
69451 + c-indentation-style: "K&R"
69452 + mode-name: "LC"
69453 + c-basic-offset: 8
69454 + tab-width: 8
69455 + fill-column: 120
69456 + scroll-step: 1
69457 + End:
69458 +*/
69459 diff -urN linux-2.6.33.orig/fs/reiser4/tree_mod.h linux-2.6.33/fs/reiser4/tree_mod.h
69460 --- linux-2.6.33.orig/fs/reiser4/tree_mod.h 1970-01-01 01:00:00.000000000 +0100
69461 +++ linux-2.6.33/fs/reiser4/tree_mod.h 2010-03-04 19:33:22.000000000 +0100
69462 @@ -0,0 +1,29 @@
69463 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69464 + * reiser4/README */
69465 +
69466 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
69467 + * comments. */
69468 +
69469 +#if !defined( __REISER4_TREE_MOD_H__ )
69470 +#define __REISER4_TREE_MOD_H__
69471 +
69472 +#include "forward.h"
69473 +
69474 +znode *reiser4_new_node(znode * brother, tree_level level);
69475 +znode *reiser4_add_tree_root(znode * old_root, znode * fake);
69476 +int reiser4_kill_tree_root(znode * old_root);
69477 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
69478 +
69479 +/* __REISER4_TREE_MOD_H__ */
69480 +#endif
69481 +
69482 +/* Make Linus happy.
69483 + Local variables:
69484 + c-indentation-style: "K&R"
69485 + mode-name: "LC"
69486 + c-basic-offset: 8
69487 + tab-width: 8
69488 + fill-column: 120
69489 + scroll-step: 1
69490 + End:
69491 +*/
69492 diff -urN linux-2.6.33.orig/fs/reiser4/tree_walk.c linux-2.6.33/fs/reiser4/tree_walk.c
69493 --- linux-2.6.33.orig/fs/reiser4/tree_walk.c 1970-01-01 01:00:00.000000000 +0100
69494 +++ linux-2.6.33/fs/reiser4/tree_walk.c 2010-03-04 19:33:22.000000000 +0100
69495 @@ -0,0 +1,927 @@
69496 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69497 + * reiser4/README */
69498 +
69499 +/* Routines and macros to:
69500 +
69501 + get_left_neighbor()
69502 +
69503 + get_right_neighbor()
69504 +
69505 + get_parent()
69506 +
69507 + get_first_child()
69508 +
69509 + get_last_child()
69510 +
69511 + various routines to walk the whole tree and do things to it like
69512 + repack it, or move it to tertiary storage. Please make them as
69513 + generic as is reasonable.
69514 +
69515 +*/
69516 +
69517 +#include "forward.h"
69518 +#include "debug.h"
69519 +#include "dformat.h"
69520 +#include "coord.h"
69521 +#include "plugin/item/item.h"
69522 +#include "jnode.h"
69523 +#include "znode.h"
69524 +#include "tree_walk.h"
69525 +#include "tree.h"
69526 +#include "super.h"
69527 +
69528 +/* These macros are used internally in tree_walk.c in an attempt to make
69529 + the lock_neighbor() code usable for building lock_parent(),
69530 + lock_right_neighbor() and lock_left_neighbor() */
69531 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
69532 +#define FIELD_OFFSET(name) offsetof(znode, name)
69533 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
69534 +#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
69535 +#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
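+/* For example, GET_NODE_BY_PTR_OFFSET(node, LEFT_PTR_OFFSET) evaluates to
+   node->left, so the single lock_neighbor() below can serve the parent and
+   both siblings simply by being passed a different byte offset. */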
69536 +
69537 +/* This is the generic procedure to get and lock a `generic' neighbor (left
69538 + or right neighbor, or parent). It implements a common algorithm for all
69539 + cases of getting a lock on a neighbor node; only the znode structure field
69540 + differs in each case. It is parameterized by the ptr_offset argument, which
69541 + is the byte offset of the pointer to the desired neighbor within the
69542 + current node's znode structure. This function must be called with the tree lock held */
69543 +static int lock_neighbor(
69544 + /* resulting lock handle */
69545 + lock_handle * result,
69546 + /* znode to lock */
69547 + znode * node,
69548 + /* pointer to neighbor (or parent) znode field offset, in bytes from
69549 + the base address of znode structure */
69550 + int ptr_offset,
69551 + /* lock mode for longterm_lock_znode call */
69552 + znode_lock_mode mode,
69553 + /* lock request for longterm_lock_znode call */
69554 + znode_lock_request req,
69555 + /* GN_* flags */
69556 + int flags, int rlocked)
69557 +{
69558 + reiser4_tree *tree = znode_get_tree(node);
69559 + znode *neighbor;
69560 + int ret;
69561 +
69562 + assert("umka-236", node != NULL);
69563 + assert("umka-237", tree != NULL);
69564 + assert_rw_locked(&(tree->tree_lock));
69565 +
69566 + if (flags & GN_TRY_LOCK)
69567 + req |= ZNODE_LOCK_NONBLOCK;
69568 + if (flags & GN_SAME_ATOM)
69569 + req |= ZNODE_LOCK_DONT_FUSE;
69570 +
69571 + /* get the neighbor's address using the sibling link; quit the while
69572 + loop (and return) if the link is not available. */
69573 + while (1) {
69574 + neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
69575 +
69576 + /* return -E_NO_NEIGHBOR if the parent or side pointer is NULL, or
69577 + * if the node pointed to by it is not connected.
69578 + *
69579 + * However, the GN_ALLOW_NOT_CONNECTED option masks the "connected"
69580 + * check and allows passing a reference to a not-yet-connected znode
69581 + * to the subsequent longterm_lock_znode() call. This kills a
69582 + * possible busy loop if we are trying to get a long-term lock on a
69583 + * locked but not yet connected parent node. */
69584 + if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
69585 + || znode_is_connected(neighbor))) {
69586 + return RETERR(-E_NO_NEIGHBOR);
69587 + }
69588 +
69589 + /* protect it from deletion. */
69590 + zref(neighbor);
69591 +
69592 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69593 +
69594 + ret = longterm_lock_znode(result, neighbor, mode, req);
69595 +
69596 + /* The lock handle obtains its own reference, release the one from above. */
69597 + zput(neighbor);
69598 +
69599 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69600 +
69601 + /* restart if the node we got a reference to is being
69602 + invalidated; we should not get a reference to this node
69603 + again. */
69604 + if (ret == -EINVAL)
69605 + continue;
69606 + if (ret)
69607 + return ret;
69608 +
69609 + /* check if neighbor link still points to just locked znode;
69610 + the link could have been changed while the process slept. */
69611 + if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
69612 + return 0;
69613 +
69614 + /* the znode was locked by mistake; unlock it and restart the
69615 + locking process from the beginning. */
69616 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69617 + longterm_unlock_znode(result);
69618 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69619 + }
69620 +}
69621 +
69622 +/* get parent node with longterm lock, accepts GN* flags. */
69623 +int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
69624 + znode * node /* child node */ ,
69625 + znode_lock_mode mode
69626 + /* type of lock: read or write */ ,
69627 + int flags /* GN_* flags */ )
69628 +{
69629 + int result;
69630 +
69631 + read_lock_tree(znode_get_tree(node));
69632 + result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
69633 + ZNODE_LOCK_HIPRI, flags, 1);
69634 + read_unlock_tree(znode_get_tree(node));
69635 + return result;
69636 +}
69637 +
69638 +/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
69639 + bit in @flags parameter */
69640 +/* Audited by: umka (2002.06.14) */
69641 +static inline int
69642 +lock_side_neighbor(lock_handle * result,
69643 + znode * node, znode_lock_mode mode, int flags, int rlocked)
69644 +{
69645 + int ret;
69646 + int ptr_offset;
69647 + znode_lock_request req;
69648 +
69649 + if (flags & GN_GO_LEFT) {
69650 + ptr_offset = LEFT_PTR_OFFSET;
69651 + req = ZNODE_LOCK_LOPRI;
69652 + } else {
69653 + ptr_offset = RIGHT_PTR_OFFSET;
69654 + req = ZNODE_LOCK_HIPRI;
69655 + }
69656 +
69657 + ret =
69658 + lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
69659 +
69660 +	if (ret == -E_NO_NEIGHBOR)	/* when walking left or right,
69661 +					 * -E_NO_NEIGHBOR does not guarantee that
69662 +					 * the neighbor is absent from the tree; in
69663 +					 * this case we return -ENOENT, meaning the
69664 +					 * neighbor was at least not found in the cache */
69665 + return RETERR(-ENOENT);
69666 +
69667 + return ret;
69668 +}
69669 +
69670 +#if REISER4_DEBUG
69671 +
69672 +int check_sibling_list(znode * node)
69673 +{
69674 + znode *scan;
69675 + znode *next;
69676 +
69677 + assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
69678 +
69679 + if (node == NULL)
69680 + return 1;
69681 +
69682 + if (ZF_ISSET(node, JNODE_RIP))
69683 + return 1;
69684 +
69685 + assert("nikita-3270", node != NULL);
69686 + assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
69687 +
69688 + for (scan = node; znode_is_left_connected(scan); scan = next) {
69689 + next = scan->left;
69690 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69691 + assert("nikita-3271", znode_is_right_connected(next));
69692 + assert("nikita-3272", next->right == scan);
69693 + } else
69694 + break;
69695 + }
69696 + for (scan = node; znode_is_right_connected(scan); scan = next) {
69697 + next = scan->right;
69698 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69699 + assert("nikita-3273", znode_is_left_connected(next));
69700 + assert("nikita-3274", next->left == scan);
69701 + } else
69702 + break;
69703 + }
69704 + return 1;
69705 +}
69706 +
69707 +#endif
69708 +
69709 +/* Znode sibling pointer maintenance. */
69710 +
69711 +/* Znode sibling pointers are established between any neighboring nodes which
69712 +   are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
69713 +   JNODE_RIGHT_CONNECTED): if the left or right sibling pointer contains an
69714 +   actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
69715 +
69716 +   Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
69717 +   take care of searching for znode neighbors (a hash table lookup may be
69718 +   required), establishing sibling pointers between them, and setting the
69719 +   JNODE_*_CONNECTED state bits. */
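+
+/* To make the convention above concrete, here is an editor's sketch (added
+   for this document, not part of the original patch) of how a reader is
+   expected to consult the state bit before trusting a sibling pointer; the
+   helper name is invented and the caller is assumed to hold the tree lock: */
+#if 0
+static znode *right_sibling_or_null(znode * node)
+{
+	/* JNODE_RIGHT_CONNECTED set means node->right holds an actual value
+	   -- possibly NULL, which means "no right neighbor exists". */
+	if (!znode_is_right_connected(node))
+		return NULL;	/* sibling pointer not established yet */
+	return node->right;
+}
+#endif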
69720 +
69721 +/* adjusts sibling pointers and `connected' states for two
69722 +   neighbors; works even if one neighbor is NULL (was not found). */
69723 +
69724 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
69725 +void link_left_and_right(znode * left, znode * right)
69726 +{
69727 + assert("nikita-3275", check_sibling_list(left));
69728 + assert("nikita-3275", check_sibling_list(right));
69729 +
69730 + if (left != NULL) {
69731 + if (left->right == NULL) {
69732 + left->right = right;
69733 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
69734 +
69735 + ON_DEBUG(left->right_version =
69736 + atomic_inc_return(&delim_key_version);
69737 + );
69738 +
69739 + } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
69740 + && left->right != right) {
69741 +
69742 + ON_DEBUG(left->right->left_version =
69743 + atomic_inc_return(&delim_key_version);
69744 + left->right_version =
69745 + atomic_inc_return(&delim_key_version););
69746 +
69747 + left->right->left = NULL;
69748 + left->right = right;
69749 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
69750 + } else
69751 +			/*
69752 +			 * there is a race condition in renew_sibling_link(),
69753 +			 * and the assertion below checks that it is the only
69754 +			 * one there. Thread T1 calls renew_sibling_link()
69755 +			 * without the GN_NO_ALLOC flag. zlook() doesn't find
69756 +			 * the neighbor node, but before T1 gets to
69757 +			 * link_left_and_right(), another thread T2 creates the
69758 +			 * neighbor node and connects it. The check for
69759 +			 * left->right == NULL above protects T1 from
69760 +			 * overwriting the correct left->right pointer installed
69761 +			 * by T2.
69762 +			 */
69763 + assert("nikita-3302",
69764 + right == NULL || left->right == right);
69765 + }
69766 + if (right != NULL) {
69767 + if (right->left == NULL) {
69768 + right->left = left;
69769 + ZF_SET(right, JNODE_LEFT_CONNECTED);
69770 +
69771 + ON_DEBUG(right->left_version =
69772 + atomic_inc_return(&delim_key_version);
69773 + );
69774 +
69775 + } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
69776 + && right->left != left) {
69777 +
69778 + ON_DEBUG(right->left->right_version =
69779 + atomic_inc_return(&delim_key_version);
69780 + right->left_version =
69781 + atomic_inc_return(&delim_key_version););
69782 +
69783 + right->left->right = NULL;
69784 + right->left = left;
69785 + ZF_SET(right, JNODE_LEFT_CONNECTED);
69786 +
69787 + } else
69788 + assert("nikita-3303",
69789 + left == NULL || right->left == left);
69790 + }
69791 + assert("nikita-3275", check_sibling_list(left));
69792 + assert("nikita-3275", check_sibling_list(right));
69793 +}
69794 +
69795 +/* Audited by: umka (2002.06.14) */
69796 +static void link_znodes(znode * first, znode * second, int to_left)
69797 +{
69798 + if (to_left)
69799 + link_left_and_right(second, first);
69800 + else
69801 + link_left_and_right(first, second);
69802 +}
69803 +
69804 +/* gets the next coord unit position (to the left or to the right, depending
69805 +   on the GN_GO_LEFT bit in flags) in the horizontal direction, even across a
69806 +   node boundary. Should be called under the tree lock, which protects the
69807 +   nonexistence of the sibling link on the parent level if
69808 +   lock_side_neighbor() fails with -ENOENT. */
69809 +static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
69810 +{
69811 + int ret;
69812 + znode *node;
69813 + reiser4_tree *tree;
69814 +
69815 + assert("umka-243", coord != NULL);
69816 + assert("umka-244", handle != NULL);
69817 + assert("zam-1069", handle->node == NULL);
69818 +
69819 + ret =
69820 + (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
69821 + coord_next_unit(coord);
69822 + if (!ret)
69823 + return 0;
69824 +
69825 + ret =
69826 + lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
69827 + if (ret)
69828 + return ret;
69829 +
69830 + node = handle->node;
69831 + tree = znode_get_tree(node);
69832 + write_unlock_tree(tree);
69833 +
69834 + coord_init_zero(coord);
69835 +
69836 +	/* We avoid a synchronous read here if the GN_ASYNC flag is set. */
69837 + if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
69838 + ret = jstartio(ZJNODE(handle->node));
69839 + if (!ret)
69840 + ret = -E_REPEAT;
69841 + goto error_locked;
69842 + }
69843 +
69844 +	/* the corresponding zrelse() should be called by the clients of
69845 +	   far_next_coord(), at the place where this node gets unlocked. */
69846 + ret = zload(handle->node);
69847 + if (ret)
69848 + goto error_locked;
69849 +
69850 + if (flags & GN_GO_LEFT)
69851 + coord_init_last_unit(coord, node);
69852 + else
69853 + coord_init_first_unit(coord, node);
69854 +
69855 + if (0) {
69856 + error_locked:
69857 + longterm_unlock_znode(handle);
69858 + }
69859 + write_lock_tree(tree);
69860 + return ret;
69861 +}
69862 +
69863 +/* Very significant function which performs a step in the horizontal
69864 +   direction when a sibling pointer is not available. In fact, it is the only
69865 +   function which does so.
69866 +   Note: this function does not restore the locking status at exit; the
69867 +   caller must take care of proper unlocking and zrelsing */
69868 +static int
69869 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
69870 + tree_level level, int flags, int *nr_locked)
69871 +{
69872 + int ret;
69873 + int to_left = flags & GN_GO_LEFT;
69874 + reiser4_block_nr da;
69875 +	/* parent of the neighbor node; it is set to the child's parent until
69876 +	   we detect that the child and the neighbor do not share one parent */
69877 + znode *side_parent = coord->node;
69878 + reiser4_tree *tree = znode_get_tree(child);
69879 + znode *neighbor = NULL;
69880 +
69881 + assert("umka-245", coord != NULL);
69882 + assert("umka-246", handle != NULL);
69883 + assert("umka-247", child != NULL);
69884 + assert("umka-303", tree != NULL);
69885 +
69886 + init_lh(handle);
69887 + write_lock_tree(tree);
69888 + ret = far_next_coord(coord, handle, flags);
69889 +
69890 + if (ret) {
69891 + if (ret != -ENOENT) {
69892 + write_unlock_tree(tree);
69893 + return ret;
69894 + }
69895 + } else {
69896 + item_plugin *iplug;
69897 +
69898 + if (handle->node != NULL) {
69899 + (*nr_locked)++;
69900 + side_parent = handle->node;
69901 + }
69902 +
69903 +		/* does the coord object point to an internal item? We do not
69904 +		   support sibling pointers between znodes for formatted and
69905 +		   unformatted nodes, and return -E_NO_NEIGHBOR in that case. */
69906 + iplug = item_plugin_by_coord(coord);
69907 + if (!item_is_internal(coord)) {
69908 + link_znodes(child, NULL, to_left);
69909 + write_unlock_tree(tree);
69910 + /* we know there can't be formatted neighbor */
69911 + return RETERR(-E_NO_NEIGHBOR);
69912 + }
69913 + write_unlock_tree(tree);
69914 +
69915 + iplug->s.internal.down_link(coord, NULL, &da);
69916 +
69917 + if (flags & GN_NO_ALLOC) {
69918 + neighbor = zlook(tree, &da);
69919 + } else {
69920 + neighbor =
69921 + zget(tree, &da, side_parent, level,
69922 + reiser4_ctx_gfp_mask_get());
69923 + }
69924 +
69925 + if (IS_ERR(neighbor)) {
69926 + ret = PTR_ERR(neighbor);
69927 + return ret;
69928 + }
69929 +
69930 + if (neighbor)
69931 + /* update delimiting keys */
69932 + set_child_delimiting_keys(coord->node, coord, neighbor);
69933 +
69934 + write_lock_tree(tree);
69935 + }
69936 +
69937 + if (likely(neighbor == NULL ||
69938 + (znode_get_level(child) == znode_get_level(neighbor)
69939 + && child != neighbor)))
69940 + link_znodes(child, neighbor, to_left);
69941 + else {
69942 + warning("nikita-3532",
69943 + "Sibling nodes on the different levels: %i != %i\n",
69944 + znode_get_level(child), znode_get_level(neighbor));
69945 + ret = RETERR(-EIO);
69946 + }
69947 +
69948 + write_unlock_tree(tree);
69949 +
69950 + /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
69951 + if (neighbor != NULL && (flags & GN_NO_ALLOC))
69952 + /* atomic_dec(&ZJNODE(neighbor)->x_count); */
69953 + zput(neighbor);
69954 +
69955 + return ret;
69956 +}
69957 +
69958 +/* This function establishes a one-side relation. */
69959 +/* Audited by: umka (2002.06.14) */
69960 +static int connect_one_side(coord_t * coord, znode * node, int flags)
69961 +{
69962 + coord_t local;
69963 + lock_handle handle;
69964 + int nr_locked;
69965 + int ret;
69966 +
69967 + assert("umka-248", coord != NULL);
69968 + assert("umka-249", node != NULL);
69969 +
69970 + coord_dup_nocheck(&local, coord);
69971 +
69972 + init_lh(&handle);
69973 +
69974 + ret =
69975 + renew_sibling_link(&local, &handle, node, znode_get_level(node),
69976 + flags | GN_NO_ALLOC, &nr_locked);
69977 +
69978 + if (handle.node != NULL) {
69979 + /* complementary operations for zload() and lock() in far_next_coord() */
69980 + zrelse(handle.node);
69981 + longterm_unlock_znode(&handle);
69982 + }
69983 +
69984 +	/* we swallow error codes which are not interesting for us, because we
69985 +	   run renew_sibling_link() only for znode connection. */
69986 + if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
69987 + return 0;
69988 +
69989 + return ret;
69990 +}
69991 +
69992 +/* if @child is not in `connected' state, performs hash searches for left and
69993 + right neighbor nodes and establishes horizontal sibling links */
69994 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69995 +int connect_znode(coord_t * parent_coord, znode * child)
69996 +{
69997 + reiser4_tree *tree = znode_get_tree(child);
69998 + int ret = 0;
69999 +
70000 + assert("zam-330", parent_coord != NULL);
70001 + assert("zam-331", child != NULL);
70002 + assert("zam-332", parent_coord->node != NULL);
70003 + assert("umka-305", tree != NULL);
70004 +
70005 + /* it is trivial to `connect' root znode because it can't have
70006 + neighbors */
70007 + if (znode_above_root(parent_coord->node)) {
70008 + child->left = NULL;
70009 + child->right = NULL;
70010 + ZF_SET(child, JNODE_LEFT_CONNECTED);
70011 + ZF_SET(child, JNODE_RIGHT_CONNECTED);
70012 +
70013 + ON_DEBUG(child->left_version =
70014 + atomic_inc_return(&delim_key_version);
70015 + child->right_version =
70016 + atomic_inc_return(&delim_key_version););
70017 +
70018 + return 0;
70019 + }
70020 +
70021 + /* load parent node */
70022 + coord_clear_iplug(parent_coord);
70023 + ret = zload(parent_coord->node);
70024 +
70025 + if (ret != 0)
70026 + return ret;
70027 +
70028 + /* protect `connected' state check by tree_lock */
70029 + read_lock_tree(tree);
70030 +
70031 + if (!znode_is_right_connected(child)) {
70032 + read_unlock_tree(tree);
70033 + /* connect right (default is right) */
70034 + ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
70035 + if (ret)
70036 + goto zrelse_and_ret;
70037 +
70038 + read_lock_tree(tree);
70039 + }
70040 +
70041 + ret = znode_is_left_connected(child);
70042 +
70043 + read_unlock_tree(tree);
70044 +
70045 + if (!ret) {
70046 + ret =
70047 + connect_one_side(parent_coord, child,
70048 + GN_NO_ALLOC | GN_GO_LEFT);
70049 + } else
70050 + ret = 0;
70051 +
70052 + zrelse_and_ret:
70053 + zrelse(parent_coord->node);
70054 +
70055 + return ret;
70056 +}
70057 +
70058 +/* this function is like renew_sibling_link() but allocates the neighbor node
70059 +   if it doesn't exist, and `connects' it. It may require making two steps in
70060 +   the horizontal direction: the first one to find/allocate the neighbor node,
70061 +   the second one to find the neighbor of the neighbor, in order to connect
70062 +   the freshly allocated znode. */
70063 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
70064 +static int
70065 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
70066 +{
70067 + coord_t local;
70068 + lock_handle empty[2];
70069 + reiser4_tree *tree = znode_get_tree(node);
70070 + znode *neighbor = NULL;
70071 + int nr_locked = 0;
70072 + int ret;
70073 +
70074 + assert("umka-250", coord != NULL);
70075 + assert("umka-251", node != NULL);
70076 + assert("umka-307", tree != NULL);
70077 + assert("umka-308", level <= tree->height);
70078 +
70079 + /* umka (2002.06.14)
70080 +	   There should probably be a check here that the given "level" is valid.
70081 + Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
70082 + */
70083 +
70084 + coord_dup(&local, coord);
70085 +
70086 + ret =
70087 + renew_sibling_link(&local, &empty[0], node, level,
70088 + flags & ~GN_NO_ALLOC, &nr_locked);
70089 + if (ret)
70090 + goto out;
70091 +
70092 + /* tree lock is not needed here because we keep parent node(s) locked
70093 + and reference to neighbor znode incremented */
70094 + neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
70095 +
70096 + read_lock_tree(tree);
70097 + ret = znode_is_connected(neighbor);
70098 + read_unlock_tree(tree);
70099 + if (ret) {
70100 + ret = 0;
70101 + goto out;
70102 + }
70103 +
70104 + ret =
70105 + renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
70106 + flags | GN_NO_ALLOC, &nr_locked);
70107 + /* second renew_sibling_link() call is used for znode connection only,
70108 + so we can live with these errors */
70109 + if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
70110 + ret = 0;
70111 +
70112 + out:
70113 +
70114 + for (--nr_locked; nr_locked >= 0; --nr_locked) {
70115 + zrelse(empty[nr_locked].node);
70116 + longterm_unlock_znode(&empty[nr_locked]);
70117 + }
70118 +
70119 + if (neighbor != NULL)
70120 + /* decrement znode reference counter without actually
70121 + releasing it. */
70122 + atomic_dec(&ZJNODE(neighbor)->x_count);
70123 +
70124 + return ret;
70125 +}
70126 +
70127 +/*
70128 + reiser4_get_neighbor() -- lock node's neighbor.
70129 +
70130 +  reiser4_get_neighbor() locks the node's neighbor (left or right, depending
70131 +  on the given parameter) using the sibling link to it. If the sibling link
70132 +  is not available (i.e. the neighbor znode is not in cache) and the flags
70133 +  allow reading blocks, we go one level up for information about the
70134 +  neighbor's disk address. We lock the node's parent; if it is a common
70135 +  parent for both 'node' and its neighbor, the neighbor's disk address is in
70136 +  the next (to the left or right) down link from the link that points to the
70137 +  original node. If not, we need to lock the parent's neighbor, read its
70138 +  content and take the first (last) downlink with the neighbor's disk
70139 +  address. That locking could be done using the sibling link and the
70140 +  lock_neighbor() function, if the sibling link exists. Otherwise we go one
70141 +  level up again until we find a common parent or a valid sibling link. Then
70142 +  we go down, allocating/connecting/locking/reading nodes until the neighbor of the first one is locked.
70143 +
70144 + @neighbor: result lock handle,
70145 + @node: a node which we lock neighbor of,
70146 + @lock_mode: lock mode {LM_READ, LM_WRITE},
70147 + @flags: logical OR of {GN_*} (see description above) subset.
70148 +
70149 +  @return: 0 on success, negative value if the lock was impossible due to an
70150 +  error or lack of a neighbor node.
70151 +*/
70152 +
70153 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
70154 +int
70155 +reiser4_get_neighbor(lock_handle * neighbor, znode * node,
70156 + znode_lock_mode lock_mode, int flags)
70157 +{
70158 + reiser4_tree *tree = znode_get_tree(node);
70159 + lock_handle path[REAL_MAX_ZTREE_HEIGHT];
70160 +
70161 + coord_t coord;
70162 +
70163 + tree_level base_level;
70164 + tree_level h = 0;
70165 + int ret;
70166 +
70167 + assert("umka-252", tree != NULL);
70168 + assert("umka-253", neighbor != NULL);
70169 + assert("umka-254", node != NULL);
70170 +
70171 + base_level = znode_get_level(node);
70172 +
70173 + assert("umka-310", base_level <= tree->height);
70174 +
70175 + coord_init_zero(&coord);
70176 +
70177 + again:
70178 + /* first, we try to use simple lock_neighbor() which requires sibling
70179 + link existence */
70180 + read_lock_tree(tree);
70181 + ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
70182 + read_unlock_tree(tree);
70183 + if (!ret) {
70184 + /* load znode content if it was specified */
70185 + if (flags & GN_LOAD_NEIGHBOR) {
70186 + ret = zload(node);
70187 + if (ret)
70188 + longterm_unlock_znode(neighbor);
70189 + }
70190 + return ret;
70191 + }
70192 +
70193 + /* only -ENOENT means we may look upward and try to connect
70194 + @node with its neighbor (if @flags allow us to do it) */
70195 + if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
70196 + return ret;
70197 +
70198 +	/* before establishing the sibling link we lock the parent node;
70199 +	   renew_neighbor() requires this to work. */
70200 + init_lh(&path[0]);
70201 + ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
70202 + if (ret)
70203 + return ret;
70204 + if (znode_above_root(path[0].node)) {
70205 + longterm_unlock_znode(&path[0]);
70206 + return RETERR(-E_NO_NEIGHBOR);
70207 + }
70208 +
70209 + while (1) {
70210 + znode *child = (h == 0) ? node : path[h - 1].node;
70211 + znode *parent = path[h].node;
70212 +
70213 + ret = zload(parent);
70214 + if (ret)
70215 + break;
70216 +
70217 + ret = find_child_ptr(parent, child, &coord);
70218 +
70219 + if (ret) {
70220 + zrelse(parent);
70221 + break;
70222 + }
70223 +
70224 + /* try to establish missing sibling link */
70225 + ret = renew_neighbor(&coord, child, h + base_level, flags);
70226 +
70227 + zrelse(parent);
70228 +
70229 + switch (ret) {
70230 + case 0:
70231 + /* unlocking of parent znode prevents simple
70232 + deadlock situation */
70233 + done_lh(&path[h]);
70234 +
70235 +			/* depending on the tree level we are at, we repeat
70236 +			   the first locking attempt ... */
70237 + if (h == 0)
70238 + goto again;
70239 +
70240 + /* ... or repeat establishing of sibling link at
70241 + one level below. */
70242 + --h;
70243 + break;
70244 +
70245 + case -ENOENT:
70246 + /* sibling link is not available -- we go
70247 + upward. */
70248 + init_lh(&path[h + 1]);
70249 + ret =
70250 + reiser4_get_parent(&path[h + 1], parent,
70251 + ZNODE_READ_LOCK);
70252 + if (ret)
70253 + goto fail;
70254 + ++h;
70255 + if (znode_above_root(path[h].node)) {
70256 + ret = RETERR(-E_NO_NEIGHBOR);
70257 + goto fail;
70258 + }
70259 + break;
70260 +
70261 + case -E_DEADLOCK:
70262 +			/* there was a lock request from a hi-pri locker. If
70263 +			   possible, we unlock the last parent node and
70264 +			   re-lock it. */
70265 + for (; reiser4_check_deadlock(); h--) {
70266 + done_lh(&path[h]);
70267 + if (h == 0)
70268 + goto fail;
70269 + }
70270 +
70271 + break;
70272 +
70273 + default: /* other errors. */
70274 + goto fail;
70275 + }
70276 + }
70277 + fail:
70278 + ON_DEBUG(check_lock_node_data(node));
70279 + ON_DEBUG(check_lock_data());
70280 +
70281 + /* unlock path */
70282 + do {
70283 + /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
70284 + fail; path[0] is already done_lh-ed, therefore
70285 + longterm_unlock_znode(&path[h]); is not applicable */
70286 + done_lh(&path[h]);
70287 + --h;
70288 + } while (h + 1 != 0);
70289 +
70290 + return ret;
70291 +}
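+
+/* A minimal usage sketch (an editor's addition, not part of the original
+   patch): take a longterm read lock on the right neighbor of @node, allowing
+   the search to ascend to upper levels when no sibling link is cached. The
+   function name is invented for illustration and error handling is elided. */
+#if 0
+static int example_lock_right_neighbor(znode * node)
+{
+	lock_handle nbr;
+	int ret;
+
+	init_lh(&nbr);
+	ret = reiser4_get_neighbor(&nbr, node, ZNODE_READ_LOCK,
+				   GN_CAN_USE_UPPER_LEVELS);
+	if (ret == 0) {
+		/* ... use nbr.node under the longterm read lock ... */
+		done_lh(&nbr);
+	}
+	/* -ENOENT and -E_NO_NEIGHBOR mean "no such neighbor"; see tree_walk.h */
+	return ret;
+}
+#endif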
70292 +
70293 +/* remove node from sibling list */
70294 +/* Audited by: umka (2002.06.14) */
70295 +void sibling_list_remove(znode * node)
70296 +{
70297 + reiser4_tree *tree;
70298 +
70299 +	assert("umka-255", node != NULL);
70300 +	tree = znode_get_tree(node);
70301 + assert_rw_write_locked(&(tree->tree_lock));
70302 + assert("nikita-3275", check_sibling_list(node));
70303 +
70304 + write_lock_dk(tree);
70305 + if (znode_is_right_connected(node) && node->right != NULL &&
70306 + znode_is_left_connected(node) && node->left != NULL) {
70307 + assert("zam-32245",
70308 + keyeq(znode_get_rd_key(node),
70309 + znode_get_ld_key(node->right)));
70310 + znode_set_rd_key(node->left, znode_get_ld_key(node->right));
70311 + }
70312 + write_unlock_dk(tree);
70313 +
70314 + if (znode_is_right_connected(node) && node->right != NULL) {
70315 + assert("zam-322", znode_is_left_connected(node->right));
70316 + node->right->left = node->left;
70317 + ON_DEBUG(node->right->left_version =
70318 + atomic_inc_return(&delim_key_version);
70319 + );
70320 + }
70321 + if (znode_is_left_connected(node) && node->left != NULL) {
70322 + assert("zam-323", znode_is_right_connected(node->left));
70323 + node->left->right = node->right;
70324 + ON_DEBUG(node->left->right_version =
70325 + atomic_inc_return(&delim_key_version);
70326 + );
70327 + }
70328 +
70329 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
70330 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
70331 + ON_DEBUG(node->left = node->right = NULL;
70332 + node->left_version = atomic_inc_return(&delim_key_version);
70333 + node->right_version = atomic_inc_return(&delim_key_version););
70334 + assert("nikita-3276", check_sibling_list(node));
70335 +}
70336 +
70337 +/* disconnect node from sibling list */
70338 +void sibling_list_drop(znode * node)
70339 +{
70340 + znode *right;
70341 + znode *left;
70342 +
70343 + assert("nikita-2464", node != NULL);
70344 + assert("nikita-3277", check_sibling_list(node));
70345 +
70346 + right = node->right;
70347 + if (right != NULL) {
70348 + assert("nikita-2465", znode_is_left_connected(right));
70349 + right->left = NULL;
70350 + ON_DEBUG(right->left_version =
70351 + atomic_inc_return(&delim_key_version);
70352 + );
70353 + }
70354 + left = node->left;
70355 + if (left != NULL) {
70356 + assert("zam-323", znode_is_right_connected(left));
70357 + left->right = NULL;
70358 + ON_DEBUG(left->right_version =
70359 + atomic_inc_return(&delim_key_version);
70360 + );
70361 + }
70362 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
70363 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
70364 + ON_DEBUG(node->left = node->right = NULL;
70365 + node->left_version = atomic_inc_return(&delim_key_version);
70366 + node->right_version = atomic_inc_return(&delim_key_version););
70367 +}
70368 +
70369 +/* Insert a new node into the sibling list. Regular balancing inserts the new
70370 +   node after (at the right side of) an existing, locked node (@before); the
70371 +   one exception is adding a new tree root node, in which case @before must be NULL. */
70372 +void sibling_list_insert_nolock(znode * new, znode * before)
70373 +{
70374 + assert("zam-334", new != NULL);
70375 + assert("nikita-3298", !znode_is_left_connected(new));
70376 + assert("nikita-3299", !znode_is_right_connected(new));
70377 + assert("nikita-3300", new->left == NULL);
70378 + assert("nikita-3301", new->right == NULL);
70379 + assert("nikita-3278", check_sibling_list(new));
70380 + assert("nikita-3279", check_sibling_list(before));
70381 +
70382 + if (before != NULL) {
70383 + assert("zam-333", znode_is_connected(before));
70384 + new->right = before->right;
70385 + new->left = before;
70386 + ON_DEBUG(new->right_version =
70387 + atomic_inc_return(&delim_key_version);
70388 + new->left_version =
70389 + atomic_inc_return(&delim_key_version););
70390 + if (before->right != NULL) {
70391 + before->right->left = new;
70392 + ON_DEBUG(before->right->left_version =
70393 + atomic_inc_return(&delim_key_version);
70394 + );
70395 + }
70396 + before->right = new;
70397 + ON_DEBUG(before->right_version =
70398 + atomic_inc_return(&delim_key_version);
70399 + );
70400 + } else {
70401 + new->right = NULL;
70402 + new->left = NULL;
70403 + ON_DEBUG(new->right_version =
70404 + atomic_inc_return(&delim_key_version);
70405 + new->left_version =
70406 + atomic_inc_return(&delim_key_version););
70407 + }
70408 + ZF_SET(new, JNODE_LEFT_CONNECTED);
70409 + ZF_SET(new, JNODE_RIGHT_CONNECTED);
70410 + assert("nikita-3280", check_sibling_list(new));
70411 + assert("nikita-3281", check_sibling_list(before));
70412 +}
70413 +
70414 +/*
70415 + Local variables:
70416 + c-indentation-style: "K&R"
70417 + mode-name: "LC"
70418 + c-basic-offset: 8
70419 + tab-width: 8
70420 + fill-column: 80
70421 + End:
70422 +*/
70423 diff -urN linux-2.6.33.orig/fs/reiser4/tree_walk.h linux-2.6.33/fs/reiser4/tree_walk.h
70424 --- linux-2.6.33.orig/fs/reiser4/tree_walk.h 1970-01-01 01:00:00.000000000 +0100
70425 +++ linux-2.6.33/fs/reiser4/tree_walk.h 2010-03-04 19:33:22.000000000 +0100
70426 @@ -0,0 +1,125 @@
70427 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
70428 +
70429 +/* definitions of reiser4 tree walk functions */
70430 +
70431 +#ifndef __FS_REISER4_TREE_WALK_H__
70432 +#define __FS_REISER4_TREE_WALK_H__
70433 +
70434 +#include "debug.h"
70435 +#include "forward.h"
70436 +
70437 +/* establishes horizontal links between cached znodes */
70438 +int connect_znode(coord_t * coord, znode * node);
70439 +
70440 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
70441 + have the following common arguments:
70442 +
70443 + return codes:
70444 +
70445 + @return : 0 - OK,
70446 +
70447 +ZAM-FIXME-HANS: wrong return code name. Change them all.
70448 +    -ENOENT  - the neighbor is not in cache, which is detected by the
70449 +               absence of a sibling link.
70450 +
70451 + -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
70452 + found (because we are left-/right- most node of the
70453 + tree, for example). Also, this return code is for
70454 + reiser4_get_parent() when we see no parent link -- it
70455 + means that our node is root node.
70456 +
70457 +    -E_DEADLOCK - deadlock detected (a request from a high-priority process
70458 +                  was received); other error codes conform to
70459 +                  /usr/include/asm/errno.h .
70460 +*/
70461 +
70462 +int
70463 +reiser4_get_parent_flags(lock_handle * result, znode * node,
70464 + znode_lock_mode mode, int flags);
70465 +
70466 +/* bits definition for reiser4_get_neighbor function `flags' arg. */
70467 +typedef enum {
70468 +	/* If the sibling pointer is NULL, this flag allows get_neighbor() to
70469 +	 * try to find a not-allocated, not-connected neighbor by going
70470 +	 * through upper levels */
70471 + GN_CAN_USE_UPPER_LEVELS = 0x1,
70472 + /* locking left neighbor instead of right one */
70473 + GN_GO_LEFT = 0x2,
70474 + /* automatically load neighbor node content */
70475 + GN_LOAD_NEIGHBOR = 0x4,
70476 + /* return -E_REPEAT if can't lock */
70477 + GN_TRY_LOCK = 0x8,
70478 +	/* used internally in tree_walk.c; causes renew_sibling_link() to not
70479 +	   allocate a neighbor znode, but only search for it in the znode cache */
70480 + GN_NO_ALLOC = 0x10,
70481 + /* do not go across atom boundaries */
70482 + GN_SAME_ATOM = 0x20,
70483 +	/* allow locking of not-connected nodes */
70484 + GN_ALLOW_NOT_CONNECTED = 0x40,
70485 + /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
70486 + GN_ASYNC = 0x80
70487 +} znode_get_neigbor_flags;
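+
+/* These bits are OR-ed together by callers. For instance, an illustrative
+   combination such as (GN_GO_LEFT | GN_TRY_LOCK | GN_LOAD_NEIGHBOR) would ask
+   for a non-blocking longterm lock on the left neighbor with its content
+   loaded, failing with -E_REPEAT rather than sleeping on the lock. */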
70488 +
70489 +/* A commonly used wrapper for reiser4_get_parent_flags(). */
70490 +static inline int reiser4_get_parent(lock_handle * result, znode * node,
70491 + znode_lock_mode mode)
70492 +{
70493 + return reiser4_get_parent_flags(result, node, mode,
70494 + GN_ALLOW_NOT_CONNECTED);
70495 +}
70496 +
70497 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
70498 + znode_lock_mode lock_mode, int flags);
70499 +
70500 +/* these are wrappers for the most common usages of reiser4_get_neighbor() */
70501 +static inline int
70502 +reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
70503 + int flags)
70504 +{
70505 + return reiser4_get_neighbor(result, node, lock_mode,
70506 + flags | GN_GO_LEFT);
70507 +}
70508 +
70509 +static inline int
70510 +reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
70511 + int flags)
70512 +{
70513 + ON_DEBUG(check_lock_node_data(node));
70514 + ON_DEBUG(check_lock_data());
70515 + return reiser4_get_neighbor(result, node, lock_mode,
70516 + flags & (~GN_GO_LEFT));
70517 +}
70518 +
70519 +extern void sibling_list_remove(znode * node);
70520 +extern void sibling_list_drop(znode * node);
70521 +extern void sibling_list_insert_nolock(znode * new, znode * before);
70522 +extern void link_left_and_right(znode * left, znode * right);
70523 +
70524 +/* Functions called by tree_walk() when tree_walk() ... */
70525 +struct tree_walk_actor {
70526 + /* ... meets a formatted node, */
70527 + int (*process_znode) (tap_t *, void *);
70528 + /* ... meets an extent, */
70529 + int (*process_extent) (tap_t *, void *);
70530 + /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
70531 + * node or extent processing functions. */
70532 + int (*before) (void *);
70533 +};
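+
+/* A hypothetical actor definition (an editor's sketch, not part of the
+   original patch), shown only to illustrate how the three callbacks fit
+   together; every name below is invented for the sketch. */
+#if 0
+static int count_znode(tap_t * tap, void *opaque)
+{
+	(*(long *)opaque)++;	/* count each formatted node visited */
+	return 0;
+}
+
+static int count_extent(tap_t * tap, void *opaque)
+{
+	return 0;		/* extents are ignored in this sketch */
+}
+
+static int count_restart(void *opaque)
+{
+	*(long *)opaque = 0;	/* traversal (re)starts: reset the counter */
+	return 0;
+}
+
+static struct tree_walk_actor count_actor = {
+	.process_znode = count_znode,
+	.process_extent = count_extent,
+	.before = count_restart
+};
+#endif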
70534 +
70535 +#if REISER4_DEBUG
70536 +int check_sibling_list(znode * node);
70537 +#else
70538 +#define check_sibling_list(n) (1)
70539 +#endif
70540 +
70541 +#endif /* __FS_REISER4_TREE_WALK_H__ */
70542 +
70543 +/*
70544 + Local variables:
70545 + c-indentation-style: "K&R"
70546 + mode-name: "LC"
70547 + c-basic-offset: 8
70548 + tab-width: 8
70549 + fill-column: 120
70550 + End:
70551 +*/
70552 diff -urN linux-2.6.33.orig/fs/reiser4/txnmgr.c linux-2.6.33/fs/reiser4/txnmgr.c
70553 --- linux-2.6.33.orig/fs/reiser4/txnmgr.c 1970-01-01 01:00:00.000000000 +0100
70554 +++ linux-2.6.33/fs/reiser4/txnmgr.c 2010-03-04 19:33:22.000000000 +0100
70555 @@ -0,0 +1,3165 @@
70556 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70557 + * reiser4/README */
70558 +
70559 +/* Joshua MacDonald wrote the first draft of this code. */
70560 +
70561 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
70562 +filesystem scales only as well as its worst locking design. You need to
70563 +substantially restructure this code. Josh was not as experienced a programmer
70564 +as you. Particularly review how the locking style differs from what you did
70565 +for znodes using hi-lo priority locking, and present to me an opinion on
70566 +whether the differences are well founded. */
70567 +
70568 +/* I cannot help but to disagree with the sentiment above. Locking of
70569 + * transaction manager is _not_ badly designed, and, at the very least, is not
70570 + * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
70571 + * locking on znodes, especially on the root node of the tree. --nikita,
70572 + * 2003.10.13 */
70573 +
70574 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
70575 + txnmgr processes capture_block requests and manages the relationship between jnodes and
70576 + atoms through the various stages of a transcrash, and it also oversees the fusion and
70577 + capture-on-copy processes. The main difficulty with this task is maintaining a
70578 + deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
70579 + difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
70580 + must be broken. The main requirement is that atom-fusion be deadlock free, so once you
70581 + hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
70582 + that any time you check the atom-pointer of a jnode or handle and then try to lock that
70583 + atom, you must use trylock() and possibly reverse the order.
70584 +
70585 + This code implements the design documented at:
70586 +
70587 + http://namesys.com/txn-doc.html
70588 +
70589 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
70590 +above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
70591 +topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
70592 +year old --- define all technical terms used.
70593 +
70594 +*/
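+
+/* Editor's note (not part of the original patch): the lock-ordering rule
+   described above reduces to the following pattern, which txnh_get_atom()
+   and jnode_get_atom() below implement in full:
+
+	spin_lock_jnode(node);
+	atom = node->atom;
+	if (atom != NULL && !spin_trylock_atom(atom)) {
+		// trylock failed: pin the atom with a reference, drop the
+		// jnode lock, take the atom lock first, re-take the jnode
+		// lock, then re-check that node->atom still equals atom.
+	}
+*/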
70595 +
70596 +/* Thoughts on the external transaction interface:
70597 +
70598 + In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
70599 + creates state that lasts for the duration of a system call and is called at the start
70600 + of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
70601 + occupying the scope of a single system call. We wish to give certain applications an
70602 + interface to begin and close (commit) transactions. Since our implementation of
70603 + transactions does not yet support isolation, allowing an application to open a
70604 + transaction implies trusting it to later close the transaction. Part of the
70605 + transaction interface will be aimed at enabling that trust, but the interface for
70606 + actually using transactions is fairly narrow.
70607 +
70608 + BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
70609 + this identifier into a string that a shell-script could use, allowing you to start a
70610 + transaction by issuing a command. Once open, the transcrash should be set in the task
70611 + structure, and there should be options (I suppose) to allow it to be carried across
70612 + fork/exec. A transcrash has several options:
70613 +
70614 + - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
70615 + on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
70616 + capture on reads as well, it should set READ_FUSING.
70617 +
70618 + - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
70619 + eventually close (or else the machine must crash). If the application dies an
70620 + unexpected death with an open transcrash, for example, or if it hangs for a long
70621 + duration, one solution (to avoid crashing the machine) is to simply close it anyway.
70622 + This is a dangerous option, but it is one way to solve the problem until isolated
70623 + transcrashes are available for untrusted applications.
70624 +
70625 + It seems to be what databases do, though it is unclear how one avoids a DoS attack
70626 + creating a vulnerability based on resource starvation. Guaranteeing that some
70627 + minimum amount of computational resources are made available would seem more correct
70628 + than guaranteeing some amount of time. When we again have someone to code the work,
70629 + this issue should be considered carefully. -Hans
70630 +
70631 + RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
70632 + many dirty blocks it expects. The reserve_blocks interface should be called at a point
70633 + where it is safe for the application to fail, because the system may not be able to
70634 + grant the allocation and the application must be able to back-out. For this reason,
70635 + the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
70636 + the application may also wish to extend the allocation after beginning its transcrash.
70637 +
70638 + CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
70639 + modifications that require transaction protection. When isolated transactions are
70640 + supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
70641 + RESERVE_BLOCKS call fails for the application, it should "abort" by calling
70642 + CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
70643 + why, for safety, the application should call RESERVE_BLOCKS before making any changes).
70644 +
70645 +   For actually implementing these out-of-system-call-scoped transcrashes, the
70646 + reiser4_context has a "txn_handle *trans" pointer that may be set to an open
70647 + transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
70648 + "struct kmem_cache *_txnh_slab" created for that purpose in this file.
70649 +*/
70650 +
70651 +/* Extending the other system call interfaces for future transaction features:
70652 +
70653 + Specialized applications may benefit from passing flags to the ordinary system call
70654 + interface such as read(), write(), or stat(). For example, the application specifies
70655 + WRITE_FUSING by default but wishes to add that a certain read() command should be
70656 + treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
70657 +   read, or the file-data read? These issues are straightforward, but there are a lot of
70658 + them and adding the necessary flags-passing code will be tedious.
70659 +
70660 + When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
70661 + flag, which specifies that although it is a read operation being requested, a
70662 + write-lock should be taken. The reason is that read-locks are shared while write-locks
70663 + are exclusive, so taking a read-lock when a later-write is known in advance will often
70664 + leads to deadlock. If a reader knows it will write later, it should issue read
70665 + requests with the RMW flag set.
70666 +*/
70667 +
70668 +/*
70669 + The znode/atom deadlock avoidance.
70670 +
70671 + FIXME(Zam): writing of this comment is in progress.
70672 +
70673 +   The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
70674 +   atom locking, which makes the reiser4 locking scheme more complex. It had
70675 +   deadlocks until we implemented deadlock avoidance algorithms. Those
70676 +   deadlocks looked like the following: one stopped thread waits for a
70677 +   long-term lock on a znode, while the thread who owns that lock waits until
70678 +   fusion with another atom is allowed.
70679 +
70680 + The source of the deadlocks is an optimization of not capturing index nodes
70681 + for read. Let's prove it. Suppose we have dumb node capturing scheme which
70682 + unconditionally captures each block before locking it.
70683 +
70684 +   That scheme has no deadlocks. Let's begin with a thread whose atom's stage
70685 +   is ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread can't
70686 +   wait for a capture because its stage allows fusion with any atom except
70687 +   those which are currently being committed. The atom commit process can't
70688 +   deadlock because the atom commit procedure does not acquire locks and does
70689 +   not fuse with other atoms. Reiser4 does capturing right before going to
70690 +   sleep inside the longterm_lock_znode() function, which means the znode we
70691 +   want to lock is already captured and its atom is in the ASTAGE_CAPTURE_WAIT
70692 +   stage. If we continue the analysis, we see that no process in the sequence
70693 +   may wait for atom fusion. Thereby there are no deadlocks of the described kind.
70694 +
70695 +   The capturing optimization makes the deadlocks possible. A thread can wait
70696 +   for a lock whose owner did not capture that node. The lock owner's current
70697 +   atom is not fused with the first atom and does not get the
70698 +   ASTAGE_CAPTURE_WAIT state. A deadlock is possible when that atom meets
70699 +   another one which is in ASTAGE_CAPTURE_WAIT already.
70700 +
70701 + The deadlock avoidance scheme includes two algorithms:
70702 +
70703 +   The first algorithm is used when a thread captures a node which is locked
70704 +   but not captured by another thread. Such nodes are marked MISSED_IN_CAPTURE
70705 +   at the moment we skip their capturing. If such a node is being captured by a
70706 +   thread whose current atom is in ASTAGE_CAPTURE_WAIT, the routine which
70707 +   forces all lock owners to join the current atom is executed.
70708 +
70709 +   The second algorithm does not allow skipping the capture of already captured nodes.
70710 +
70711 +   Both algorithms together prevent waiting for a longterm lock without fusing
70712 +   with the atoms of all lock owners, which is exactly the ingredient of the
70713 +   atom/znode locking deadlocks described above.
70714 +*/
70715 +
70716 +/*
70717 + * Transactions and mmap(2).
70718 + *
70719 + * 1. Transactions are not supported for accesses through mmap(2), because
70720 + * this would effectively amount to user-level transactions whose duration
70721 + * is beyond control of the kernel.
70722 + *
70723 + * 2. That said, we still want to preserve some decency with regard to
70724 + * mmap(2). During normal write(2) call, following sequence of events
70725 + * happens:
70726 + *
70727 + * 1. page is created;
70728 + *
70729 + * 2. jnode is created, dirtied and captured into current atom.
70730 + *
70731 + * 3. extent is inserted and modified.
70732 + *
70733 + * Steps (2) and (3) take place under long term lock on the twig node.
70734 + *
70735 + * When file is accessed through mmap(2) page is always created during
70736 + * page fault.
70737 + * After this (in reiser4_readpage()->reiser4_readpage_extent()):
70738 + *
70739 + * 1. if access is made to non-hole page new jnode is created, (if
70740 + * necessary)
70741 + *
70742 + * 2. if access is made to the hole page, jnode is not created (XXX
70743 + * not clear why).
70744 + *
70745 + * Also, even if page is created by write page fault it is not marked
70746 + * dirty immediately by handle_mm_fault(). Probably this is to avoid races
70747 + * with page write-out.
70748 + *
70749 + * Dirty bit installed by hardware is only transferred to the struct page
70750 + * later, when page is unmapped (in zap_pte_range(), or
70751 + * try_to_unmap_one()).
70752 + *
70753 + * So, with mmap(2) we have to handle following irksome situations:
70754 + *
70755 + * 1. there exists modified page (clean or dirty) without jnode
70756 + *
70757 + * 2. there exists modified page (clean or dirty) with clean jnode
70758 + *
70759 + * 3. clean page which is a part of atom can be transparently modified
70760 + * at any moment through mapping without becoming dirty.
70761 + *
70762 + * (1) and (2) can lead to the out-of-memory situation: ->writepage()
70763 + * doesn't know what to do with such pages and ->sync_sb()/->writepages()
70764 + * don't see them, because these methods operate on atoms.
70765 + *
70766 + * (3) can lead to the loss of data: suppose we have dirty page with dirty
70767 + * captured jnode captured by some atom. As part of early flush (for
70768 + * example) page was written out. Dirty bit was cleared on both page and
70769 + * jnode. After this page is modified through mapping, but kernel doesn't
70770 + * notice and just discards page and jnode as part of commit. (XXX
70771 + * actually it doesn't, because to reclaim page ->releasepage() has to be
70772 + * called and before this dirty bit will be transferred to the struct
70773 + * page).
70774 + *
70775 + */
70776 +
70777 +#include "debug.h"
70778 +#include "txnmgr.h"
70779 +#include "jnode.h"
70780 +#include "znode.h"
70781 +#include "block_alloc.h"
70782 +#include "tree.h"
70783 +#include "wander.h"
70784 +#include "ktxnmgrd.h"
70785 +#include "super.h"
70786 +#include "page_cache.h"
70787 +#include "reiser4.h"
70788 +#include "vfs_ops.h"
70789 +#include "inode.h"
70790 +#include "flush.h"
70791 +
70792 +#include <asm/atomic.h>
70793 +#include <linux/types.h>
70794 +#include <linux/fs.h>
70795 +#include <linux/mm.h>
70796 +#include <linux/slab.h>
70797 +#include <linux/pagemap.h>
70798 +#include <linux/writeback.h>
70799 +#include <linux/swap.h> /* for totalram_pages */
70800 +
70801 +static void atom_free(txn_atom * atom);
70802 +
70803 +static int commit_txnh(txn_handle * txnh);
70804 +
70805 +static void wakeup_atom_waitfor_list(txn_atom * atom);
70806 +static void wakeup_atom_waiting_list(txn_atom * atom);
70807 +
70808 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
70809 +
70810 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
70811 +
70812 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
70813 +
70814 +static int capture_init_fusion(jnode * node, txn_handle * txnh,
70815 + txn_capture mode);
70816 +
70817 +static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
70818 +
70819 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
70820 +
70821 +void reiser4_invalidate_list(struct list_head *);
70822 +
70823 +/* GENERIC STRUCTURES */
70824 +
70825 +typedef struct _txn_wait_links txn_wait_links;
70826 +
70827 +struct _txn_wait_links {
70828 + lock_stack *_lock_stack;
70829 + struct list_head _fwaitfor_link;
70830 + struct list_head _fwaiting_link;
70831 + int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70832 + int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70833 +};
70834 +
70835 +/* FIXME: In theory, we should be using the slab cache init & destructor
70836 + methods instead of, e.g., jnode_init, etc. */
70837 +static struct kmem_cache *_atom_slab = NULL;
70838 +/* this is for user-visible, cross system-call transactions. */
70839 +static struct kmem_cache *_txnh_slab = NULL;
70840 +
70841 +/**
70842 + * init_txnmgr_static - create transaction manager slab caches
70843 + *
70844 + * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
70845 + * initialization.
70846 + */
70847 +int init_txnmgr_static(void)
70848 +{
70849 + assert("jmacd-600", _atom_slab == NULL);
70850 + assert("jmacd-601", _txnh_slab == NULL);
70851 +
70852 + ON_DEBUG(atomic_set(&flush_cnt, 0));
70853 +
70854 + _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
70855 + SLAB_HWCACHE_ALIGN |
70856 + SLAB_RECLAIM_ACCOUNT, NULL);
70857 + if (_atom_slab == NULL)
70858 + return RETERR(-ENOMEM);
70859 +
70860 + _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
70861 + SLAB_HWCACHE_ALIGN, NULL);
70862 + if (_txnh_slab == NULL) {
70863 + kmem_cache_destroy(_atom_slab);
70864 + _atom_slab = NULL;
70865 + return RETERR(-ENOMEM);
70866 + }
70867 +
70868 + return 0;
70869 +}
70870 +
70871 +/**
70872 + * done_txnmgr_static - delete txn_atom and txn_handle caches
70873 + *
70874 + * This is called on reiser4 module unloading or system shutdown.
70875 + */
70876 +void done_txnmgr_static(void)
70877 +{
70878 + destroy_reiser4_cache(&_atom_slab);
70879 + destroy_reiser4_cache(&_txnh_slab);
70880 +}
70881 +
70882 +/**
70883 + * reiser4_init_txnmgr - initialize a new transaction manager
70884 + * @mgr: pointer to transaction manager embedded in reiser4 super block
70885 + *
70886 + * This is called on mount. Makes necessary initializations.
70887 + */
70888 +void reiser4_init_txnmgr(txn_mgr *mgr)
70889 +{
70890 + assert("umka-169", mgr != NULL);
70891 +
70892 + mgr->atom_count = 0;
70893 + mgr->id_count = 1;
70894 + INIT_LIST_HEAD(&mgr->atoms_list);
70895 + spin_lock_init(&mgr->tmgr_lock);
70896 + mutex_init(&mgr->commit_mutex);
70897 +}
70898 +
70899 +/**
70900 + * reiser4_done_txnmgr - stop transaction manager
70901 + * @mgr: pointer to transaction manager embedded in reiser4 super block
70902 + *
70903 + * This is called on umount. Does sanity checks.
70904 + */
70905 +void reiser4_done_txnmgr(txn_mgr *mgr)
70906 +{
70907 + assert("umka-170", mgr != NULL);
70908 + assert("umka-1701", list_empty_careful(&mgr->atoms_list));
70909 + assert("umka-1702", mgr->atom_count == 0);
70910 +}
70911 +
70912 +/* Initialize a transaction handle. */
70913 +/* Audited by: umka (2002.06.13) */
70914 +static void txnh_init(txn_handle * txnh, txn_mode mode)
70915 +{
70916 + assert("umka-171", txnh != NULL);
70917 +
70918 + txnh->mode = mode;
70919 + txnh->atom = NULL;
70920 + reiser4_ctx_gfp_mask_set();
70921 + txnh->flags = 0;
70922 + spin_lock_init(&txnh->hlock);
70923 + INIT_LIST_HEAD(&txnh->txnh_link);
70924 +}
70925 +
70926 +#if REISER4_DEBUG
70927 +/* Check if a transaction handle is clean. */
70928 +static int txnh_isclean(txn_handle * txnh)
70929 +{
70930 + assert("umka-172", txnh != NULL);
70931 + return txnh->atom == NULL &&
70932 + LOCK_CNT_NIL(spin_locked_txnh);
70933 +}
70934 +#endif
70935 +
70936 +/* Initialize an atom. */
70937 +static void atom_init(txn_atom * atom)
70938 +{
70939 + int level;
70940 +
70941 + assert("umka-173", atom != NULL);
70942 +
70943 + memset(atom, 0, sizeof(txn_atom));
70944 +
70945 + atom->stage = ASTAGE_FREE;
70946 + atom->start_time = jiffies;
70947 +
70948 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
70949 + INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
70950 +
70951 + INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
70952 + INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
70953 + INIT_LIST_HEAD(ATOM_WB_LIST(atom));
70954 + INIT_LIST_HEAD(&atom->inodes);
70955 + spin_lock_init(&(atom->alock));
70956 + /* list of transaction handles */
70957 + INIT_LIST_HEAD(&atom->txnh_list);
70958 + /* link to transaction manager's list of atoms */
70959 + INIT_LIST_HEAD(&atom->atom_link);
70960 + INIT_LIST_HEAD(&atom->fwaitfor_list);
70961 + INIT_LIST_HEAD(&atom->fwaiting_list);
70962 + blocknr_set_init(&atom->delete_set);
70963 + blocknr_set_init(&atom->wandered_map);
70964 +
70965 + init_atom_fq_parts(atom);
70966 +}
70967 +
70968 +#if REISER4_DEBUG
70969 +/* Check if an atom is clean. */
70970 +static int atom_isclean(txn_atom * atom)
70971 +{
70972 + int level;
70973 +
70974 + assert("umka-174", atom != NULL);
70975 +
70976 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
70977 + if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
70978 + return 0;
70979 + }
70980 + }
70981 +
70982 + return atom->stage == ASTAGE_FREE &&
70983 + atom->txnh_count == 0 &&
70984 + atom->capture_count == 0 &&
70985 + atomic_read(&atom->refcount) == 0 &&
70986 + (&atom->atom_link == atom->atom_link.next &&
70987 + &atom->atom_link == atom->atom_link.prev) &&
70988 + list_empty_careful(&atom->txnh_list) &&
70989 + list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
70990 + list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
70991 + list_empty_careful(ATOM_WB_LIST(atom)) &&
70992 + list_empty_careful(&atom->fwaitfor_list) &&
70993 + list_empty_careful(&atom->fwaiting_list) &&
70994 + atom_fq_parts_are_clean(atom);
70995 +}
70996 +#endif
70997 +
70998 +/* Begin a transaction in this context. Currently this uses the reiser4_context's
70999 + trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
71000 + this will be extended to allow transaction handles to span several contexts. */
71001 +/* Audited by: umka (2002.06.13) */
71002 +void reiser4_txn_begin(reiser4_context * context)
71003 +{
71004 + assert("jmacd-544", context->trans == NULL);
71005 +
71006 + context->trans = &context->trans_in_ctx;
71007 +
71008 + /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
71009 + transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
71010 + stack allocated right now, but we would like to allow for dynamically allocated
71011 + transcrashes that span multiple system calls.
71012 + */
71013 + txnh_init(context->trans, TXN_WRITE_FUSING);
71014 +}
71015 +
71016 +/* Finish a transaction handle context. */
71017 +int reiser4_txn_end(reiser4_context * context)
71018 +{
71019 + long ret = 0;
71020 + txn_handle *txnh;
71021 +
71022 + assert("umka-283", context != NULL);
71023 + assert("nikita-3012", reiser4_schedulable());
71024 + assert("vs-24", context == get_current_context());
71025 + assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
71026 +
71027 + txnh = context->trans;
71028 + if (txnh != NULL) {
71029 + if (txnh->atom != NULL)
71030 + ret = commit_txnh(txnh);
71031 + assert("jmacd-633", txnh_isclean(txnh));
71032 + context->trans = NULL;
71033 + }
71034 + return ret;
71035 +}
71036 +
71037 +void reiser4_txn_restart(reiser4_context * context)
71038 +{
71039 + reiser4_txn_end(context);
71040 + reiser4_preempt_point();
71041 + reiser4_txn_begin(context);
71042 +}
71043 +
71044 +void reiser4_txn_restart_current(void)
71045 +{
71046 + reiser4_txn_restart(get_current_context());
71047 +}
71048 +
71049 +/* TXN_ATOM */
71050 +
71051 +/* Get the atom belonging to a txnh that is not locked on entry. Returns with
71052 +   the txnh locked, and with the atom locked if it is not NULL. This performs
71053 +   the necessary spin_trylock to break the lock-ordering cycle. May return NULL. */
71054 +static txn_atom *txnh_get_atom(txn_handle * txnh)
71055 +{
71056 + txn_atom *atom;
71057 +
71058 + assert("umka-180", txnh != NULL);
71059 + assert_spin_not_locked(&(txnh->hlock));
71060 +
71061 + while (1) {
71062 + spin_lock_txnh(txnh);
71063 + atom = txnh->atom;
71064 +
71065 + if (atom == NULL)
71066 + break;
71067 +
71068 + if (spin_trylock_atom(atom))
71069 + break;
71070 +
71071 + atomic_inc(&atom->refcount);
71072 +
71073 + spin_unlock_txnh(txnh);
71074 + spin_lock_atom(atom);
71075 + spin_lock_txnh(txnh);
71076 +
71077 + if (txnh->atom == atom) {
71078 + atomic_dec(&atom->refcount);
71079 + break;
71080 + }
71081 +
71082 + spin_unlock_txnh(txnh);
71083 + atom_dec_and_unlock(atom);
71084 + }
71085 +
71086 + return atom;
71087 +}
71088 +
71089 +/* Get the current atom and spin-lock it if the current atom is present. May return NULL */
71090 +txn_atom *get_current_atom_locked_nocheck(void)
71091 +{
71092 + reiser4_context *cx;
71093 + txn_atom *atom;
71094 + txn_handle *txnh;
71095 +
71096 + cx = get_current_context();
71097 + assert("zam-437", cx != NULL);
71098 +
71099 + txnh = cx->trans;
71100 + assert("zam-435", txnh != NULL);
71101 +
71102 + atom = txnh_get_atom(txnh);
71103 +
71104 + spin_unlock_txnh(txnh);
71105 + return atom;
71106 +}
71107 +
71108 +/* Get the atom belonging to a jnode, which is initially locked. Return with
71109 + both jnode and atom locked. This performs the necessary spin_trylock to
71110 + break the lock-ordering cycle. Assumes the jnode is already locked, and
71111 + returns NULL if atom is not set. */
71112 +txn_atom *jnode_get_atom(jnode * node)
71113 +{
71114 + txn_atom *atom;
71115 +
71116 + assert("umka-181", node != NULL);
71117 +
71118 + while (1) {
71119 + assert_spin_locked(&(node->guard));
71120 +
71121 + atom = node->atom;
71122 + /* node is not in any atom */
71123 + if (atom == NULL)
71124 + break;
71125 +
71126 + /* If atom is not locked, grab the lock and return */
71127 + if (spin_trylock_atom(atom))
71128 + break;
71129 +
71130 +		/* At least one jnode belongs to this atom; this guarantees that
71131 +		 * atom->refcount > 0, so we can safely increment the refcount. */
71132 + atomic_inc(&atom->refcount);
71133 + spin_unlock_jnode(node);
71134 +
71135 + /* re-acquire spin locks in the right order */
71136 + spin_lock_atom(atom);
71137 + spin_lock_jnode(node);
71138 +
71139 + /* check if node still points to the same atom. */
71140 + if (node->atom == atom) {
71141 + atomic_dec(&atom->refcount);
71142 + break;
71143 + }
71144 +
71145 +		/* releasing the atom lock and reference requires not holding
71146 +		 * locks on jnodes. */
71147 +		spin_unlock_jnode(node);
71148 +
71149 +		/* We are not sure that this atom has extra references apart
71150 +		 * from our own, so we must call the proper function, which
71151 +		 * may free the atom if the last reference is released. */
71152 +		atom_dec_and_unlock(atom);
71153 +
71154 +		/* lock the jnode again to get a valid node->atom pointer
71155 +		 * value. */
71156 + spin_lock_jnode(node);
71157 + }
71158 +
71159 + return atom;
71160 +}
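+
+/*
+ * Note on the two helpers above (not part of the original patch): both
+ * txnh_get_atom() and jnode_get_atom() take the atom lock "against" the
+ * usual lock ordering by the same protocol: try spin_trylock_atom() first;
+ * if that fails, pin the atom with an extra reference, drop the inner
+ * (txnh or jnode) lock, take both locks in the correct order, and then
+ * re-validate that the object still points to the same atom, since fusion
+ * may have changed the pointer while no lock was held.
+ */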
71161 +
71162 +/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
71163 + by flush code to indicate whether the next node (in some direction) is suitable for
71164 + flushing. */
71165 +int
71166 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
71167 +{
71168 + int compat;
71169 + txn_atom *atom;
71170 +
71171 + assert("umka-182", node != NULL);
71172 + assert("umka-183", check != NULL);
71173 +
71174 + /* Not sure what this function is supposed to do if supplied with @check that is
71175 + neither formatted nor unformatted (bitmap or so). */
71176 + assert("nikita-2373", jnode_is_znode(check)
71177 + || jnode_is_unformatted(check));
71178 +
71179 + /* Need a lock on CHECK to get its atom and to check various state bits.
71180 + Don't need a lock on NODE once we get the atom lock. */
71181 +	/* It is not enough to lock the two nodes and check (node->atom ==
71182 +	   check->atom), because an atom could be locked and in the middle of
71183 +	   being fused at that moment; jnodes of an atom in that state can point
71184 +	   to different atom objects, but the atom is logically the same. */
71185 + spin_lock_jnode(check);
71186 +
71187 + atom = jnode_get_atom(check);
71188 +
71189 + if (atom == NULL) {
71190 + compat = 0;
71191 + } else {
71192 + compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
71193 +
71194 + if (compat && jnode_is_znode(check)) {
71195 + compat &= znode_is_connected(JZNODE(check));
71196 + }
71197 +
71198 + if (compat && alloc_check) {
71199 + compat &= (alloc_value == jnode_is_flushprepped(check));
71200 + }
71201 +
71202 + spin_unlock_atom(atom);
71203 + }
71204 +
71205 + spin_unlock_jnode(check);
71206 +
71207 + return compat;
71208 +}
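+
+/*
+ * Illustrative sketch, not part of the original patch: how flush-style code
+ * might use same_slum_check() to ask whether a dirty neighbor belongs to the
+ * same atom as @node and may be flushed together with it.  Passing
+ * alloc_check == 0 skips the flushprepped comparison.  The function name is
+ * hypothetical.
+ */
+static int example_neighbor_in_same_slum(jnode * node, jnode * neighbor)
+{
+	return same_slum_check(node, neighbor, 0 /* alloc_check */, 0);
+}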
71209 +
71210 +/* Decrement the atom's reference count and if it falls to zero, free it. */
71211 +void atom_dec_and_unlock(txn_atom * atom)
71212 +{
71213 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
71214 +
71215 + assert("umka-186", atom != NULL);
71216 + assert_spin_locked(&(atom->alock));
71217 + assert("zam-1039", atomic_read(&atom->refcount) > 0);
71218 +
71219 + if (atomic_dec_and_test(&atom->refcount)) {
71220 + /* take txnmgr lock and atom lock in proper order. */
71221 + if (!spin_trylock_txnmgr(mgr)) {
71222 + /* This atom should exist after we re-acquire its
71223 + * spinlock, so we increment its reference counter. */
71224 + atomic_inc(&atom->refcount);
71225 + spin_unlock_atom(atom);
71226 + spin_lock_txnmgr(mgr);
71227 + spin_lock_atom(atom);
71228 +
71229 + if (!atomic_dec_and_test(&atom->refcount)) {
71230 + spin_unlock_atom(atom);
71231 + spin_unlock_txnmgr(mgr);
71232 + return;
71233 + }
71234 + }
71235 + assert_spin_locked(&(mgr->tmgr_lock));
71236 + atom_free(atom);
71237 + spin_unlock_txnmgr(mgr);
71238 + } else
71239 + spin_unlock_atom(atom);
71240 +}
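+
+/*
+ * Note (not part of the original patch): atom_dec_and_unlock() must be
+ * entered with the atom spin-locked; it always returns with the lock
+ * dropped, and when the last reference is released the atom has been freed,
+ * so callers must not touch the atom pointer after the call.
+ */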
71241 +
71242 +/* Create new atom and connect it to given transaction handle. This adds the
71243 + atom to the transaction manager's list and sets its reference count to 1, an
71244 + artificial reference which is kept until it commits. We play strange games
71245 + to avoid allocation under jnode & txnh spinlocks.*/
71246 +
71247 +static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
71248 +{
71249 + txn_atom *atom;
71250 + txn_mgr *mgr;
71251 +
71252 + if (REISER4_DEBUG && rofs_tree(current_tree)) {
71253 + warning("nikita-3366", "Creating atom on rofs");
71254 + dump_stack();
71255 + }
71256 +
71257 + if (*atom_alloc == NULL) {
71258 + (*atom_alloc) = kmem_cache_alloc(_atom_slab,
71259 + reiser4_ctx_gfp_mask_get());
71260 +
71261 + if (*atom_alloc == NULL)
71262 + return RETERR(-ENOMEM);
71263 + }
71264 +
71265 + /* and, also, txnmgr spin lock should be taken before jnode and txnh
71266 + locks. */
71267 + mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
71268 + spin_lock_txnmgr(mgr);
71269 + spin_lock_txnh(txnh);
71270 +
71271 + /* Check whether new atom still needed */
71272 + if (txnh->atom != NULL) {
71273 +		/* NOTE-NIKITA it is probably better to free atom_alloc here
71274 +		 * than to thread it up to reiser4_try_capture() */
71275 +
71276 + spin_unlock_txnh(txnh);
71277 + spin_unlock_txnmgr(mgr);
71278 +
71279 + return -E_REPEAT;
71280 + }
71281 +
71282 + atom = *atom_alloc;
71283 + *atom_alloc = NULL;
71284 +
71285 + atom_init(atom);
71286 +
71287 + assert("jmacd-17", atom_isclean(atom));
71288 +
71289 + /*
71290 +	 * lock ordering is broken here. It is ok, as long as @atom is new
71291 +	 * and inaccessible for others. We can't use spin_lock_atom() or
71292 +	 * spin_lock(&atom->alock) because they care about locking
71293 +	 * dependencies; spin_trylock_atom() doesn't.
71294 + */
71295 + check_me("", spin_trylock_atom(atom));
71296 +
71297 + /* add atom to the end of transaction manager's list of atoms */
71298 + list_add_tail(&atom->atom_link, &mgr->atoms_list);
71299 + atom->atom_id = mgr->id_count++;
71300 + mgr->atom_count += 1;
71301 +
71302 + /* Release txnmgr lock */
71303 + spin_unlock_txnmgr(mgr);
71304 +
71305 + /* One reference until it commits. */
71306 + atomic_inc(&atom->refcount);
71307 + atom->stage = ASTAGE_CAPTURE_FUSE;
71308 + atom->super = reiser4_get_current_sb();
71309 + capture_assign_txnh_nolock(atom, txnh);
71310 +
71311 + spin_unlock_atom(atom);
71312 + spin_unlock_txnh(txnh);
71313 +
71314 + return -E_REPEAT;
71315 +}
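+
+/*
+ * Illustrative sketch, not part of the original patch: the caller-side shape
+ * of the -E_REPEAT protocol around atom_begin_and_assign_to_txnh().  The new
+ * atom is preallocated through *atom_alloc outside of any spinlock.  Note
+ * that the function returns -E_REPEAT even on success, telling the caller to
+ * restart its capture attempt; only -ENOMEM is a real failure.  Any leftover
+ * preallocation is freed by the caller (compare reiser4_try_capture() below).
+ * The function name is hypothetical.
+ */
+static int example_open_atom(txn_handle * txnh)
+{
+	txn_atom *atom_alloc = NULL;
+	int ret;
+
+	do {
+		ret = atom_begin_and_assign_to_txnh(&atom_alloc, txnh);
+	} while (ret == -E_REPEAT && txnh->atom == NULL);
+
+	if (atom_alloc != NULL)
+		kmem_cache_free(_atom_slab, atom_alloc);
+	return ret == -E_REPEAT ? 0 : ret;
+}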
71316 +
71317 +/* Return true if an atom is currently "open". */
71318 +static int atom_isopen(const txn_atom * atom)
71319 +{
71320 + assert("umka-185", atom != NULL);
71321 +
71322 + return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
71323 +}
71324 +
71325 +/* Return the number of pointers to this atom that must be updated during fusion. This
71326 + approximates the amount of work to be done. Fusion chooses the atom with fewer
71327 + pointers to fuse into the atom with more pointers. */
71328 +static int atom_pointer_count(const txn_atom * atom)
71329 +{
71330 + assert("umka-187", atom != NULL);
71331 +
71332 + /* This is a measure of the amount of work needed to fuse this atom
71333 + * into another. */
71334 + return atom->txnh_count + atom->capture_count;
71335 +}
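+
+/*
+ * Illustrative sketch, not part of the original patch: picking the fusion
+ * direction from atom_pointer_count(), per the comment above -- the atom
+ * with fewer pointers is fused into the atom with more.  Names are
+ * hypothetical; this does not perform the fusion itself.
+ */
+static void example_choose_fuse_direction(txn_atom * a, txn_atom * b,
+					  txn_atom ** small, txn_atom ** large)
+{
+	if (atom_pointer_count(a) <= atom_pointer_count(b)) {
+		*small = a;
+		*large = b;
+	} else {
+		*small = b;
+		*large = a;
+	}
+}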
71336 +
71337 +/* Called holding the atom lock, this removes the atom from the transaction manager list
71338 + and frees it. */
71339 +static void atom_free(txn_atom * atom)
71340 +{
71341 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
71342 +
71343 + assert("umka-188", atom != NULL);
71344 + assert_spin_locked(&(atom->alock));
71345 +
71346 + /* Remove from the txn_mgr's atom list */
71347 + assert_spin_locked(&(mgr->tmgr_lock));
71348 + mgr->atom_count -= 1;
71349 + list_del_init(&atom->atom_link);
71350 +
71351 + /* Clean the atom */
71352 + assert("jmacd-16",
71353 + (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
71354 + atom->stage = ASTAGE_FREE;
71355 +
71356 + blocknr_set_destroy(&atom->delete_set);
71357 + blocknr_set_destroy(&atom->wandered_map);
71358 +
71359 + assert("jmacd-16", atom_isclean(atom));
71360 +
71361 + spin_unlock_atom(atom);
71362 +
71363 + kmem_cache_free(_atom_slab, atom);
71364 +}
71365 +
71366 +static int atom_is_dotard(const txn_atom * atom)
71367 +{
71368 + return time_after(jiffies, atom->start_time +
71369 + get_current_super_private()->tmgr.atom_max_age);
71370 +}
71371 +
71372 +static int atom_can_be_committed(txn_atom * atom)
71373 +{
71374 + assert_spin_locked(&(atom->alock));
71375 + assert("zam-885", atom->txnh_count > atom->nr_waiters);
71376 + return atom->txnh_count == atom->nr_waiters + 1;
71377 +}
71378 +
71379 +/* Return true if an atom should commit now. This is determined by aging, atom
71380 + size or atom flags. */
71381 +static int atom_should_commit(const txn_atom * atom)
71382 +{
71383 + assert("umka-189", atom != NULL);
71384 + return
71385 + (atom->flags & ATOM_FORCE_COMMIT) ||
71386 + ((unsigned)atom_pointer_count(atom) >
71387 + get_current_super_private()->tmgr.atom_max_size)
71388 + || atom_is_dotard(atom);
71389 +}
71390 +
71391 +/* return 1 if current atom exists and requires commit. */
71392 +int current_atom_should_commit(void)
71393 +{
71394 + txn_atom *atom;
71395 + int result = 0;
71396 +
71397 + atom = get_current_atom_locked_nocheck();
71398 + if (atom) {
71399 + result = atom_should_commit(atom);
71400 + spin_unlock_atom(atom);
71401 + }
71402 + return result;
71403 +}
71404 +
71405 +static int atom_should_commit_asap(const txn_atom * atom)
71406 +{
71407 + unsigned int captured;
71408 + unsigned int pinnedpages;
71409 +
71410 + assert("nikita-3309", atom != NULL);
71411 +
71412 + captured = (unsigned)atom->capture_count;
71413 + pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
71414 +
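+	/* pinnedpages above is a coarse heuristic estimate of the memory
+	 * pinned by the atom's captured nodes: the atom is committed asap
+	 * once that estimate exceeds 1/8 of RAM, or once the atom has
+	 * already been flushed more than 100 times. */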
71415 + return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
71416 +}
71417 +
71418 +static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
71419 +{
71420 + jnode *first_dirty;
71421 +
71422 + list_for_each_entry(first_dirty, head, capture_link) {
71423 + if (!(flags & JNODE_FLUSH_COMMIT)) {
71424 + /*
71425 + * skip jnodes which "heard banshee" or having active
71426 + * I/O
71427 + */
71428 + if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
71429 + JF_ISSET(first_dirty, JNODE_WRITEBACK))
71430 + continue;
71431 + }
71432 + return first_dirty;
71433 + }
71434 + return NULL;
71435 +}
71436 +
71437 +/* Get the first dirty node from the atom's dirty_nodes[n] lists; return NULL if the atom
71438 +   has no dirty nodes on any of its lists */
71439 +jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
71440 +{
71441 + jnode *first_dirty;
71442 + tree_level level;
71443 +
71444 + assert_spin_locked(&(atom->alock));
71445 +
71446 + /* The flush starts from LEAF_LEVEL (=1). */
71447 + for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
71448 + if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
71449 + continue;
71450 +
71451 + first_dirty =
71452 + find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
71453 + flags);
71454 + if (first_dirty)
71455 + return first_dirty;
71456 + }
71457 +
71458 + /* znode-above-root is on the list #0. */
71459 + return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
71460 +}
71461 +
71462 +static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
71463 +{
71464 + jnode *cur;
71465 +
71466 + assert("zam-905", atom_is_protected(atom));
71467 +
71468 + cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
71469 + while (ATOM_WB_LIST(atom) != &cur->capture_link) {
71470 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
71471 +
71472 + spin_lock_jnode(cur);
71473 + if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
71474 + if (JF_ISSET(cur, JNODE_DIRTY)) {
71475 + queue_jnode(fq, cur);
71476 + } else {
71477 + /* move jnode to atom's clean list */
71478 + list_move_tail(&cur->capture_link,
71479 + ATOM_CLEAN_LIST(atom));
71480 + }
71481 + }
71482 + spin_unlock_jnode(cur);
71483 +
71484 + cur = next;
71485 + }
71486 +}
71487 +
71488 +/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
71489 + * jnodes to disk. */
71490 +static int submit_wb_list(void)
71491 +{
71492 + int ret;
71493 + flush_queue_t *fq;
71494 +
71495 + fq = get_fq_for_current_atom();
71496 + if (IS_ERR(fq))
71497 + return PTR_ERR(fq);
71498 +
71499 + dispatch_wb_list(fq->atom, fq);
71500 + spin_unlock_atom(fq->atom);
71501 +
71502 + ret = reiser4_write_fq(fq, NULL, 1);
71503 + reiser4_fq_put(fq);
71504 +
71505 + return ret;
71506 +}
71507 +
71508 +/* Wait completion of all writes, re-submit atom writeback list if needed. */
71509 +static int current_atom_complete_writes(void)
71510 +{
71511 + int ret;
71512 +
71513 +	/* Each jnode on that list was modified and dirtied while it already
71514 +	 * had an i/o request running. After i/o completion we have to
71515 +	 * resubmit them to disk again. */
71516 + ret = submit_wb_list();
71517 + if (ret < 0)
71518 + return ret;
71519 +
71520 +	/* Wait for all i/o to complete */
71521 + ret = current_atom_finish_all_fq();
71522 + if (ret)
71523 + return ret;
71524 +
71525 +	/* Scan the wb list again; all i/o should be completed by now, so we
71526 +	 * re-submit any dirty nodes to disk */
71527 + ret = submit_wb_list();
71528 + if (ret < 0)
71529 + return ret;
71530 +
71531 +	/* Wait for all nodes we just submitted */
71532 + return current_atom_finish_all_fq();
71533 +}
71534 +
71535 +#if REISER4_DEBUG
71536 +
71537 +static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
71538 +{
71539 + if (atom == NULL) {
71540 + printk("%s: no atom\n", prefix);
71541 + return;
71542 + }
71543 +
71544 + printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
71545 + " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
71546 + atomic_read(&atom->refcount), atom->atom_id, atom->flags,
71547 + atom->txnh_count, atom->capture_count, atom->stage,
71548 + atom->start_time, atom->flushed);
71549 +}
71550 +
71551 +#else /* REISER4_DEBUG */
71552 +
71553 +static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
71554 +
71555 +#endif /* REISER4_DEBUG */
71556 +
71557 +#define TOOMANYFLUSHES (1 << 13)
71558 +
71559 +/* Called with the atom locked and no open "active" transaction handles except
71560 +   ours, this function calls flush_current_atom() until all dirty nodes are
71561 + processed. Then it initiates commit processing.
71562 +
71563 + Called by the single remaining open "active" txnh, which is closing. Other
71564 +   open txnhs belong to processes which wait for atom commit in commit_txnh()
71565 + routine. They are counted as "waiters" in atom->nr_waiters. Therefore as
71566 + long as we hold the atom lock none of the jnodes can be captured and/or
71567 + locked.
71568 +
71569 + Return value is an error code if commit fails.
71570 +*/
71571 +static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
71572 +{
71573 + reiser4_super_info_data *sbinfo = get_current_super_private();
71574 + long ret = 0;
71575 + /* how many times jnode_flush() was called as a part of attempt to
71576 + * commit this atom. */
71577 + int flushiters;
71578 +
71579 + assert("zam-888", atom != NULL && *atom != NULL);
71580 + assert_spin_locked(&((*atom)->alock));
71581 + assert("zam-887", get_current_context()->trans->atom == *atom);
71582 + assert("jmacd-151", atom_isopen(*atom));
71583 +
71584 + assert("nikita-3184",
71585 + get_current_super_private()->delete_mutex_owner != current);
71586 +
71587 + for (flushiters = 0;; ++flushiters) {
71588 + ret =
71589 + flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
71590 + JNODE_FLUSH_COMMIT,
71591 + LONG_MAX /* nr_to_write */ ,
71592 + nr_submitted, atom, NULL);
71593 + if (ret != -E_REPEAT)
71594 + break;
71595 +
71596 + /* if atom's dirty list contains one znode which is
71597 + HEARD_BANSHEE and is locked we have to allow lock owner to
71598 + continue and uncapture that znode */
71599 + reiser4_preempt_point();
71600 +
71601 + *atom = get_current_atom_locked();
71602 + if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
71603 + warning("nikita-3176",
71604 + "Flushing like mad: %i", flushiters);
71605 + reiser4_info_atom("atom", *atom);
71606 + DEBUGON(flushiters > (1 << 20));
71607 + }
71608 + }
71609 +
71610 + if (ret)
71611 + return ret;
71612 +
71613 + assert_spin_locked(&((*atom)->alock));
71614 +
71615 + if (!atom_can_be_committed(*atom)) {
71616 + spin_unlock_atom(*atom);
71617 + return RETERR(-E_REPEAT);
71618 + }
71619 +
71620 + if ((*atom)->capture_count == 0)
71621 + goto done;
71622 +
71623 +	/* Up to this point we have been flushing; each flush pass returned
71624 +	   -E_REPEAT. Now we can commit, and from here on we may no longer
71625 +	   return -E_REPEAT: the commit must succeed. */
71626 + reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
71627 + ON_DEBUG(((*atom)->committer = current));
71628 + spin_unlock_atom(*atom);
71629 +
71630 + ret = current_atom_complete_writes();
71631 + if (ret)
71632 + return ret;
71633 +
71634 + assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
71635 +
71636 + /* isolate critical code path which should be executed by only one
71637 + * thread using tmgr mutex */
71638 + mutex_lock(&sbinfo->tmgr.commit_mutex);
71639 +
71640 + ret = reiser4_write_logs(nr_submitted);
71641 + if (ret < 0)
71642 + reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
71643 +
71644 +	/* The atom->ovrwr_nodes list is processed with the commit mutex held
71645 +	   because of bitmap nodes, which are captured in a special way in
71646 +	   reiser4_pre_commit_hook_bitmap(); that way does not include
71647 +	   capture_fuse_wait() as the capturing of other nodes does -- the
71648 +	   commit mutex is used for transaction isolation instead. */
71649 + reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
71650 + mutex_unlock(&sbinfo->tmgr.commit_mutex);
71651 +
71652 + reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
71653 + reiser4_invalidate_list(ATOM_WB_LIST(*atom));
71654 + assert("zam-927", list_empty(&(*atom)->inodes));
71655 +
71656 + spin_lock_atom(*atom);
71657 + done:
71658 + reiser4_atom_set_stage(*atom, ASTAGE_DONE);
71659 + ON_DEBUG((*atom)->committer = NULL);
71660 +
71661 + /* Atom's state changes, so wake up everybody waiting for this
71662 + event. */
71663 + wakeup_atom_waiting_list(*atom);
71664 +
71665 + /* Decrement the "until commit" reference, at least one txnh (the caller) is
71666 + still open. */
71667 + atomic_dec(&(*atom)->refcount);
71668 +
71669 + assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
71670 + assert("jmacd-1062", (*atom)->capture_count == 0);
71671 + BUG_ON((*atom)->capture_count != 0);
71672 + assert_spin_locked(&((*atom)->alock));
71673 +
71674 + return ret;
71675 +}
71676 +
71677 +/* TXN_TXNH */
71678 +
71679 +/**
71680 + * force_commit_atom - commit current atom and wait for commit completion
71681 + * @txnh: transaction handle attached to the current atom
71682 + *
71683 + * Commits the current atom and waits for commit completion; the current atom and
71684 + * @txnh have to be spinlocked before the call, and this function unlocks them on exit.
71685 + */
71686 +int force_commit_atom(txn_handle *txnh)
71687 +{
71688 + txn_atom *atom;
71689 +
71690 + assert("zam-837", txnh != NULL);
71691 + assert_spin_locked(&(txnh->hlock));
71692 + assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
71693 +
71694 + atom = txnh->atom;
71695 +
71696 + assert("zam-834", atom != NULL);
71697 + assert_spin_locked(&(atom->alock));
71698 +
71699 + /*
71700 + * Set flags for atom and txnh: forcing atom commit and waiting for
71701 + * commit completion
71702 + */
71703 + txnh->flags |= TXNH_WAIT_COMMIT;
71704 + atom->flags |= ATOM_FORCE_COMMIT;
71705 +
71706 + spin_unlock_txnh(txnh);
71707 + spin_unlock_atom(atom);
71708 +
71709 + /* commit is here */
71710 + reiser4_txn_restart_current();
71711 + return 0;
71712 +}
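+
+/*
+ * Illustrative sketch, not part of the original patch: how an fsync-like
+ * path could use force_commit_atom().  txnh_get_atom() returns with the
+ * txnh spin-locked and, when non-NULL, the atom spin-locked, which is
+ * exactly the state force_commit_atom() expects; it unlocks both itself.
+ * The function name is hypothetical.
+ */
+static int example_force_commit_current(void)
+{
+	txn_handle *txnh = get_current_context()->trans;
+	txn_atom *atom;
+
+	atom = txnh_get_atom(txnh);
+	if (atom == NULL) {
+		/* no atom: nothing to commit */
+		spin_unlock_txnh(txnh);
+		return 0;
+	}
+	return force_commit_atom(txnh);
+}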
71713 +
71714 +/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
71715 + * whether we commit all atoms, including new ones created after this function
71716 + * is called. */
71717 +int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
71718 +{
71719 + int ret;
71720 + txn_atom *atom;
71721 + txn_mgr *mgr;
71722 + txn_handle *txnh;
71723 + unsigned long start_time = jiffies;
71724 + reiser4_context *ctx = get_current_context();
71725 +
71726 + assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
71727 + assert("nikita-3058", reiser4_commit_check_locks());
71728 +
71729 + reiser4_txn_restart_current();
71730 +
71731 + mgr = &get_super_private(super)->tmgr;
71732 +
71733 + txnh = ctx->trans;
71734 +
71735 + again:
71736 +
71737 + spin_lock_txnmgr(mgr);
71738 +
71739 + list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
71740 + spin_lock_atom(atom);
71741 +
71742 +		/* Commit any atom which can be committed. If @commit_all_atoms
71743 +		 * is not set we commit only atoms which were created before
71744 +		 * this call started. */
71745 + if (commit_all_atoms
71746 + || time_before_eq(atom->start_time, start_time)) {
71747 + if (atom->stage <= ASTAGE_POST_COMMIT) {
71748 + spin_unlock_txnmgr(mgr);
71749 +
71750 + if (atom->stage < ASTAGE_PRE_COMMIT) {
71751 + spin_lock_txnh(txnh);
71752 + /* Add force-context txnh */
71753 + capture_assign_txnh_nolock(atom, txnh);
71754 + ret = force_commit_atom(txnh);
71755 + if (ret)
71756 + return ret;
71757 + } else
71758 +				/* wait for atom commit */
71759 + reiser4_atom_wait_event(atom);
71760 +
71761 + goto again;
71762 + }
71763 + }
71764 +
71765 + spin_unlock_atom(atom);
71766 + }
71767 +
71768 +#if REISER4_DEBUG
71769 + if (commit_all_atoms) {
71770 + reiser4_super_info_data *sbinfo = get_super_private(super);
71771 + spin_lock_reiser4_super(sbinfo);
71772 + assert("zam-813",
71773 + sbinfo->blocks_fake_allocated_unformatted == 0);
71774 + assert("zam-812", sbinfo->blocks_fake_allocated == 0);
71775 + spin_unlock_reiser4_super(sbinfo);
71776 + }
71777 +#endif
71778 +
71779 + spin_unlock_txnmgr(mgr);
71780 +
71781 + return 0;
71782 +}
71783 +
71784 +/* check whether commit_some_atoms() can commit @atom. Locking is up to the
71785 + * caller */
71786 +static int atom_is_committable(txn_atom * atom)
71787 +{
71788 + return
71789 + atom->stage < ASTAGE_PRE_COMMIT &&
71790 + atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
71791 +}
71792 +
71793 +/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
71794 + * lock at exit */
71795 +int commit_some_atoms(txn_mgr * mgr)
71796 +{
71797 + int ret = 0;
71798 + txn_atom *atom;
71799 + txn_handle *txnh;
71800 + reiser4_context *ctx;
71801 + struct list_head *pos, *tmp;
71802 +
71803 + ctx = get_current_context();
71804 + assert("nikita-2444", ctx != NULL);
71805 +
71806 + txnh = ctx->trans;
71807 + spin_lock_txnmgr(mgr);
71808 +
71809 + /*
71810 +	 * this is to avoid a gcc complaint that atom might be used
71811 +	 * uninitialized
71812 + */
71813 + atom = NULL;
71814 +
71815 + /* look for atom to commit */
71816 + list_for_each_safe(pos, tmp, &mgr->atoms_list) {
71817 + atom = list_entry(pos, txn_atom, atom_link);
71818 + /*
71819 + * first test without taking atom spin lock, whether it is
71820 + * eligible for committing at all
71821 + */
71822 + if (atom_is_committable(atom)) {
71823 + /* now, take spin lock and re-check */
71824 + spin_lock_atom(atom);
71825 + if (atom_is_committable(atom))
71826 + break;
71827 + spin_unlock_atom(atom);
71828 + }
71829 + }
71830 +
71831 + ret = (&mgr->atoms_list == pos);
71832 + spin_unlock_txnmgr(mgr);
71833 +
71834 + if (ret) {
71835 + /* nothing found */
71836 + spin_unlock(&mgr->daemon->guard);
71837 + return 0;
71838 + }
71839 +
71840 + spin_lock_txnh(txnh);
71841 +
71842 + BUG_ON(atom == NULL);
71843 + /* Set the atom to force committing */
71844 + atom->flags |= ATOM_FORCE_COMMIT;
71845 +
71846 + /* Add force-context txnh */
71847 + capture_assign_txnh_nolock(atom, txnh);
71848 +
71849 + spin_unlock_txnh(txnh);
71850 + spin_unlock_atom(atom);
71851 +
71852 +	/* we are about to release the daemon spin lock; notify the daemon
71853 +	   that it has to rescan atoms */
71854 + mgr->daemon->rescan = 1;
71855 + spin_unlock(&mgr->daemon->guard);
71856 + reiser4_txn_restart_current();
71857 + return 0;
71858 +}
71859 +
71860 +static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
71861 +{
71862 + int atom_stage;
71863 + txn_atom *atom_2;
71864 + int repeat;
71865 +
71866 + assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
71867 +
71868 + atom_stage = atom->stage;
71869 + repeat = 0;
71870 +
71871 + if (!spin_trylock_txnmgr(tmgr)) {
71872 + atomic_inc(&atom->refcount);
71873 + spin_unlock_atom(atom);
71874 + spin_lock_txnmgr(tmgr);
71875 + spin_lock_atom(atom);
71876 + repeat = 1;
71877 + if (atom->stage != atom_stage) {
71878 + spin_unlock_txnmgr(tmgr);
71879 + atom_dec_and_unlock(atom);
71880 + return -E_REPEAT;
71881 + }
71882 + atomic_dec(&atom->refcount);
71883 + }
71884 +
71885 + list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
71886 + if (atom == atom_2)
71887 + continue;
71888 + /*
71889 + * if trylock does not succeed we just do not fuse with that
71890 + * atom.
71891 + */
71892 + if (spin_trylock_atom(atom_2)) {
71893 + if (atom_2->stage < ASTAGE_PRE_COMMIT) {
71894 + spin_unlock_txnmgr(tmgr);
71895 + capture_fuse_into(atom_2, atom);
71896 +				/* all locks are lost; we can only repeat here */
71897 + return -E_REPEAT;
71898 + }
71899 + spin_unlock_atom(atom_2);
71900 + }
71901 + }
71902 + atom->flags |= ATOM_CANCEL_FUSION;
71903 + spin_unlock_txnmgr(tmgr);
71904 + if (repeat) {
71905 + spin_unlock_atom(atom);
71906 + return -E_REPEAT;
71907 + }
71908 + return 0;
71909 +}
71910 +
71911 +/* Calls jnode_flush() for the current atom if it exists; if not, just takes
71912 +   another atom and calls jnode_flush() for it. If the current transaction
71913 +   handle already has an assigned atom (the current atom) we have to close
71914 +   the current transaction before switching to another atom, or do something
71915 +   with the current atom. This code tries to flush the current atom.
71916 +
71917 +   flush_some_atom() is called as part of the memory-reclaim process. It is
71918 +   invoked from balance_dirty_pages(), pdflushd, and entd.
71919 +
71920 +   If no nodes can be flushed, the atom is committed, because this frees memory.
71921 +
71922 +   If the atom is too large or too old it is committed as well.
71923 +*/
71924 +int
71925 +flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
71926 + int flags)
71927 +{
71928 + reiser4_context *ctx = get_current_context();
71929 + txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
71930 + txn_handle *txnh = ctx->trans;
71931 + txn_atom *atom;
71932 + int ret;
71933 +
71934 + BUG_ON(wbc->nr_to_write == 0);
71935 + BUG_ON(*nr_submitted != 0);
71936 + assert("zam-1042", txnh != NULL);
71937 + repeat:
71938 + if (txnh->atom == NULL) {
71939 + /* current atom is not available, take first from txnmgr */
71940 + spin_lock_txnmgr(tmgr);
71941 +
71942 + /* traverse the list of all atoms */
71943 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71944 + /* lock atom before checking its state */
71945 + spin_lock_atom(atom);
71946 +
71947 + /*
71948 + * we need an atom which is not being committed and
71949 + * which has no flushers (jnode_flush() add one flusher
71950 + * at the beginning and subtract one at the end).
71951 + */
71952 + if (atom->stage < ASTAGE_PRE_COMMIT &&
71953 + atom->nr_flushers == 0) {
71954 + spin_lock_txnh(txnh);
71955 + capture_assign_txnh_nolock(atom, txnh);
71956 + spin_unlock_txnh(txnh);
71957 +
71958 + goto found;
71959 + }
71960 +
71961 + spin_unlock_atom(atom);
71962 + }
71963 +
71964 + /*
71965 +		 * Write throttling: the case where no atom can be
71966 +		 * flushed or committed.
71967 + */
71968 + if (!current_is_flush_bd_task() && !wbc->nonblocking) {
71969 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71970 + spin_lock_atom(atom);
71971 + /* Repeat the check from the above. */
71972 + if (atom->stage < ASTAGE_PRE_COMMIT
71973 + && atom->nr_flushers == 0) {
71974 + spin_lock_txnh(txnh);
71975 + capture_assign_txnh_nolock(atom, txnh);
71976 + spin_unlock_txnh(txnh);
71977 +
71978 + goto found;
71979 + }
71980 + if (atom->stage <= ASTAGE_POST_COMMIT) {
71981 + spin_unlock_txnmgr(tmgr);
71982 + /*
71983 + * we just wait until atom's flusher
71984 + * makes a progress in flushing or
71985 + * committing the atom
71986 + */
71987 + reiser4_atom_wait_event(atom);
71988 + goto repeat;
71989 + }
71990 + spin_unlock_atom(atom);
71991 + }
71992 + }
71993 + spin_unlock_txnmgr(tmgr);
71994 + return 0;
71995 + found:
71996 + spin_unlock_txnmgr(tmgr);
71997 + } else
71998 + atom = get_current_atom_locked();
71999 +
72000 + BUG_ON(atom->super != ctx->super);
72001 + assert("vs-35", atom->super == ctx->super);
72002 + if (start) {
72003 + spin_lock_jnode(start);
72004 + ret = (atom == start->atom) ? 1 : 0;
72005 + spin_unlock_jnode(start);
72006 + if (ret == 0)
72007 + start = NULL;
72008 + }
72009 + ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
72010 + if (ret == 0) {
72011 +		/* flush_current_atom() returns 0 only if it submitted nothing
72012 +		   for writing */
72013 + BUG_ON(*nr_submitted != 0);
72014 + if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
72015 + if (atom->capture_count < tmgr->atom_min_size &&
72016 + !(atom->flags & ATOM_CANCEL_FUSION)) {
72017 + ret = txn_try_to_fuse_small_atom(tmgr, atom);
72018 + if (ret == -E_REPEAT) {
72019 + reiser4_preempt_point();
72020 + goto repeat;
72021 + }
72022 + }
72023 + /* if early flushing could not make more nodes clean,
72024 + * or atom is too old/large,
72025 + * we force current atom to commit */
72026 + /* wait for commit completion but only if this
72027 + * wouldn't stall pdflushd and ent thread. */
72028 + if (!wbc->nonblocking && !ctx->entd)
72029 + txnh->flags |= TXNH_WAIT_COMMIT;
72030 + atom->flags |= ATOM_FORCE_COMMIT;
72031 + }
72032 + spin_unlock_atom(atom);
72033 + } else if (ret == -E_REPEAT) {
72034 + if (*nr_submitted == 0) {
72035 +			/* let others who hamper flushing (by holding long-term
72036 +			   locks, for instance) free the way for flush */
72037 + reiser4_preempt_point();
72038 + goto repeat;
72039 + }
72040 + ret = 0;
72041 + }
72042 +/*
72043 + if (*nr_submitted > wbc->nr_to_write)
72044 + warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
72045 +*/
72046 + reiser4_txn_restart(ctx);
72047 +
72048 + return ret;
72049 +}
72050 +
72051 +/* Remove processed nodes from the atom's clean list (thereby removing them from the transaction). */
72052 +void reiser4_invalidate_list(struct list_head *head)
72053 +{
72054 + while (!list_empty(head)) {
72055 + jnode *node;
72056 +
72057 + node = list_entry(head->next, jnode, capture_link);
72058 + spin_lock_jnode(node);
72059 + reiser4_uncapture_block(node);
72060 + jput(node);
72061 + }
72062 +}
72063 +
72064 +static void init_wlinks(txn_wait_links * wlinks)
72065 +{
72066 + wlinks->_lock_stack = get_current_lock_stack();
72067 + INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
72068 + INIT_LIST_HEAD(&wlinks->_fwaiting_link);
72069 + wlinks->waitfor_cb = NULL;
72070 + wlinks->waiting_cb = NULL;
72071 +}
72072 +
72073 +/* Add ourselves to the atom's waitfor list and wait for somebody to wake us up. */
72074 +void reiser4_atom_wait_event(txn_atom * atom)
72075 +{
72076 + txn_wait_links _wlinks;
72077 +
72078 + assert_spin_locked(&(atom->alock));
72079 + assert("nikita-3156",
72080 + lock_stack_isclean(get_current_lock_stack()) ||
72081 + atom->nr_running_queues > 0);
72082 +
72083 + init_wlinks(&_wlinks);
72084 + list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
72085 + atomic_inc(&atom->refcount);
72086 + spin_unlock_atom(atom);
72087 +
72088 + reiser4_prepare_to_sleep(_wlinks._lock_stack);
72089 + reiser4_go_to_sleep(_wlinks._lock_stack);
72090 +
72091 + spin_lock_atom(atom);
72092 + list_del(&_wlinks._fwaitfor_link);
72093 + atom_dec_and_unlock(atom);
72094 +}
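+
+/*
+ * Note (not part of the original patch): reiser4_atom_wait_event() is
+ * entered with the atom spin-locked and returns with it unlocked -- the
+ * final atom_dec_and_unlock() drops both the temporary reference taken
+ * above and the lock.  This is why callers typically re-look-up the atom
+ * and retry after waking up instead of touching @atom directly.
+ */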
72095 +
72096 +void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
72097 +{
72098 + assert("nikita-3535", atom != NULL);
72099 + assert_spin_locked(&(atom->alock));
72100 + assert("nikita-3536", stage <= ASTAGE_INVALID);
72101 + /* Excelsior! */
72102 + assert("nikita-3537", stage >= atom->stage);
72103 + if (atom->stage != stage) {
72104 + atom->stage = stage;
72105 + reiser4_atom_send_event(atom);
72106 + }
72107 +}
72108 +
72109 +/* wake all threads which wait for an event */
72110 +void reiser4_atom_send_event(txn_atom * atom)
72111 +{
72112 + assert_spin_locked(&(atom->alock));
72113 + wakeup_atom_waitfor_list(atom);
72114 +}
72115 +
72116 +/* Informs the txn manager code that the owner of this txn_handle should wait for atom
72117 +   commit completion (for example, because it does fsync(2)) */
72118 +static int should_wait_commit(txn_handle * h)
72119 +{
72120 + return h->flags & TXNH_WAIT_COMMIT;
72121 +}
72122 +
72123 +typedef struct commit_data {
72124 + txn_atom *atom;
72125 + txn_handle *txnh;
72126 + long nr_written;
72127 +	/* as an optimization we start committing the atom by first trying to
72128 +	 * flush it a few times without switching it into ASTAGE_CAPTURE_WAIT.
72129 +	 * This reduces stalls due to other threads waiting for an atom in the
72130 +	 * ASTAGE_CAPTURE_WAIT stage. ->preflush is a counter of these
72131 +	 * preliminary flushes. */
72132 + int preflush;
72133 +	/* have we waited on the atom? */
72134 + int wait;
72135 + int failed;
72136 + int wake_ktxnmgrd_up;
72137 +} commit_data;
72138 +
72139 +/*
72140 + * Called from commit_txnh() repeatedly, until either error happens, or atom
72141 + * commits successfully.
72142 + */
72143 +static int try_commit_txnh(commit_data * cd)
72144 +{
72145 + int result;
72146 +
72147 + assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
72148 +
72149 + /* Get the atom and txnh locked. */
72150 + cd->atom = txnh_get_atom(cd->txnh);
72151 + assert("jmacd-309", cd->atom != NULL);
72152 + spin_unlock_txnh(cd->txnh);
72153 +
72154 + if (cd->wait) {
72155 + cd->atom->nr_waiters--;
72156 + cd->wait = 0;
72157 + }
72158 +
72159 + if (cd->atom->stage == ASTAGE_DONE)
72160 + return 0;
72161 +
72162 + if (cd->failed)
72163 + return 0;
72164 +
72165 + if (atom_should_commit(cd->atom)) {
72166 + /* if atom is _very_ large schedule it for commit as soon as
72167 + * possible. */
72168 + if (atom_should_commit_asap(cd->atom)) {
72169 + /*
72170 +			 * When the atom is in PRE_COMMIT or a later stage, the
72171 +			 * following invariant (encoded in atom_can_be_committed())
72172 +			 * holds: there is exactly one non-waiter transaction
72173 +			 * handle opened on this atom. When a thread wants to
72174 +			 * wait until the atom commits (for example sync()) it
72175 +			 * waits on the atom event after increasing
72176 +			 * atom->nr_waiters (see below in this function). It
72177 +			 * cannot be guaranteed that the atom has already
72178 +			 * committed after the event is received, so the loop has
72179 +			 * to be restarted. But if the atom switched into the
72180 +			 * PRE_COMMIT stage and became too large, we cannot change
72181 +			 * its state back to CAPTURE_WAIT (the atom stage can only
72182 +			 * increase monotonically), hence this check.
72183 + */
72184 + if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
72185 + reiser4_atom_set_stage(cd->atom,
72186 + ASTAGE_CAPTURE_WAIT);
72187 + cd->atom->flags |= ATOM_FORCE_COMMIT;
72188 + }
72189 + if (cd->txnh->flags & TXNH_DONT_COMMIT) {
72190 + /*
72191 + * this thread (transaction handle that is) doesn't
72192 + * want to commit atom. Notify waiters that handle is
72193 + * closed. This can happen, for example, when we are
72194 + * under VFS directory lock and don't want to commit
72195 + * atom right now to avoid stalling other threads
72196 + * working in the same directory.
72197 + */
72198 +
72199 + /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
72200 + * commit this atom: no atom waiters and only one
72201 + * (our) open transaction handle. */
72202 + cd->wake_ktxnmgrd_up =
72203 + cd->atom->txnh_count == 1 &&
72204 + cd->atom->nr_waiters == 0;
72205 + reiser4_atom_send_event(cd->atom);
72206 + result = 0;
72207 + } else if (!atom_can_be_committed(cd->atom)) {
72208 + if (should_wait_commit(cd->txnh)) {
72209 + /* sync(): wait for commit */
72210 + cd->atom->nr_waiters++;
72211 + cd->wait = 1;
72212 + reiser4_atom_wait_event(cd->atom);
72213 + result = RETERR(-E_REPEAT);
72214 + } else {
72215 + result = 0;
72216 + }
72217 + } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
72218 + /*
72219 + * optimization: flush atom without switching it into
72220 + * ASTAGE_CAPTURE_WAIT.
72221 + *
72222 + * But don't do this for ktxnmgrd, because ktxnmgrd
72223 + * should never block on atom fusion.
72224 + */
72225 + result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
72226 + LONG_MAX, &cd->nr_written,
72227 + &cd->atom, NULL);
72228 + if (result == 0) {
72229 + spin_unlock_atom(cd->atom);
72230 + cd->preflush = 0;
72231 + result = RETERR(-E_REPEAT);
72232 +			} else	/* Atom wasn't flushed
72233 +				 * completely. Rinse. Repeat. */
72234 + --cd->preflush;
72235 + } else {
72236 + /* We change atom state to ASTAGE_CAPTURE_WAIT to
72237 + prevent atom fusion and count ourself as an active
72238 + flusher */
72239 + reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
72240 + cd->atom->flags |= ATOM_FORCE_COMMIT;
72241 +
72242 + result =
72243 + commit_current_atom(&cd->nr_written, &cd->atom);
72244 + if (result != 0 && result != -E_REPEAT)
72245 + cd->failed = 1;
72246 + }
72247 + } else
72248 + result = 0;
72249 +
72250 +#if REISER4_DEBUG
72251 + if (result == 0)
72252 + assert_spin_locked(&(cd->atom->alock));
72253 +#endif
72254 +
72255 + /* perfectly valid assertion, except that when atom/txnh is not locked
72256 + * fusion can take place, and cd->atom points nowhere. */
72257 + /*
72258 + assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
72259 + */
72260 + return result;
72261 +}
72262 +
72263 +/* Called to commit a transaction handle. This decrements the atom's number of open
72264 +   handles and, if it is the last handle to commit and the atom should commit,
72265 +   initiates atom commit. If commit does not fail, returns the number of written blocks. */
72266 +static int commit_txnh(txn_handle * txnh)
72267 +{
72268 + commit_data cd;
72269 + assert("umka-192", txnh != NULL);
72270 +
72271 + memset(&cd, 0, sizeof cd);
72272 + cd.txnh = txnh;
72273 + cd.preflush = 10;
72274 +
72275 + /* calls try_commit_txnh() until either atom commits, or error
72276 + * happens */
72277 + while (try_commit_txnh(&cd) != 0)
72278 + reiser4_preempt_point();
72279 +
72280 + spin_lock_txnh(txnh);
72281 +
72282 + cd.atom->txnh_count -= 1;
72283 + txnh->atom = NULL;
72284 + /* remove transaction handle from atom's list of transaction handles */
72285 + list_del_init(&txnh->txnh_link);
72286 +
72287 + spin_unlock_txnh(txnh);
72288 + atom_dec_and_unlock(cd.atom);
72289 +	/* if the current thread doesn't want to do the commit itself
72290 +	 * (TXNH_DONT_COMMIT is set, probably because it takes time), that work
72291 +	 * is done asynchronously by the ktxnmgrd daemon. */
72292 + if (cd.wake_ktxnmgrd_up)
72293 + ktxnmgrd_kick(&get_current_super_private()->tmgr);
72294 +
72295 + return 0;
72296 +}
72297 +
72298 +/* TRY_CAPTURE */
72299 +
72300 +/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
72301 + condition indicates that the request should be retried, and it may block if the
72302 + txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
72303 +
72304 + This routine encodes the basic logic of block capturing described by:
72305 +
72306 + http://namesys.com/v4/v4.html
72307 +
72308 + Our goal here is to ensure that any two blocks that contain dependent modifications
72309 + should commit at the same time. This function enforces this discipline by initiating
72310 + fusion whenever a transaction handle belonging to one atom requests to read or write a
72311 + block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
72312 +
72313 + In addition, this routine handles the initial assignment of atoms to blocks and
72314 + transaction handles. These are possible outcomes of this function:
72315 +
72316 + 1. The block and handle are already part of the same atom: return immediate success
72317 +
72318 + 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
72319 + the handle to the block's atom.
72320 +
72321 + 3. The handle is assigned but the block is not: call capture_assign_block to assign
72322 + the block to the handle's atom.
72323 +
72324 + 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
72325 + to fuse atoms.
72326 +
72327 + 5. Neither block nor handle are assigned: create a new atom and assign them both.
72328 +
72329 + 6. A read request for a non-captured block: return immediate success.
72330 +
72331 + This function acquires and releases the handle's spinlock. This function is called
72332 + under the jnode lock and if the return value is 0, it returns with the jnode lock still
72333 + held. If the return is -E_REPEAT or some other error condition, the jnode lock is
72334 +   released. The external interface (reiser4_try_capture) manages re-acquiring the jnode
72335 + lock in the failure case.
72336 +*/
72337 +static int try_capture_block(
72338 + txn_handle * txnh, jnode * node, txn_capture mode,
72339 + txn_atom ** atom_alloc)
72340 +{
72341 + txn_atom *block_atom;
72342 + txn_atom *txnh_atom;
72343 +
72344 + /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
72345 + assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
72346 +
72347 + /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
72348 + * node->tree somewhere. */
72349 + assert("umka-194", txnh != NULL);
72350 + assert("umka-195", node != NULL);
72351 +
72352 + /* The jnode is already locked! Being called from reiser4_try_capture(). */
72353 + assert_spin_locked(&(node->guard));
72354 + block_atom = node->atom;
72355 +
72356 +	/* Get the txnh spinlock; this allows us to compare txn_atom pointers but it
72357 +	   doesn't let us touch the atoms themselves. */
72358 + spin_lock_txnh(txnh);
72359 + txnh_atom = txnh->atom;
72360 +	/* The capturing process continues into one of four branches, depending
72361 +	   on which of the two atoms (block atom (node->atom), current atom
72362 +	   (txnh->atom)) exist. */
72363 + if (txnh_atom == NULL) {
72364 + if (block_atom == NULL) {
72365 + spin_unlock_txnh(txnh);
72366 + spin_unlock_jnode(node);
72367 + /* assign empty atom to the txnh and repeat */
72368 + return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
72369 + } else {
72370 + atomic_inc(&block_atom->refcount);
72371 + /* node spin-lock isn't needed anymore */
72372 + spin_unlock_jnode(node);
72373 + if (!spin_trylock_atom(block_atom)) {
72374 + spin_unlock_txnh(txnh);
72375 + spin_lock_atom(block_atom);
72376 + spin_lock_txnh(txnh);
72377 + }
72378 + /* re-check state after getting txnh and the node
72379 + * atom spin-locked */
72380 + if (node->atom != block_atom || txnh->atom != NULL) {
72381 + spin_unlock_txnh(txnh);
72382 + atom_dec_and_unlock(block_atom);
72383 + return RETERR(-E_REPEAT);
72384 + }
72385 + atomic_dec(&block_atom->refcount);
72386 + if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
72387 + (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
72388 + block_atom->txnh_count != 0))
72389 + return capture_fuse_wait(txnh, block_atom, NULL, mode);
72390 + capture_assign_txnh_nolock(block_atom, txnh);
72391 + spin_unlock_txnh(txnh);
72392 + spin_unlock_atom(block_atom);
72393 + return RETERR(-E_REPEAT);
72394 + }
72395 + } else {
72396 +		/* It is time to perform a deadlock-prevention check over the
72397 +		   node we want to capture. It is possible this node was locked
72398 +		   for read without being captured. The optimization which
72399 +		   allows this helps keep atoms independent as long as
72400 +		   possible, but it may cause lock/fuse deadlock problems.
72401 +
72402 +		   A number of similar deadlock situations with locked but not
72403 +		   captured nodes were found. In each situation there are two
72404 +		   or more threads: one of them does flushing while another one
72405 +		   does routine balancing or tree lookup. The flushing thread
72406 +		   (F) sleeps in a long-term locking request for node (N);
72407 +		   another thread (A) sleeps trying to capture some node
72408 +		   already belonging to F's atom, while that atom is in a state
72409 +		   which prevents immediate fusion.
72410 +
72411 +		   Deadlocks of this kind cannot happen if node N was properly
72412 +		   captured by thread A: thread F fuses atoms before locking,
72413 +		   therefore the current atoms of threads F and A become the
72414 +		   same atom and thread A may proceed. This does not work if
72415 +		   node N was not captured, because then the atom fusion does
72416 +		   not happen.
72417 +
72418 + The following scheme solves the deadlock: If
72419 + longterm_lock_znode locks and does not capture a znode, that
72420 + znode is marked as MISSED_IN_CAPTURE. A node marked this way
72421 + is processed by the code below which restores the missed
72422 + capture and fuses current atoms of all the node lock owners
72423 + by calling the fuse_not_fused_lock_owners() function. */
72424 + if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
72425 + JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
72426 + if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
72427 + spin_unlock_txnh(txnh);
72428 + spin_unlock_jnode(node);
72429 + fuse_not_fused_lock_owners(txnh, JZNODE(node));
72430 + return RETERR(-E_REPEAT);
72431 + }
72432 + }
72433 + if (block_atom == NULL) {
72434 + atomic_inc(&txnh_atom->refcount);
72435 + spin_unlock_txnh(txnh);
72436 + if (!spin_trylock_atom(txnh_atom)) {
72437 + spin_unlock_jnode(node);
72438 + spin_lock_atom(txnh_atom);
72439 + spin_lock_jnode(node);
72440 + }
72441 + if (txnh->atom != txnh_atom || node->atom != NULL
72442 + || JF_ISSET(node, JNODE_IS_DYING)) {
72443 + spin_unlock_jnode(node);
72444 + atom_dec_and_unlock(txnh_atom);
72445 + return RETERR(-E_REPEAT);
72446 + }
72447 + atomic_dec(&txnh_atom->refcount);
72448 + capture_assign_block_nolock(txnh_atom, node);
72449 + spin_unlock_atom(txnh_atom);
72450 + } else {
72451 + if (txnh_atom != block_atom) {
72452 + if (mode & TXN_CAPTURE_DONT_FUSE) {
72453 + spin_unlock_txnh(txnh);
72454 + spin_unlock_jnode(node);
72455 +				/* we are in a "no-fusion" mode and @node is
72456 +				 * already part of a transaction. */
72457 + return RETERR(-E_NO_NEIGHBOR);
72458 + }
72459 + return capture_init_fusion(node, txnh, mode);
72460 + }
72461 + spin_unlock_txnh(txnh);
72462 + }
72463 + }
72464 + return 0;
72465 +}
72466 +
72467 +static txn_capture
72468 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
72469 +{
72470 + txn_capture cap_mode;
72471 +
72472 + assert_spin_locked(&(node->guard));
72473 +
72474 + /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
72475 +
72476 + if (lock_mode == ZNODE_WRITE_LOCK) {
72477 + cap_mode = TXN_CAPTURE_WRITE;
72478 + } else if (node->atom != NULL) {
72479 + cap_mode = TXN_CAPTURE_WRITE;
72480 + } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
72481 + jnode_get_level(node) == LEAF_LEVEL) {
72482 + /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
72483 + /* We only need a READ_FUSING capture at the leaf level. This
72484 + is because the internal levels of the tree (twigs included)
72485 +		   are redundant from the point of view of the user who asked
72486 +		   for a read-fusing transcrash. The user only wants to read-fuse
72487 + atoms due to reading uncommitted data that another user has
72488 + written. It is the file system that reads/writes the
72489 + internal tree levels, the user only reads/writes leaves. */
72490 + cap_mode = TXN_CAPTURE_READ_ATOMIC;
72491 + } else {
72492 + /* In this case (read lock at a non-leaf) there's no reason to
72493 + * capture. */
72494 + /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
72495 + return 0;
72496 + }
72497 +
72498 + cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
72499 + assert("nikita-3186", cap_mode != 0);
72500 + return cap_mode;
72501 +}
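+
+/*
+ * Note (not part of the original patch): in short, the mapping implemented
+ * by build_capture_mode() above is: a write lock always yields
+ * TXN_CAPTURE_WRITE; a read lock on an already-captured node also yields
+ * TXN_CAPTURE_WRITE; a read lock on an uncaptured node yields 0, meaning no
+ * capture is needed.  TXN_CAPTURE_NONBLOCKING and TXN_CAPTURE_DONT_FUSE are
+ * passed through from @flags.
+ */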
72502 +
72503 +/* This is an external interface to try_capture_block(), it calls
72504 + try_capture_block() repeatedly as long as -E_REPEAT is returned.
72505 +
72506 +   @node: node to capture,
72507 +   @lock_mode: read or write lock is used in capture mode calculation,
72508 +   @flags: see txn_capture flags enumeration.
72510 +
72511 + @return: 0 - node was successfully captured, -E_REPEAT - capture request
72512 + cannot be processed immediately as it was requested in flags,
72513 + < 0 - other errors.
72514 +*/
72515 +int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
72516 + txn_capture flags)
72517 +{
72518 + txn_atom *atom_alloc = NULL;
72519 + txn_capture cap_mode;
72520 + txn_handle *txnh = get_current_context()->trans;
72521 + int ret;
72522 +
72523 + assert_spin_locked(&(node->guard));
72524 +
72525 + repeat:
72526 + if (JF_ISSET(node, JNODE_IS_DYING))
72527 + return RETERR(-EINVAL);
72528 + if (node->atom != NULL && txnh->atom == node->atom)
72529 + return 0;
72530 + cap_mode = build_capture_mode(node, lock_mode, flags);
72531 + if (cap_mode == 0 ||
72532 + (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
72533 + /* Mark this node as "MISSED". It helps in further deadlock
72534 + * analysis */
72535 + if (jnode_is_znode(node))
72536 + JF_SET(node, JNODE_MISSED_IN_CAPTURE);
72537 + return 0;
72538 + }
72539 + /* Repeat try_capture as long as -E_REPEAT is returned. */
72540 + ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
72541 + /* Regardless of non_blocking:
72542 +
72543 + If ret == 0 then jnode is still locked.
72544 + If ret != 0 then jnode is unlocked.
72545 + */
72546 +#if REISER4_DEBUG
72547 + if (ret == 0)
72548 + assert_spin_locked(&(node->guard));
72549 + else
72550 + assert_spin_not_locked(&(node->guard));
72551 +#endif
72552 + assert_spin_not_locked(&(txnh->guard));
72553 +
72554 + if (ret == -E_REPEAT) {
72555 + /* E_REPEAT implies all locks were released, therefore we need
72556 + to take the jnode's lock again. */
72557 + spin_lock_jnode(node);
72558 +
72559 + /* Although this may appear to be a busy loop, it is not.
72560 + There are several conditions that cause E_REPEAT to be
72561 + returned by the call to try_capture_block, all cases
72562 + indicating some kind of state change that means you should
72563 + retry the request and will get a different result. In some
72564 + cases this could be avoided with some extra code, but
72565 + generally it is done because the necessary locks were
72566 + released as a result of the operation and repeating is the
72567 + simplest thing to do (less bug potential). The cases are:
72568 + atom fusion returns E_REPEAT after it completes (jnode and
72569 + txnh were unlocked); race conditions in assign_block,
72570 + assign_txnh, and init_fusion return E_REPEAT (trylock
72571 + failure); after going to sleep in capture_fuse_wait
72572 + (request was blocked but may now succeed). I'm not quite
72573 + sure how capture_copy works yet, but it may also return
72574 + E_REPEAT. When the request is legitimately blocked, the
72575 + requestor goes to sleep in fuse_wait, so this is not a busy
72576 + loop. */
72577 + /* NOTE-NIKITA: still don't understand:
72578 +
72579 + try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
72580 +
72581 + looks like busy loop?
72582 + */
72583 + goto repeat;
72584 + }
72585 +
72586 + /* free extra atom object that was possibly allocated by
72587 + try_capture_block().
72588 +
72589 + Do this before acquiring jnode spin lock to
72590 + minimize time spent under lock. --nikita */
72591 + if (atom_alloc != NULL) {
72592 + kmem_cache_free(_atom_slab, atom_alloc);
72593 + }
72594 +
72595 + if (ret != 0) {
72596 + if (ret == -E_BLOCK) {
72597 + assert("nikita-3360",
72598 + cap_mode & TXN_CAPTURE_NONBLOCKING);
72599 + ret = -E_REPEAT;
72600 + }
72601 +
72602 + /* Failure means jnode is not locked. FIXME_LATER_JMACD May
72603 + want to fix the above code to avoid releasing the lock and
72604 +		   re-acquiring it, but there are cases where failure occurs
72605 + when the lock is not held, and those cases would need to be
72606 + modified to re-take the lock. */
72607 + spin_lock_jnode(node);
72608 + }
72609 +
72610 + /* Jnode is still locked. */
72611 + assert_spin_locked(&(node->guard));
72612 + return ret;
72613 +}
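+
+/*
+ * Illustrative sketch, not part of the original patch: the typical caller
+ * protocol for reiser4_try_capture() -- the jnode must be spin-locked on
+ * entry and is spin-locked again on return whatever the result (compare
+ * try_capture_page_to_invalidate() below).  The function name is
+ * hypothetical.
+ */
+static int example_capture_for_write(jnode * node)
+{
+	int ret;
+
+	spin_lock_jnode(node);
+	ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+	/* the jnode is locked here regardless of ret */
+	spin_unlock_jnode(node);
+	return ret;
+}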
72614 +
72615 +static void release_two_atoms(txn_atom *one, txn_atom *two)
72616 +{
72617 + spin_unlock_atom(one);
72618 + atom_dec_and_unlock(two);
72619 + spin_lock_atom(one);
72620 + atom_dec_and_unlock(one);
72621 +}
72622 +
72623 +/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
72624 + returned by that routine. The txn_capture request mode is computed here depending on
72625 + the transaction handle's type and the lock request. This is called from the depths of
72626 + the lock manager with the jnode lock held and it always returns with the jnode lock
72627 + held.
72628 +*/
72629 +
72630 +/* fuse all 'active' atoms of lock owners of given node. */
72631 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
72632 +{
72633 + lock_handle *lh;
72634 + int repeat;
72635 + txn_atom *atomh, *atomf;
72636 + reiser4_context *me = get_current_context();
72637 + reiser4_context *ctx = NULL;
72638 +
72639 + assert_spin_not_locked(&(ZJNODE(node)->guard));
72640 + assert_spin_not_locked(&(txnh->hlock));
72641 +
72642 + repeat:
72643 + repeat = 0;
72644 + atomh = txnh_get_atom(txnh);
72645 + spin_unlock_txnh(txnh);
72646 + assert("zam-692", atomh != NULL);
72647 +
72648 + spin_lock_zlock(&node->lock);
72649 + /* inspect list of lock owners */
72650 + list_for_each_entry(lh, &node->lock.owners, owners_link) {
72651 + ctx = get_context_by_lock_stack(lh->owner);
72652 + if (ctx == me)
72653 + continue;
72654 +		/* below we use two assumptions to avoid additional spin-locks
72655 +		   for checking the condition:
72656 +
72657 + 1) if the lock stack has lock, the transaction should be
72658 + opened, i.e. ctx->trans != NULL;
72659 +
72660 + 2) reading of well-aligned ctx->trans->atom is atomic, if it
72661 + equals to the address of spin-locked atomh, we take that
72662 + the atoms are the same, nothing has to be captured. */
72663 + if (atomh != ctx->trans->atom) {
72664 + reiser4_wake_up(lh->owner);
72665 + repeat = 1;
72666 + break;
72667 + }
72668 + }
72669 + if (repeat) {
72670 + if (!spin_trylock_txnh(ctx->trans)) {
72671 + spin_unlock_zlock(&node->lock);
72672 + spin_unlock_atom(atomh);
72673 + goto repeat;
72674 + }
72675 + atomf = ctx->trans->atom;
72676 + if (atomf == NULL) {
72677 + capture_assign_txnh_nolock(atomh, ctx->trans);
72678 + /* release zlock lock _after_ assigning the atom to the
72679 + * transaction handle, otherwise the lock owner thread
72680 + * may unlock all znodes, exit kernel context and here
72681 + * we would access an invalid transaction handle. */
72682 + spin_unlock_zlock(&node->lock);
72683 + spin_unlock_atom(atomh);
72684 + spin_unlock_txnh(ctx->trans);
72685 + goto repeat;
72686 + }
72687 + assert("zam-1059", atomf != atomh);
72688 + spin_unlock_zlock(&node->lock);
72689 + atomic_inc(&atomh->refcount);
72690 + atomic_inc(&atomf->refcount);
72691 + spin_unlock_txnh(ctx->trans);
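+		/* take the two atom locks in a fixed (address) order to avoid
+		 * an ABBA deadlock: atomh is already locked, so it may stay
+		 * locked only if it sorts before atomf; otherwise drop it and
+		 * re-take both locks in order. */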
72692 + if (atomf > atomh) {
72693 + spin_lock_atom_nested(atomf);
72694 + } else {
72695 + spin_unlock_atom(atomh);
72696 + spin_lock_atom(atomf);
72697 + spin_lock_atom_nested(atomh);
72698 + }
72699 + if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
72700 + release_two_atoms(atomf, atomh);
72701 + goto repeat;
72702 + }
72703 + atomic_dec(&atomh->refcount);
72704 + atomic_dec(&atomf->refcount);
72705 + capture_fuse_into(atomf, atomh);
72706 + goto repeat;
72707 + }
72708 + spin_unlock_zlock(&node->lock);
72709 + spin_unlock_atom(atomh);
72710 +}
72711 +
72712 +/* This is the interface to capture unformatted nodes via their struct page
72713 + reference. Currently it is only used in reiser4_invalidatepage */
72714 +int try_capture_page_to_invalidate(struct page *pg)
72715 +{
72716 + int ret;
72717 + jnode *node;
72718 +
72719 + assert("umka-292", pg != NULL);
72720 + assert("nikita-2597", PageLocked(pg));
72721 +
72722 + if (IS_ERR(node = jnode_of_page(pg))) {
72723 + return PTR_ERR(node);
72724 + }
72725 +
72726 + spin_lock_jnode(node);
72727 + unlock_page(pg);
72728 +
72729 + ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
72730 + spin_unlock_jnode(node);
72731 + jput(node);
72732 + lock_page(pg);
72733 + return ret;
72734 +}
72735 +
72736 +/* This informs the transaction manager when a node is deleted. Add the block to the
72737 + atom's delete set and uncapture the block.
72738 +
72739 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
72740 +explanations. find all the functions that use it, and unless there is some very
72741 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
72742 +move the loop to inside the function.
72743 +
72744 +VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
72745 + */
72746 +void reiser4_uncapture_page(struct page *pg)
72747 +{
72748 + jnode *node;
72749 + txn_atom *atom;
72750 +
72751 + assert("umka-199", pg != NULL);
72752 + assert("nikita-3155", PageLocked(pg));
72753 +
72754 + clear_page_dirty_for_io(pg);
72755 +
72756 + reiser4_wait_page_writeback(pg);
72757 +
72758 + node = jprivate(pg);
72759 + BUG_ON(node == NULL);
72760 +
72761 + spin_lock_jnode(node);
72762 +
72763 + atom = jnode_get_atom(node);
72764 + if (atom == NULL) {
72765 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72766 + spin_unlock_jnode(node);
72767 + return;
72768 + }
72769 +
72770 + /* We can remove the jnode from the transaction even if it is on a
72771 + * flush queue prepped list; we only need to be sure that the flush
72772 + * queue is not being written by reiser4_write_fq(). reiser4_write_fq()
72773 + * does not use the atom spin lock to protect the prepped nodes list;
72774 + * instead it increments the atom's nr_running_queues counter for the
72775 + * time when the prepped list is not protected by the spin lock. Here
72776 + * we check this counter if we want to remove the jnode from a flush
72777 + * queue and, if the counter is not zero, wait for all reiser4_write_fq()
72778 + * calls for this atom to complete. This is not significant overhead. */
72779 + while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
72780 + spin_unlock_jnode(node);
72781 + /*
72782 + * at this moment we want to wait for "atom event", viz. wait
72783 + * until @node can be removed from flush queue. But
72784 + * reiser4_atom_wait_event() cannot be called with page locked,
72785 + * because it deadlocks with jnode_extent_write(). Unlock page,
72786 + * after making sure (through page_cache_get()) that it cannot
72787 + * be released from memory.
72788 + */
72789 + page_cache_get(pg);
72790 + unlock_page(pg);
72791 + reiser4_atom_wait_event(atom);
72792 + lock_page(pg);
72793 + /*
72794 + * page may have been detached by ->writepage()->releasepage().
72795 + */
72796 + reiser4_wait_page_writeback(pg);
72797 + spin_lock_jnode(node);
72798 + page_cache_release(pg);
72799 + atom = jnode_get_atom(node);
72800 +/* VS-FIXME-HANS: improve the commenting in this function */
72801 + if (atom == NULL) {
72802 + spin_unlock_jnode(node);
72803 + return;
72804 + }
72805 + }
72806 + reiser4_uncapture_block(node);
72807 + spin_unlock_atom(atom);
72808 + jput(node);
72809 +}
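/* Editorial sketch, not part of the original patch: the pin-then-drop pattern
 * used in the wait loop above. Before unlocking a page in order to sleep,
 * the code takes a page reference so the page cannot be reclaimed while it
 * is unlocked, then re-takes the page lock and drops the extra reference:
 *
 *	page_cache_get(pg);             // pin the page
 *	unlock_page(pg);                // now safe to sleep
 *	reiser4_atom_wait_event(atom);  // wait for the atom event
 *	lock_page(pg);
 *	page_cache_release(pg);         // unpin; we still hold the page lock
 */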
72810 +
72811 +/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
72812 + * inode's tree of jnodes */
72813 +void reiser4_uncapture_jnode(jnode * node)
72814 +{
72815 + txn_atom *atom;
72816 +
72817 + assert_spin_locked(&(node->guard));
72818 + assert("", node->pg == 0);
72819 +
72820 + atom = jnode_get_atom(node);
72821 + if (atom == NULL) {
72822 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72823 + spin_unlock_jnode(node);
72824 + return;
72825 + }
72826 +
72827 + reiser4_uncapture_block(node);
72828 + spin_unlock_atom(atom);
72829 + jput(node);
72830 +}
72831 +
72832 +/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
72833 + increases atom refcount and txnh_count, adds to txnh_list. */
72834 +static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
72835 +{
72836 + assert("umka-200", atom != NULL);
72837 + assert("umka-201", txnh != NULL);
72838 +
72839 + assert_spin_locked(&(txnh->hlock));
72840 + assert_spin_locked(&(atom->alock));
72841 + assert("jmacd-824", txnh->atom == NULL);
72842 + assert("nikita-3540", atom_isopen(atom));
72843 + BUG_ON(txnh->atom != NULL);
72844 +
72845 + atomic_inc(&atom->refcount);
72846 + txnh->atom = atom;
72847 + reiser4_ctx_gfp_mask_set();
72848 + list_add_tail(&txnh->txnh_link, &atom->txnh_list);
72849 + atom->txnh_count += 1;
72850 +}
72851 +
72852 +/* No-locking version of assign_block. Sets the block's atom pointer, references the
72853 + block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
72854 +static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
72855 +{
72856 + assert("umka-202", atom != NULL);
72857 + assert("umka-203", node != NULL);
72858 + assert_spin_locked(&(node->guard));
72859 + assert_spin_locked(&(atom->alock));
72860 + assert("jmacd-323", node->atom == NULL);
72861 + BUG_ON(!list_empty_careful(&node->capture_link));
72862 + assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
72863 +
72864 + /* Pointer from jnode to atom is not counted in atom->refcount. */
72865 + node->atom = atom;
72866 +
72867 + list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
72868 + atom->capture_count += 1;
72869 + /* reference to jnode is acquired by atom. */
72870 + jref(node);
72871 +
72872 + ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
72873 +
72874 + LOCK_CNT_INC(t_refs);
72875 +}
72876 +
72877 +/* common code for dirtying both unformatted jnodes and formatted znodes. */
72878 +static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
72879 +{
72880 + assert_spin_locked(&(node->guard));
72881 + assert_spin_locked(&(atom->alock));
72882 + assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
72883 +
72884 + JF_SET(node, JNODE_DIRTY);
72885 +
72886 + if (!JF_ISSET(node, JNODE_CLUSTER_PAGE))
72887 + get_current_context()->nr_marked_dirty++;
72888 +
72889 + /* We grab2flush_reserve one additional block only if the node was
72890 + not CREATED and jnode_flush did not sort it into either the
72891 + relocate set or the overwrite set. If the node is in the overwrite
72892 + or relocate set we assume that the atom's flush reserved counter
72893 + was already adjusted. */
72894 + if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
72895 + && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
72896 + && !jnode_is_cluster_page(node)) {
72897 + assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
72898 + assert("vs-1506", *jnode_get_block(node) != 0);
72899 + grabbed2flush_reserved_nolock(atom, (__u64) 1);
72900 + JF_SET(node, JNODE_FLUSH_RESERVED);
72901 + }
72902 +
72903 + if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
72904 + /* If the atom is not set yet, the jnode will be added to the
72905 + appropriate list in capture_assign_block_nolock. Sometimes a node
72906 + is set dirty before being captured -- the case for new jnodes; in
72907 + that case too the jnode is added to the appropriate list in
72908 + capture_assign_block_nolock. Another reason not to re-link the
72909 + jnode is that it is on a flush queue (see flush.c for details). */
72910 +
72911 + int level = jnode_get_level(node);
72912 +
72913 + assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
72914 + assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
72915 + assert("nikita-2607", 0 <= level);
72916 + assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
72917 +
72918 + /* move node to atom's dirty list */
72919 + list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
72920 + ON_DEBUG(count_jnode
72921 + (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
72922 + }
72923 +}
72924 +
72925 +/* Set the dirty status for this (spin locked) jnode. */
72926 +void jnode_make_dirty_locked(jnode * node)
72927 +{
72928 + assert("umka-204", node != NULL);
72929 + assert_spin_locked(&(node->guard));
72930 +
72931 + if (REISER4_DEBUG && rofs_jnode(node)) {
72932 + warning("nikita-3365", "Dirtying jnode on rofs");
72933 + dump_stack();
72934 + }
72935 +
72936 + /* Fast check for already dirty node */
72937 + if (!JF_ISSET(node, JNODE_DIRTY)) {
72938 + txn_atom *atom;
72939 +
72940 + atom = jnode_get_atom(node);
72941 + assert("vs-1094", atom);
72942 + /* Check jnode dirty status again because node spin lock might
72943 + * be released inside jnode_get_atom(). */
72944 + if (likely(!JF_ISSET(node, JNODE_DIRTY)))
72945 + do_jnode_make_dirty(node, atom);
72946 + spin_unlock_atom(atom);
72947 + }
72948 +}
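/* Editorial sketch, not part of the original patch: the check/re-check
 * pattern distilled from jnode_make_dirty_locked() above. jnode_get_atom()
 * may drop and re-acquire the jnode spin lock while chasing the ->atom
 * pointer, so any condition tested before the call must be tested again
 * after it. The sketch assumes the node is already captured (atom != NULL). */
static void example_mark_dirty(jnode * node)
{
	spin_lock_jnode(node);
	if (!JF_ISSET(node, JNODE_DIRTY)) {	/* cheap first check */
		txn_atom *atom;

		atom = jnode_get_atom(node);	/* may relock node->guard */
		if (!JF_ISSET(node, JNODE_DIRTY))	/* re-check under both locks */
			do_jnode_make_dirty(node, atom);
		spin_unlock_atom(atom);
	}
	spin_unlock_jnode(node);
}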
72949 +
72950 +/* Set the dirty status for this znode. */
72951 +void znode_make_dirty(znode * z)
72952 +{
72953 + jnode *node;
72954 + struct page *page;
72955 +
72956 + assert("umka-204", z != NULL);
72957 + assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
72958 + assert("nikita-3560", znode_is_write_locked(z));
72959 +
72960 + node = ZJNODE(z);
72961 + /* znode is longterm locked, we can check dirty bit without spinlock */
72962 + if (JF_ISSET(node, JNODE_DIRTY)) {
72963 + /* znode is dirty already. All we have to do is to change znode version */
72964 + z->version = znode_build_version(jnode_get_tree(node));
72965 + return;
72966 + }
72967 +
72968 + spin_lock_jnode(node);
72969 + jnode_make_dirty_locked(node);
72970 + page = jnode_page(node);
72971 + if (page != NULL) {
72972 + /* this is useful assertion (allows one to check that no
72973 + * modifications are lost due to update of in-flight page),
72974 + * but it requires locking on page to check PG_writeback
72975 + * bit. */
72976 + /* assert("nikita-3292",
72977 + !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
72978 + page_cache_get(page);
72979 +
72980 + /* jnode lock is not needed for the rest of
72981 + * znode_set_dirty(). */
72982 + spin_unlock_jnode(node);
72983 + /* reiser4 file write code calls set_page_dirty for
72984 + * unformatted nodes, for formatted nodes we do it here. */
72985 + set_page_dirty_notag(page);
72986 + page_cache_release(page);
72987 + /* bump version counter in znode */
72988 + z->version = znode_build_version(jnode_get_tree(node));
72989 + } else {
72990 + assert("zam-596", znode_above_root(JZNODE(node)));
72991 + spin_unlock_jnode(node);
72992 + }
72993 +
72994 + assert("nikita-1900", znode_is_write_locked(z));
72995 + assert("jmacd-9777", node->atom != NULL);
72996 +}
72997 +
72998 +int reiser4_sync_atom(txn_atom * atom)
72999 +{
73000 + int result;
73001 + txn_handle *txnh;
73002 +
73003 + txnh = get_current_context()->trans;
73004 +
73005 + result = 0;
73006 + if (atom != NULL) {
73007 + if (atom->stage < ASTAGE_PRE_COMMIT) {
73008 + spin_lock_txnh(txnh);
73009 + capture_assign_txnh_nolock(atom, txnh);
73010 + result = force_commit_atom(txnh);
73011 + } else if (atom->stage < ASTAGE_POST_COMMIT) {
73012 + /* wait atom commit */
73013 + reiser4_atom_wait_event(atom);
73014 + /* try once more */
73015 + result = RETERR(-E_REPEAT);
73016 + } else
73017 + spin_unlock_atom(atom);
73018 + }
73019 + return result;
73020 +}
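/* Editorial sketch, not part of the original patch: -E_REPEAT from
 * reiser4_sync_atom() means "the state changed under us, look the atom up
 * again and retry", so a caller typically loops until another result comes
 * back. get_current_atom_locked_nocheck() is the real lookup helper declared
 * in txnmgr.h; reiser4_sync_atom() consumes the atom lock on every path. */
static int example_sync_current_atom(void)
{
	int ret;

	do {
		txn_atom *atom = get_current_atom_locked_nocheck();

		ret = reiser4_sync_atom(atom);	/* handles atom == NULL */
	} while (ret == -E_REPEAT);
	return ret;
}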
73021 +
73022 +#if REISER4_DEBUG
73023 +
73024 +/* move a jnode from one list to another;
73025 + call this after atom->capture_count is updated */
73026 +void
73027 +count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
73028 + atom_list new_list, int check_lists)
73029 +{
73030 + struct list_head *pos;
73031 +
73032 + assert("zam-1018", atom_is_protected(atom));
73033 + assert_spin_locked(&(node->guard));
73034 + assert("", NODE_LIST(node) == old_list);
73035 +
73036 + switch (NODE_LIST(node)) {
73037 + case NOT_CAPTURED:
73038 + break;
73039 + case DIRTY_LIST:
73040 + assert("", atom->dirty > 0);
73041 + atom->dirty--;
73042 + break;
73043 + case CLEAN_LIST:
73044 + assert("", atom->clean > 0);
73045 + atom->clean--;
73046 + break;
73047 + case FQ_LIST:
73048 + assert("", atom->fq > 0);
73049 + atom->fq--;
73050 + break;
73051 + case WB_LIST:
73052 + assert("", atom->wb > 0);
73053 + atom->wb--;
73054 + break;
73055 + case OVRWR_LIST:
73056 + assert("", atom->ovrwr > 0);
73057 + atom->ovrwr--;
73058 + break;
73059 + default:
73060 + impossible("", "");
73061 + }
73062 +
73063 + switch (new_list) {
73064 + case NOT_CAPTURED:
73065 + break;
73066 + case DIRTY_LIST:
73067 + atom->dirty++;
73068 + break;
73069 + case CLEAN_LIST:
73070 + atom->clean++;
73071 + break;
73072 + case FQ_LIST:
73073 + atom->fq++;
73074 + break;
73075 + case WB_LIST:
73076 + atom->wb++;
73077 + break;
73078 + case OVRWR_LIST:
73079 + atom->ovrwr++;
73080 + break;
73081 + default:
73082 + impossible("", "");
73083 + }
73084 + ASSIGN_NODE_LIST(node, new_list);
73085 + if (0 && check_lists) {
73086 + int count;
73087 + tree_level level;
73088 +
73089 + count = 0;
73090 +
73091 + /* flush queue list */
73092 + /* reiser4_check_fq(atom); */
73093 +
73094 + /* dirty list */
73095 + count = 0;
73096 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73097 + list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
73098 + count++;
73099 + }
73100 + if (count != atom->dirty)
73101 + warning("", "dirty counter %d, real %d\n", atom->dirty,
73102 + count);
73103 +
73104 + /* clean list */
73105 + count = 0;
73106 + list_for_each(pos, ATOM_CLEAN_LIST(atom))
73107 + count++;
73108 + if (count != atom->clean)
73109 + warning("", "clean counter %d, real %d\n", atom->clean,
73110 + count);
73111 +
73112 + /* wb list */
73113 + count = 0;
73114 + list_for_each(pos, ATOM_WB_LIST(atom))
73115 + count++;
73116 + if (count != atom->wb)
73117 + warning("", "wb counter %d, real %d\n", atom->wb,
73118 + count);
73119 +
73120 + /* overwrite list */
73121 + count = 0;
73122 + list_for_each(pos, ATOM_OVRWR_LIST(atom))
73123 + count++;
73124 +
73125 + if (count != atom->ovrwr)
73126 + warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
73127 + count);
73128 + }
73129 + assert("vs-1624", atom->num_queued == atom->fq);
73130 + if (atom->capture_count !=
73131 + atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
73132 + printk
73133 + ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
73134 + atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
73135 + atom->wb, atom->fq);
73136 + assert("vs-1622",
73137 + atom->capture_count ==
73138 + atom->dirty + atom->clean + atom->ovrwr + atom->wb +
73139 + atom->fq);
73140 + }
73141 +}
73142 +
73143 +#endif
73144 +
73145 +/* Make the node OVRWR and put it on the atom->overwrite_nodes list. The atom
73146 + * lock and the jnode lock must be held before calling this function. */
73147 +void jnode_make_wander_nolock(jnode * node)
73148 +{
73149 + txn_atom *atom;
73150 +
73151 + assert("nikita-2431", node != NULL);
73152 + assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
73153 + assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
73154 + assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
73155 + assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
73156 +
73157 + atom = node->atom;
73158 +
73159 + assert("zam-895", atom != NULL);
73160 + assert("zam-894", atom_is_protected(atom));
73161 +
73162 + JF_SET(node, JNODE_OVRWR);
73163 + /* move node to atom's overwrite list */
73164 + list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
73165 + ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
73166 +}
73167 +
73168 +/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
73169 + * this function. */
73170 +void jnode_make_wander(jnode * node)
73171 +{
73172 + txn_atom *atom;
73173 +
73174 + spin_lock_jnode(node);
73175 + atom = jnode_get_atom(node);
73176 + assert("zam-913", atom != NULL);
73177 + assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
73178 +
73179 + jnode_make_wander_nolock(node);
73180 + spin_unlock_atom(atom);
73181 + spin_unlock_jnode(node);
73182 +}
73183 +
73184 +/* this just sets RELOC bit */
73185 +static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
73186 +{
73187 + assert_spin_locked(&(node->guard));
73188 + assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
73189 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
73190 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
73191 + assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
73192 + assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
73193 + jnode_set_reloc(node);
73194 +}
73195 +
73196 +/* Make znode RELOC and put it on flush queue */
73197 +void znode_make_reloc(znode * z, flush_queue_t * fq)
73198 +{
73199 + jnode *node;
73200 + txn_atom *atom;
73201 +
73202 + node = ZJNODE(z);
73203 + spin_lock_jnode(node);
73204 +
73205 + atom = jnode_get_atom(node);
73206 + assert("zam-919", atom != NULL);
73207 +
73208 + jnode_make_reloc_nolock(fq, node);
73209 + queue_jnode(fq, node);
73210 +
73211 + spin_unlock_atom(atom);
73212 + spin_unlock_jnode(node);
73214 +}
73215 +
73216 +/* Make unformatted node RELOC and put it on flush queue */
73217 +void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
73218 +{
73219 + assert("vs-1479", jnode_is_unformatted(node));
73220 +
73221 + jnode_make_reloc_nolock(fq, node);
73222 + queue_jnode(fq, node);
73223 +}
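/* Editorial sketch, not part of the original patch: the reloc/wander helpers
 * above implement reiser4's two disk-update policies. A dirty node either
 * joins the overwrite set (journalled as a "wandered" block, then written in
 * place) or the relocate set (written straight to a newly allocated block).
 * For an unformatted node the choice might look like the code below;
 * should_relocate() is a hypothetical stand-in for the real policy decision
 * in flush.c, and the locking noted in the comment is what both callees
 * assert. */
static void example_assign_policy(jnode * node, flush_queue_t * fq,
				  int should_relocate)
{
	/* assumed: node is spin-locked, dirty, and its atom is protected */
	if (should_relocate)
		unformatted_make_reloc(node, fq);	/* relocate set */
	else
		jnode_make_wander_nolock(node);	/* overwrite set */
}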
73224 +
73225 +int reiser4_capture_super_block(struct super_block *s)
73226 +{
73227 + int result;
73228 + znode *uber;
73229 + lock_handle lh;
73230 +
73231 + init_lh(&lh);
73232 + result = get_uber_znode(reiser4_get_tree(s),
73233 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
73234 + if (result)
73235 + return result;
73236 +
73237 + uber = lh.node;
73238 + /* Grabbing one block for superblock */
73239 + result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
73240 + if (result != 0) {
73241 + /* don't leak the long-term lock on the uber znode */
73242 + done_lh(&lh);
73243 + return result;
73244 + }
73242 +
73243 + znode_make_dirty(uber);
73244 +
73245 + done_lh(&lh);
73246 + return 0;
73247 +}
73248 +
73249 +/* Wakeup every handle on the atom's WAITFOR list */
73250 +static void wakeup_atom_waitfor_list(txn_atom * atom)
73251 +{
73252 + txn_wait_links *wlinks;
73253 +
73254 + assert("umka-210", atom != NULL);
73255 +
73256 + /* atom is locked */
73257 + list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
73258 + if (wlinks->waitfor_cb == NULL ||
73259 + wlinks->waitfor_cb(atom, wlinks))
73260 + /* Wake up. */
73261 + reiser4_wake_up(wlinks->_lock_stack);
73262 + }
73263 +}
73264 +
73265 +/* Wakeup every handle on the atom's WAITING list */
73266 +static void wakeup_atom_waiting_list(txn_atom * atom)
73267 +{
73268 + txn_wait_links *wlinks;
73269 +
73270 + assert("umka-211", atom != NULL);
73271 +
73272 + /* atom is locked */
73273 + list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
73274 + if (wlinks->waiting_cb == NULL ||
73275 + wlinks->waiting_cb(atom, wlinks))
73276 + /* Wake up. */
73277 + reiser4_wake_up(wlinks->_lock_stack);
73278 + }
73279 +}
73280 +
73281 +/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
73282 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
73283 +{
73284 + assert("nikita-3330", atom != NULL);
73285 + assert_spin_locked(&(atom->alock));
73286 +
73287 + /* atom->txnh_count == 1 is for waking waiters up if we are releasing
73288 + * last transaction handle. */
73289 + return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
73290 +}
73291 +
73292 +/* The general purpose of this function is to wait on the first of two possible events.
73293 + The situation is that a handle (and its atom atomh) is blocked trying to capture a
73294 + block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
73295 + handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
73296 + another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
73297 + needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
73298 + proceed and fuse the two atoms in the CAPTURE_WAIT state.
73299 +
73300 + In other words, if either atomh or atomf change state, the handle will be awakened,
73301 + thus there are two lists per atom: WAITING and WAITFOR.
73302 +
73303 + This is also called by capture_assign_txnh with (atomh == NULL) to wait for
73304 + atomf to close when the handle is not yet assigned to an atom of its own.
73305 +
73306 + Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
73307 + BOTH_ATOM_LOCKS. Result: all four locks are released.
73308 +*/
73309 +static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
73310 + txn_atom * atomh, txn_capture mode)
73311 +{
73312 + int ret;
73313 + txn_wait_links wlinks;
73314 +
73315 + assert("umka-213", txnh != NULL);
73316 + assert("umka-214", atomf != NULL);
73317 +
73318 + if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
73319 + spin_unlock_txnh(txnh);
73320 + spin_unlock_atom(atomf);
73321 +
73322 + if (atomh) {
73323 + spin_unlock_atom(atomh);
73324 + }
73325 +
73326 + return RETERR(-E_BLOCK);
73327 + }
73328 +
73329 + /* Initialize the waiting list links. */
73330 + init_wlinks(&wlinks);
73331 +
73332 + /* Add txnh to atomf's waitfor list, unlock atomf. */
73333 + list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
73334 + wlinks.waitfor_cb = wait_for_fusion;
73335 + atomic_inc(&atomf->refcount);
73336 + spin_unlock_atom(atomf);
73337 +
73338 + if (atomh) {
73339 + /* Add txnh to atomh's waiting list, unlock atomh. */
73340 + list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
73341 + atomic_inc(&atomh->refcount);
73342 + spin_unlock_atom(atomh);
73343 + }
73344 +
73345 + /* Go to sleep. */
73346 + spin_unlock_txnh(txnh);
73347 +
73348 + ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
73349 + if (ret == 0) {
73350 + reiser4_go_to_sleep(wlinks._lock_stack);
73351 + ret = RETERR(-E_REPEAT);
73352 + }
73353 +
73354 + /* Remove from the waitfor list. */
73355 + spin_lock_atom(atomf);
73356 +
73357 + list_del(&wlinks._fwaitfor_link);
73358 + atom_dec_and_unlock(atomf);
73359 +
73360 + if (atomh) {
73361 + /* Remove from the waiting list. */
73362 + spin_lock_atom(atomh);
73363 + list_del(&wlinks._fwaiting_link);
73364 + atom_dec_and_unlock(atomh);
73365 + }
73366 + return ret;
73367 +}
73368 +
73369 +static void lock_two_atoms(txn_atom * one, txn_atom * two)
73370 +{
73371 + assert("zam-1067", one != two);
73372 +
73373 + /* lock the atom with lesser address first */
73374 + if (one < two) {
73375 + spin_lock_atom(one);
73376 + spin_lock_atom_nested(two);
73377 + } else {
73378 + spin_lock_atom(two);
73379 + spin_lock_atom_nested(one);
73380 + }
73381 +}
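/* Editorial sketch, not part of the original patch: address ordering, as in
 * lock_two_atoms() above, is the classic deadlock-avoidance rule for taking
 * two locks of the same class. If every thread orders any pair by address,
 * no "each holds one, waits for the other" cycle can form. The same shape
 * works for any spinlock pair: */
static void example_lock_pair(spinlock_t *x, spinlock_t *y)
{
	if (x < y) {
		spin_lock(x);
		spin_lock_nested(y, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(y);
		spin_lock_nested(x, SINGLE_DEPTH_NESTING);
	}
}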
73382 +
73383 +/* Perform the necessary work to prepare for fusing two atoms, which involves
73384 + * acquiring two atom locks in the proper order. If the node's atom is
73385 + * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
73386 + * atom is not, then the handle's request is put to sleep. If the node's atom
73387 + * is committing, then the node can be copy-on-captured. Otherwise, pick the
73388 + * atom with fewer pointers to be fused into the atom with more pointers and
73389 + * call capture_fuse_into.
73390 + */
73391 +static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
73392 +{
73393 + txn_atom * txnh_atom = txnh->atom;
73394 + txn_atom * block_atom = node->atom;
73395 +
73396 + atomic_inc(&txnh_atom->refcount);
73397 + atomic_inc(&block_atom->refcount);
73398 +
73399 + spin_unlock_txnh(txnh);
73400 + spin_unlock_jnode(node);
73401 +
73402 + lock_two_atoms(txnh_atom, block_atom);
73403 +
73404 + if (txnh->atom != txnh_atom || node->atom != block_atom ) {
73405 + release_two_atoms(txnh_atom, block_atom);
73406 + return RETERR(-E_REPEAT);
73407 + }
73408 +
73409 + atomic_dec(&txnh_atom->refcount);
73410 + atomic_dec(&block_atom->refcount);
73411 +
73412 + assert ("zam-1066", atom_isopen(txnh_atom));
73413 +
73414 + if (txnh_atom->stage >= block_atom->stage ||
73415 + (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
73416 + capture_fuse_into(txnh_atom, block_atom);
73417 + return RETERR(-E_REPEAT);
73418 + }
73419 + spin_lock_txnh(txnh);
73420 + return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
73421 +}
73422 +
73423 +/* This function splices together two jnode lists (small and large) and sets all jnodes in
73424 + the small list to point to the large atom. Returns the length of the list. */
73425 +static int
73426 +capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
73427 + struct list_head *small_head)
73428 +{
73429 + int count = 0;
73430 + jnode *node;
73431 +
73432 + assert("umka-218", large != NULL);
73433 + assert("umka-219", large_head != NULL);
73434 + assert("umka-220", small_head != NULL);
73435 + /* small atom should be locked also. */
73436 + assert_spin_locked(&(large->alock));
73437 +
73438 + /* For every jnode on small's capture list... */
73439 + list_for_each_entry(node, small_head, capture_link) {
73440 + count += 1;
73441 +
73442 + /* With the jnode lock held, update atom pointer. */
73443 + spin_lock_jnode(node);
73444 + node->atom = large;
73445 + spin_unlock_jnode(node);
73446 + }
73447 +
73448 + /* Splice the lists. */
73449 + list_splice_init(small_head, large_head->prev);
73450 +
73451 + return count;
73452 +}
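/* Editorial sketch, not part of the original patch: why the splice target in
 * capture_fuse_jnode_lists() is large_head->prev. list_splice_init(list, pos)
 * inserts the whole list right after pos, so passing the last element
 * (large_head->prev) appends the small atom's nodes at the tail of the large
 * atom's list and leaves the small list re-initialized (empty). On kernels of
 * this vintage the equivalent spelling would be:
 *
 *	list_splice_tail_init(small_head, large_head);
 */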
73453 +
73454 +/* This function splices together two txnh lists (small and large) and sets all txn handles in
73455 + the small list to point to the large atom. Returns the length of the list. */
73456 +static int
73457 +capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
73458 + struct list_head *small_head)
73459 +{
73460 + int count = 0;
73461 + txn_handle *txnh;
73462 +
73463 + assert("umka-221", large != NULL);
73464 + assert("umka-222", large_head != NULL);
73465 + assert("umka-223", small_head != NULL);
73466 +
73467 + /* Adjust every txnh to the new atom. */
73468 + list_for_each_entry(txnh, small_head, txnh_link) {
73469 + count += 1;
73470 +
73471 + /* With the txnh lock held, update atom pointer. */
73472 + spin_lock_txnh(txnh);
73473 + txnh->atom = large;
73474 + spin_unlock_txnh(txnh);
73475 + }
73476 +
73477 + /* Splice the txn_handle list. */
73478 + list_splice_init(small_head, large_head->prev);
73479 +
73480 + return count;
73481 +}
73482 +
73483 +/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
73484 + added to LARGE and their ->atom pointers are all updated. The associated counts are
73485 + updated as well, and any waiting handles belonging to either are awakened. Finally the
73486 + smaller atom's refcount is decremented.
73487 +*/
73488 +static void capture_fuse_into(txn_atom * small, txn_atom * large)
73489 +{
73490 + int level;
73491 + unsigned zcount = 0;
73492 + unsigned tcount = 0;
73493 +
73494 + assert("umka-224", small != NULL);
73495 + assert("umka-225", small != NULL);
73496 +
73497 + assert_spin_locked(&(large->alock));
73498 + assert_spin_locked(&(small->alock));
73499 +
73500 + assert("jmacd-201", atom_isopen(small));
73501 + assert("jmacd-202", atom_isopen(large));
73502 +
73503 + /* Splice and update the per-level dirty jnode lists */
73504 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73505 + zcount +=
73506 + capture_fuse_jnode_lists(large,
73507 + ATOM_DIRTY_LIST(large, level),
73508 + ATOM_DIRTY_LIST(small, level));
73509 + }
73510 +
73511 + /* Splice and update the [clean,dirty] jnode and txnh lists */
73512 + zcount +=
73513 + capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
73514 + ATOM_CLEAN_LIST(small));
73515 + zcount +=
73516 + capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
73517 + ATOM_OVRWR_LIST(small));
73518 + zcount +=
73519 + capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
73520 + ATOM_WB_LIST(small));
73521 + zcount +=
73522 + capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
73523 + tcount +=
73524 + capture_fuse_txnh_lists(large, &large->txnh_list,
73525 + &small->txnh_list);
73526 +
73527 + /* Check our accounting. */
73528 + assert("jmacd-1063",
73529 + zcount + small->num_queued == small->capture_count);
73530 + assert("jmacd-1065", tcount == small->txnh_count);
73531 +
73532 + /* sum the numbers of waiting threads */
73533 + large->nr_waiters += small->nr_waiters;
73534 + small->nr_waiters = 0;
73535 +
73536 + /* splice flush queues */
73537 + reiser4_fuse_fq(large, small);
73538 +
73539 + /* update the counters of jnodes on every atom list */
73540 + ON_DEBUG(large->dirty += small->dirty;
73541 + small->dirty = 0;
73542 + large->clean += small->clean;
73543 + small->clean = 0;
73544 + large->ovrwr += small->ovrwr;
73545 + small->ovrwr = 0;
73546 + large->wb += small->wb;
73547 + small->wb = 0;
73548 + large->fq += small->fq;
73549 + small->fq = 0;);
73550 +
73551 + /* count flushers in result atom */
73552 + large->nr_flushers += small->nr_flushers;
73553 + small->nr_flushers = 0;
73554 +
73555 + /* update counts of flushed nodes */
73556 + large->flushed += small->flushed;
73557 + small->flushed = 0;
73558 +
73559 + /* Transfer list counts to large. */
73560 + large->txnh_count += small->txnh_count;
73561 + large->capture_count += small->capture_count;
73562 +
73563 + /* Add all txnh references to large. */
73564 + atomic_add(small->txnh_count, &large->refcount);
73565 + atomic_sub(small->txnh_count, &small->refcount);
73566 +
73567 + /* Reset small counts */
73568 + small->txnh_count = 0;
73569 + small->capture_count = 0;
73570 +
73571 + /* Assign the oldest start_time, merge flags. */
73572 + large->start_time = min(large->start_time, small->start_time);
73573 + large->flags |= small->flags;
73574 +
73575 + /* Merge blocknr sets. */
73576 + blocknr_set_merge(&small->delete_set, &large->delete_set);
73577 + blocknr_set_merge(&small->wandered_map, &large->wandered_map);
73578 +
73579 + /* Merge allocated/deleted file counts */
73580 + large->nr_objects_deleted += small->nr_objects_deleted;
73581 + large->nr_objects_created += small->nr_objects_created;
73582 +
73583 + small->nr_objects_deleted = 0;
73584 + small->nr_objects_created = 0;
73585 +
73586 + /* Merge allocated blocks counts */
73587 + large->nr_blocks_allocated += small->nr_blocks_allocated;
73588 +
73589 + large->nr_running_queues += small->nr_running_queues;
73590 + small->nr_running_queues = 0;
73591 +
73592 + /* Merge blocks reserved for overwrite set. */
73593 + large->flush_reserved += small->flush_reserved;
73594 + small->flush_reserved = 0;
73595 +
73596 + if (large->stage < small->stage) {
73597 + /* Large only needs to notify if it has changed state. */
73598 + reiser4_atom_set_stage(large, small->stage);
73599 + wakeup_atom_waiting_list(large);
73600 + }
73601 +
73602 + reiser4_atom_set_stage(small, ASTAGE_INVALID);
73603 +
73604 + /* Notify any waiters--small needs to unload its wait lists. Waiters
73605 + actually remove themselves from the list before returning from the
73606 + fuse_wait function. */
73607 + wakeup_atom_waiting_list(small);
73608 +
73609 + /* Unlock atoms */
73610 + spin_unlock_atom(large);
73611 + atom_dec_and_unlock(small);
73612 +}
73613 +
73614 +/* TXNMGR STUFF */
73615 +
73616 +/* Release a block from the atom, reversing the effects of being captured;
73617 + the atom's reference to the jnode is not released here because
73618 + spin-locks are held. Currently this is only called when the atom commits.
73619 +
73620 + NOTE: this function does not release the (journal) reference to the jnode,
73621 + due to locking optimizations; you should call jput() somewhere after
73622 + calling reiser4_uncapture_block(). */
73623 +void reiser4_uncapture_block(jnode * node)
73624 +{
73625 + txn_atom *atom;
73626 +
73627 + assert("umka-226", node != NULL);
73628 + atom = node->atom;
73629 + assert("umka-228", atom != NULL);
73630 +
73631 + assert("jmacd-1021", node->atom == atom);
73632 + assert_spin_locked(&(node->guard));
73633 + assert("jmacd-1023", atom_is_protected(atom));
73634 +
73635 + JF_CLR(node, JNODE_DIRTY);
73636 + JF_CLR(node, JNODE_RELOC);
73637 + JF_CLR(node, JNODE_OVRWR);
73638 + JF_CLR(node, JNODE_CREATED);
73639 + JF_CLR(node, JNODE_WRITEBACK);
73640 + JF_CLR(node, JNODE_REPACK);
73641 +
73642 + list_del_init(&node->capture_link);
73643 + if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
73644 + assert("zam-925", atom_isopen(atom));
73645 + assert("vs-1623", NODE_LIST(node) == FQ_LIST);
73646 + ON_DEBUG(atom->num_queued--);
73647 + JF_CLR(node, JNODE_FLUSH_QUEUED);
73648 + }
73649 + atom->capture_count -= 1;
73650 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
73651 + node->atom = NULL;
73652 +
73653 + spin_unlock_jnode(node);
73654 + LOCK_CNT_DEC(t_refs);
73655 +}
73656 +
73657 +/* Unconditional insert of jnode into atom's overwrite list. Currently used in
73658 + bitmap-based allocator code for adding modified bitmap blocks to the
73659 + transaction. @atom and @node are spin-locked */
73660 +void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
73661 +{
73662 + assert("zam-538", atom_is_protected(atom));
73663 + assert_spin_locked(&(node->guard));
73664 + assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
73665 + assert("zam-543", node->atom == NULL);
73666 + assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
73667 +
73668 + list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
73669 + jref(node);
73670 + node->atom = atom;
73671 + atom->capture_count++;
73672 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
73673 +}
73674 +
73675 +static int count_deleted_blocks_actor(txn_atom * atom,
73676 + const reiser4_block_nr * a,
73677 + const reiser4_block_nr * b, void *data)
73678 +{
73679 + reiser4_block_nr *counter = data;
73680 +
73681 + assert("zam-995", data != NULL);
73682 + assert("zam-996", a != NULL);
73683 + if (b == NULL)
73684 + *counter += 1;
73685 + else
73686 + *counter += *b;
73687 + return 0;
73688 +}
73689 +
73690 +reiser4_block_nr txnmgr_count_deleted_blocks(void)
73691 +{
73692 + reiser4_block_nr result;
73693 + txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73694 + txn_atom *atom;
73695 +
73696 + result = 0;
73697 +
73698 + spin_lock_txnmgr(tmgr);
73699 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73700 + spin_lock_atom(atom);
73701 + if (atom_isopen(atom))
73702 + blocknr_set_iterator(
73703 + atom, &atom->delete_set,
73704 + count_deleted_blocks_actor, &result, 0);
73705 + spin_unlock_atom(atom);
73706 + }
73707 + spin_unlock_txnmgr(tmgr);
73708 +
73709 + return result;
73710 +}
73711 +
73712 +/*
73713 + * Local variables:
73714 + * c-indentation-style: "K&R"
73715 + * mode-name: "LC"
73716 + * c-basic-offset: 8
73717 + * tab-width: 8
73718 + * fill-column: 79
73719 + * End:
73720 + */
73721 diff -urN linux-2.6.33.orig/fs/reiser4/txnmgr.h linux-2.6.33/fs/reiser4/txnmgr.h
73722 --- linux-2.6.33.orig/fs/reiser4/txnmgr.h 1970-01-01 01:00:00.000000000 +0100
73723 +++ linux-2.6.33/fs/reiser4/txnmgr.h 2010-03-04 19:33:22.000000000 +0100
73724 @@ -0,0 +1,701 @@
73725 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
73726 + * reiser4/README */
73727 +
73728 +/* data-types and function declarations for transaction manager. See txnmgr.c
73729 + * for details. */
73730 +
73731 +#ifndef __REISER4_TXNMGR_H__
73732 +#define __REISER4_TXNMGR_H__
73733 +
73734 +#include "forward.h"
73735 +#include "dformat.h"
73736 +
73737 +#include <linux/fs.h>
73738 +#include <linux/mm.h>
73739 +#include <linux/types.h>
73740 +#include <linux/spinlock.h>
73741 +#include <asm/atomic.h>
73742 +#include <linux/wait.h>
73743 +
73744 +/* TYPE DECLARATIONS */
73745 +
73746 +/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
73747 + A capture request dynamically assigns a block to the calling thread's transaction
73748 + handle. */
73749 +typedef enum {
73750 + /* A READ_ATOMIC request indicates that a block will be read and that the caller's
73751 + atom should fuse in order to ensure that the block commits atomically with the
73752 + caller. */
73753 + TXN_CAPTURE_READ_ATOMIC = (1 << 0),
73754 +
73755 + /* A READ_NONCOM request indicates that a block will be read and that the caller is
73756 + willing to read a non-committed block without causing atoms to fuse. */
73757 + TXN_CAPTURE_READ_NONCOM = (1 << 1),
73758 +
73759 + /* A READ_MODIFY request indicates that a block will be read but that the caller
73760 + wishes for the block to be captured as it will be written. This capture request
73761 + mode is not currently used, but eventually it will be useful for preventing
73762 + deadlock in read-modify-write cycles. */
73763 + TXN_CAPTURE_READ_MODIFY = (1 << 2),
73764 +
73765 + /* A WRITE capture request indicates that a block will be modified and that atoms
73766 + should fuse to make the commit atomic. */
73767 + TXN_CAPTURE_WRITE = (1 << 3),
73768 +
73769 + /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
73770 + exclusive type designation from extra bits that may be supplied -- see
73771 + below. */
73772 + TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
73773 + TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
73774 + TXN_CAPTURE_WRITE),
73775 +
73776 + /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
73777 + indicate modification will occur. */
73778 + TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
73779 +
73780 + /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
73781 + prefer not to sleep waiting for an aging atom to commit. */
73782 + TXN_CAPTURE_NONBLOCKING = (1 << 4),
73783 +
73784 + /* An option to reiser4_try_capture to prevent atom fusion; only simple
73785 + capturing is allowed */
73786 + TXN_CAPTURE_DONT_FUSE = (1 << 5)
73787 +
73788 + /* This macro selects only the exclusive capture request types, stripping out any
73789 + options that were supplied (i.e., NONBLOCKING). */
73790 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
73791 +} txn_capture;
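/* Editorial sketch, not part of the original patch: a capture request
 * combines exactly one exclusive type with optional flag bits, e.g. a write
 * capture that must not sleep waiting for an aging atom:
 *
 *	txn_capture req = TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING;
 *
 *	if (CAPTURE_TYPE(req) == TXN_CAPTURE_WRITE)
 *		... ;	// CAPTURE_TYPE() strips the option bits
 */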
73792 +
73793 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING; the only
73794 + difference is in the handling of read requests. A WRITE_FUSING transaction handle
73795 + defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
73796 + transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
73797 +typedef enum {
73798 + TXN_WRITE_FUSING = (1 << 0),
73799 + TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
73800 +} txn_mode;
73801 +
73802 +/* Every atom has a stage, which is one of these exclusive values: */
73803 +typedef enum {
73804 + /* Initially an atom is free. */
73805 + ASTAGE_FREE = 0,
73806 +
73807 + /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
73808 + blocks and fuse with other atoms. */
73809 + ASTAGE_CAPTURE_FUSE = 1,
73810 +
73811 + /* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
73812 +
73813 + /* When an atom reaches a certain age it must do all it can to commit. An atom in
73814 + the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
73815 + atoms in the CAPTURE_FUSE stage. */
73816 + ASTAGE_CAPTURE_WAIT = 2,
73817 +
73818 + /* Waiting for I/O before commit. Copy-on-capture (see
73819 + http://namesys.com/v4/v4.html). */
73820 + ASTAGE_PRE_COMMIT = 3,
73821 +
73822 + /* Post-commit overwrite I/O. Steal-on-capture. */
73823 + ASTAGE_POST_COMMIT = 4,
73824 +
73825 + /* Atom which waits for the removal of the last reference to it before
73826 + * being deleted from memory */
73827 + ASTAGE_DONE = 5,
73828 +
73829 + /* invalid atom. */
73830 + ASTAGE_INVALID = 6,
73831 +
73832 +} txn_stage;
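/* Editorial sketch, not part of the original patch: the stages above are
 * ordered, and the code usually compares them numerically rather than
 * matching exact values -- e.g. reiser4_sync_atom() in txnmgr.c:
 *
 *	if (atom->stage < ASTAGE_PRE_COMMIT)
 *		... ;	// atom is still open: capture and fusion allowed
 *	else if (atom->stage < ASTAGE_POST_COMMIT)
 *		... ;	// atom is committing: wait for it to finish
 */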
73833 +
73834 +/* Certain flags may be set in the txn_atom->flags field. */
73835 +typedef enum {
73836 + /* Indicates that the atom should commit as soon as possible. */
73837 + ATOM_FORCE_COMMIT = (1 << 0),
73838 + /* to avoid endless loop, mark the atom (which was considered as too
73839 + * small) after failed attempt to fuse it. */
73840 + ATOM_CANCEL_FUSION = (1 << 1)
73841 +} txn_flags;
73842 +
73843 +/* Flags for controlling commit_txnh */
73844 +typedef enum {
73845 + /* Wait commit atom completion in commit_txnh */
73846 + TXNH_WAIT_COMMIT = 0x2,
73847 + /* Don't commit atom when this handle is closed */
73848 + TXNH_DONT_COMMIT = 0x4
73849 +} txn_handle_flags_t;
73850 +
73851 +/* TYPE DEFINITIONS */
73852 +
73853 +/* A note on lock ordering: the handle & jnode spinlocks protect reading of their ->atom
73854 + fields, so typically an operation on the atom through either of these objects must (1)
73855 + lock the object, (2) read the atom pointer, (3) lock the atom.
73856 +
73857 + During atom fusion, the process holds locks on both atoms at once. Then, it iterates
73858 + through the list of handles and pages held by the smaller of the two atoms. For each
73859 + handle and page referencing the smaller atom, the fusing process must: (1) lock the
73860 + object, and (2) update the atom pointer.
73861 +
73862 + You can see that there is a conflict of lock ordering here, so the more-complex
73863 + procedure should have priority, i.e., the fusing process has priority so that it is
73864 + guaranteed to make progress and to avoid restarts.
73865 +
73866 + This decision, however, means additional complexity for acquiring the atom lock in the
73867 + first place.
73868 +
73869 + The general original procedure followed in the code was:
73870 +
73871 + TXN_OBJECT *obj = ...;
73872 + TXN_ATOM *atom;
73873 +
73874 + spin_lock (& obj->_lock);
73875 +
73876 + atom = obj->_atom;
73877 +
73878 + if (! spin_trylock_atom (atom))
73879 + {
73880 + spin_unlock (& obj->_lock);
73881 + RESTART OPERATION, THERE WAS A RACE;
73882 + }
73883 +
73884 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73885 +
73886 + It has however been found that this wastes CPU a lot in a manner that is
73887 + hard to profile. So, proper refcounting was added to atoms, and new
73888 + standard locking sequence is like following:
73889 +
73890 + TXN_OBJECT *obj = ...;
73891 + TXN_ATOM *atom;
73892 +
73893 + spin_lock (& obj->_lock);
73894 +
73895 + atom = obj->_atom;
73896 +
73897 + if (! spin_trylock_atom (atom))
73898 + {
73899 + atomic_inc (& atom->refcount);
73900 + spin_unlock (& obj->_lock);
73901 + spin_lock (&atom->_lock);
73902 + atomic_dec (& atom->refcount);
73903 + // HERE atom is locked
73904 + spin_unlock (&atom->_lock);
73905 + RESTART OPERATION, THERE WAS A RACE;
73906 + }
73907 +
73908 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73909 +
73910 + (core of this is implemented in trylock_throttle() function)
73911 +
73912 + See the jnode_get_atom() function for a common case.
73913 +
73914 + As an additional (and important) optimization that avoids restarts, it is
73915 + possible to re-check the required pre-conditions at the HERE point in the
73916 + code above and proceed without restarting if they are still satisfied.
73917 +*/
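/* Editorial sketch, not part of the original patch: the refcounted locking
 * sequence described above, written out for the jnode case. This is a
 * simplified shape of what jnode_get_atom() does; the real function also
 * re-checks node->atom at the "HERE" point so it can often avoid the
 * restart entirely. */
static txn_atom *example_get_atom(jnode * node)
{
	txn_atom *atom;

	assert_spin_locked(&node->guard);
	atom = node->atom;
	if (atom == NULL)
		return NULL;	/* node is not captured */
	if (!spin_trylock_atom(atom)) {
		atomic_inc(&atom->refcount);	/* keep the atom alive */
		spin_unlock_jnode(node);
		spin_lock_atom(atom);	/* sleep in the proper lock order */
		/* HERE: one could re-check node->atom == atom and proceed */
		atom_dec_and_unlock(atom);
		spin_lock_jnode(node);
		return NULL;	/* there was a race; caller restarts */
	}
	return atom;	/* both node and atom are now locked */
}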
73918 +
73919 +/* An atomic transaction: this is the underlying system representation
73920 + of a transaction, not the one seen by clients.
73921 +
73922 + Invariants involving this data-type:
73923 +
73924 + [sb-fake-allocated]
73925 +*/
73926 +struct txn_atom {
73927 + /* The spinlock protecting the atom, held during fusion and various other state
73928 + changes. */
73929 + spinlock_t alock;
73930 +
73931 + /* The atom's reference counter. Incrementing (when duplicating an
73932 + existing reference, or when we are sure that some other reference
73933 + exists) may be done without taking the spinlock; decrementing the
73934 + counter requires the spinlock to be held.
73935 +
73936 + Each transaction handle counts in ->refcount. All jnodes count as
73937 + one reference acquired in atom_begin_andlock(), released in
73938 + commit_current_atom().
73939 + */
73940 + atomic_t refcount;
73941 +
73942 + /* The atom_id identifies the atom in persistent records such as the log. */
73943 + __u32 atom_id;
73944 +
73945 + /* Flags holding any of the txn_flags enumerated values (e.g.,
73946 + ATOM_FORCE_COMMIT). */
73947 + __u32 flags;
73948 +
73949 + /* Number of open handles. */
73950 + __u32 txnh_count;
73951 +
73952 + /* The number of znodes captured by this atom. Equal to the sum of lengths of the
73953 + dirty_nodes[level] and clean_nodes lists. */
73954 + __u32 capture_count;
73955 +
73956 +#if REISER4_DEBUG
73957 + int clean;
73958 + int dirty;
73959 + int ovrwr;
73960 + int wb;
73961 + int fq;
73962 +#endif
73963 +
73964 + __u32 flushed;
73965 +
73966 + /* Current transaction stage. */
73967 + txn_stage stage;
73968 +
73969 + /* Start time. */
73970 + unsigned long start_time;
73971 +
73972 + /* The atom's delete set. It collects block numbers of the nodes
73973 + which were deleted during the transaction. */
73974 + struct list_head delete_set;
73975 +
73976 + /* The atom's wandered_block mapping. */
73977 + struct list_head wandered_map;
73978 +
73979 + /* The transaction's lists of dirty captured nodes--one per level,
73980 + indexed by level. dirty_nodes[0] is for the znode-above-root */
73981 + struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
73982 +
73983 + /* The transaction's list of clean captured nodes. */
73984 + struct list_head clean_nodes;
73985 +
73986 + /* The atom's overwrite set */
73987 + struct list_head ovrwr_nodes;
73988 +
73989 + /* nodes which are being written to disk */
73990 + struct list_head writeback_nodes;
73991 +
73992 + /* list of inodes */
73993 + struct list_head inodes;
73994 +
73995 + /* List of handles associated with this atom. */
73996 + struct list_head txnh_list;
73997 +
73998 + /* Transaction list link: list of atoms in the transaction manager. */
73999 + struct list_head atom_link;
74000 +
74001 + /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
74002 + struct list_head fwaitfor_list;
74003 +
74004 + /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
74005 + struct list_head fwaiting_list;
74006 +
74007 + /* Numbers of objects which were deleted/created in this transaction,
74008 + and thereby the numbers of object IDs which were released/deallocated. */
74009 + int nr_objects_deleted;
74010 + int nr_objects_created;
74011 + /* number of blocks allocated during the transaction */
74012 + __u64 nr_blocks_allocated;
74013 + /* All atom's flush queue objects are on this list */
74014 + struct list_head flush_queues;
74015 +#if REISER4_DEBUG
74016 + /* number of flush queues for this atom. */
74017 + int nr_flush_queues;
74018 + /* Number of jnodes which were removed from atom's lists and put
74019 + on flush_queue */
74020 + int num_queued;
74021 +#endif
74022 + /* number of threads who wait for this atom to complete commit */
74023 + int nr_waiters;
74024 + /* number of threads which do jnode_flush() over this atom */
74025 + int nr_flushers;
74026 + /* number of flush queues which are IN_USE and whose jnodes from
74027 + fq->prepped are being submitted to disk by reiser4_write_fq(). */
74028 + int nr_running_queues;
74029 + /* A counter of grabbed unformatted nodes, see a description of the
74030 + * reiser4 space reservation scheme at block_alloc.c */
74031 + reiser4_block_nr flush_reserved;
74032 +#if REISER4_DEBUG
74033 + void *committer;
74034 +#endif
74035 + struct super_block *super;
74036 +};
74037 +
74038 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
74039 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
74040 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
74041 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
74042 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
74043 +
74044 +#define NODE_LIST(node) (node)->list
74045 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
74046 +ON_DEBUG(void
74047 + count_jnode(txn_atom *, jnode *, atom_list old_list,
74048 + atom_list new_list, int check_lists));
74049 +
74050 +/* A transaction handle: the client obtains and commits this handle which is assigned by
74051 + the system to a txn_atom. */
74052 +struct txn_handle {
74053 + /* Spinlock protecting ->atom pointer */
74054 + spinlock_t hlock;
74055 +
74056 + /* Flags for controlling commit_txnh() behavior */
74057 + /* from txn_handle_flags_t */
74058 + txn_handle_flags_t flags;
74059 +
74060 + /* Whether it is READ_FUSING or WRITE_FUSING. */
74061 + txn_mode mode;
74062 +
74063 + /* If assigned, the atom it is part of. */
74064 + txn_atom *atom;
74065 +
74066 + /* Transaction list link. Head is in txn_atom. */
74067 + struct list_head txnh_link;
74068 +};
74069 +
74070 +/* The transaction manager: one is contained in the reiser4_super_info_data */
74071 +struct txn_mgr {
74072 + /* A spinlock protecting the atom list, id_count, flush_control */
74073 + spinlock_t tmgr_lock;
74074 +
74075 + /* List of atoms. */
74076 + struct list_head atoms_list;
74077 +
74078 + /* Number of atoms. */
74079 + int atom_count;
74080 +
74081 + /* A counter used to assign atom->atom_id values. */
74082 + __u32 id_count;
74083 +
74084 + /* a mutex object for commit serialization */
74085 + struct mutex commit_mutex;
74086 +
74087 + /* a list of all txnmgrs served by a particular daemon. */
74088 + struct list_head linkage;
74089 +
74090 + /* description of daemon for this txnmgr */
74091 + ktxnmgrd_context *daemon;
74092 +
74093 + /* parameters. Adjustable through mount options. */
74094 + unsigned int atom_max_size;
74095 + unsigned int atom_max_age;
74096 + unsigned int atom_min_size;
74097 + /* max number of concurrent flushers for one atom, 0 - unlimited. */
74098 + unsigned int atom_max_flushers;
74099 + struct dentry *debugfs_atom_count;
74100 + struct dentry *debugfs_id_count;
74101 +};
74102 +
74103 +/* FUNCTION DECLARATIONS */
74104 +
74105 +/* These are the externally (within Reiser4) visible transaction functions, therefore they
74106 + are prefixed with "txn_". For comments, see txnmgr.c. */
74107 +
74108 +extern int init_txnmgr_static(void);
74109 +extern void done_txnmgr_static(void);
74110 +
74111 +extern void reiser4_init_txnmgr(txn_mgr *);
74112 +extern void reiser4_done_txnmgr(txn_mgr *);
74113 +
74114 +extern int reiser4_txn_reserve(int reserved);
74115 +
74116 +extern void reiser4_txn_begin(reiser4_context * context);
74117 +extern int reiser4_txn_end(reiser4_context * context);
74118 +
74119 +extern void reiser4_txn_restart(reiser4_context * context);
74120 +extern void reiser4_txn_restart_current(void);
74121 +
74122 +extern int txnmgr_force_commit_all(struct super_block *, int);
74123 +extern int current_atom_should_commit(void);
74124 +
74125 +extern jnode *find_first_dirty_jnode(txn_atom *, int);
74126 +
74127 +extern int commit_some_atoms(txn_mgr *);
74128 +extern int force_commit_atom(txn_handle *);
74129 +extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
74130 +
74131 +extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
74132 +
74133 +extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
74134 +
74135 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
74136 + int alloc_value);
74137 +extern void atom_dec_and_unlock(txn_atom * atom);
74138 +
74139 +extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
74140 +extern int try_capture_page_to_invalidate(struct page *pg);
74141 +
74142 +extern void reiser4_uncapture_page(struct page *pg);
74143 +extern void reiser4_uncapture_block(jnode *);
74144 +extern void reiser4_uncapture_jnode(jnode *);
74145 +
74146 +extern int reiser4_capture_inode(struct inode *);
74147 +extern int reiser4_uncapture_inode(struct inode *);
74148 +
74149 +extern txn_atom *get_current_atom_locked_nocheck(void);
74150 +
74151 +#if REISER4_DEBUG
74152 +
74153 +/**
74154 + * atom_is_protected - make sure that nobody but us can do anything with atom
74155 + * @atom: atom to be checked
74156 + *
74157 + * This is used to assert that the atom has either entered the commit stages
74158 + * or is spin-locked.
74159 + */
74160 +static inline int atom_is_protected(txn_atom *atom)
74161 +{
74162 + if (atom->stage >= ASTAGE_PRE_COMMIT)
74163 + return 1;
74164 + assert_spin_locked(&(atom->alock));
74165 + return 1;
74166 +}
74167 +
74168 +#endif
74169 +
74170 +/* Get the current atom and spin-lock it. The current atom must be present, so this never returns NULL. */
74171 +static inline txn_atom *get_current_atom_locked(void)
74172 +{
74173 + txn_atom *atom;
74174 +
74175 + atom = get_current_atom_locked_nocheck();
74176 + assert("zam-761", atom != NULL);
74177 +
74178 + return atom;
74179 +}
74180 +
74181 +extern txn_atom *jnode_get_atom(jnode *);
74182 +
74183 +extern void reiser4_atom_wait_event(txn_atom *);
74184 +extern void reiser4_atom_send_event(txn_atom *);
74185 +
74186 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
74187 +extern int reiser4_capture_super_block(struct super_block *s);
74188 +int capture_bulk(jnode **, int count);
74189 +
74190 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
74191 + calling convention of these three routines. */
74192 +extern void blocknr_set_init(struct list_head * bset);
74193 +extern void blocknr_set_destroy(struct list_head * bset);
74194 +extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
74195 +extern int blocknr_set_add_extent(txn_atom * atom,
74196 + struct list_head * bset,
74197 + blocknr_set_entry ** new_bsep,
74198 + const reiser4_block_nr * start,
74199 + const reiser4_block_nr * len);
74200 +extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
74201 + blocknr_set_entry ** new_bsep,
74202 + const reiser4_block_nr * a,
74203 + const reiser4_block_nr * b);
74204 +
74205 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
74206 + const reiser4_block_nr *, void *);
74207 +
74208 +extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
74209 + blocknr_set_actor_f actor, void *data,
74210 + int delete);
74211 +
74212 +/* flush code takes care about how to fuse flush queues */
74213 +extern void flush_init_atom(txn_atom * atom);
74214 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
74215 +
74216 +static inline void spin_lock_atom(txn_atom *atom)
74217 +{
74218 + /* check that spinlocks of lower priorities are not held */
74219 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
74220 + LOCK_CNT_NIL(spin_locked_atom) &&
74221 + LOCK_CNT_NIL(spin_locked_jnode) &&
74222 + LOCK_CNT_NIL(spin_locked_zlock) &&
74223 + LOCK_CNT_NIL(rw_locked_dk) &&
74224 + LOCK_CNT_NIL(rw_locked_tree)));
74225 +
74226 + spin_lock(&(atom->alock));
74227 +
74228 + LOCK_CNT_INC(spin_locked_atom);
74229 + LOCK_CNT_INC(spin_locked);
74230 +}
74231 +
74232 +static inline void spin_lock_atom_nested(txn_atom *atom)
74233 +{
74234 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
74235 + LOCK_CNT_NIL(spin_locked_jnode) &&
74236 + LOCK_CNT_NIL(spin_locked_zlock) &&
74237 + LOCK_CNT_NIL(rw_locked_dk) &&
74238 + LOCK_CNT_NIL(rw_locked_tree)));
74239 +
74240 + spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
74241 +
74242 + LOCK_CNT_INC(spin_locked_atom);
74243 + LOCK_CNT_INC(spin_locked);
74244 +}
74245 +
74246 +static inline int spin_trylock_atom(txn_atom *atom)
74247 +{
74248 + if (spin_trylock(&(atom->alock))) {
74249 + LOCK_CNT_INC(spin_locked_atom);
74250 + LOCK_CNT_INC(spin_locked);
74251 + return 1;
74252 + }
74253 + return 0;
74254 +}
74255 +
74256 +static inline void spin_unlock_atom(txn_atom *atom)
74257 +{
74258 + assert_spin_locked(&(atom->alock));
74259 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
74260 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
74261 +
74262 + LOCK_CNT_DEC(spin_locked_atom);
74263 + LOCK_CNT_DEC(spin_locked);
74264 +
74265 + spin_unlock(&(atom->alock));
74266 +}
74267 +
74268 +static inline void spin_lock_txnh(txn_handle *txnh)
74269 +{
74270 + /* check that spinlocks of lower priorities are not held */
74271 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
74272 + LOCK_CNT_NIL(spin_locked_zlock) &&
74273 + LOCK_CNT_NIL(rw_locked_tree)));
74274 +
74275 + spin_lock(&(txnh->hlock));
74276 +
74277 + LOCK_CNT_INC(spin_locked_txnh);
74278 + LOCK_CNT_INC(spin_locked);
74279 +}
74280 +
74281 +static inline int spin_trylock_txnh(txn_handle *txnh)
74282 +{
74283 + if (spin_trylock(&(txnh->hlock))) {
74284 + LOCK_CNT_INC(spin_locked_txnh);
74285 + LOCK_CNT_INC(spin_locked);
74286 + return 1;
74287 + }
74288 + return 0;
74289 +}
74290 +
74291 +static inline void spin_unlock_txnh(txn_handle *txnh)
74292 +{
74293 + assert_spin_locked(&(txnh->hlock));
74294 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
74295 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
74296 +
74297 + LOCK_CNT_DEC(spin_locked_txnh);
74298 + LOCK_CNT_DEC(spin_locked);
74299 +
74300 + spin_unlock(&(txnh->hlock));
74301 +}
74302 +
74303 +#define spin_ordering_pred_txnmgr(tmgr) \
74304 + ( LOCK_CNT_NIL(spin_locked_atom) && \
74305 + LOCK_CNT_NIL(spin_locked_txnh) && \
74306 + LOCK_CNT_NIL(spin_locked_jnode) && \
74307 + LOCK_CNT_NIL(rw_locked_zlock) && \
74308 + LOCK_CNT_NIL(rw_locked_dk) && \
74309 + LOCK_CNT_NIL(rw_locked_tree) )
74310 +
74311 +static inline void spin_lock_txnmgr(txn_mgr *mgr)
74312 +{
74313 + /* check that spinlocks of lower priorities are not held */
74314 + assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
74315 + LOCK_CNT_NIL(spin_locked_txnh) &&
74316 + LOCK_CNT_NIL(spin_locked_jnode) &&
74317 + LOCK_CNT_NIL(spin_locked_zlock) &&
74318 + LOCK_CNT_NIL(rw_locked_dk) &&
74319 + LOCK_CNT_NIL(rw_locked_tree)));
74320 +
74321 + spin_lock(&(mgr->tmgr_lock));
74322 +
74323 + LOCK_CNT_INC(spin_locked_txnmgr);
74324 + LOCK_CNT_INC(spin_locked);
74325 +}
74326 +
74327 +static inline int spin_trylock_txnmgr(txn_mgr *mgr)
74328 +{
74329 + if (spin_trylock(&(mgr->tmgr_lock))) {
74330 + LOCK_CNT_INC(spin_locked_txnmgr);
74331 + LOCK_CNT_INC(spin_locked);
74332 + return 1;
74333 + }
74334 + return 0;
74335 +}
74336 +
74337 +static inline void spin_unlock_txnmgr(txn_mgr *mgr)
74338 +{
74339 + assert_spin_locked(&(mgr->tmgr_lock));
74340 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
74341 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
74342 +
74343 + LOCK_CNT_DEC(spin_locked_txnmgr);
74344 + LOCK_CNT_DEC(spin_locked);
74345 +
74346 + spin_unlock(&(mgr->tmgr_lock));
74347 +}
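/* A minimal illustrative sketch of how the wrappers above are meant to nest
 * (example_lock_nesting is a hypothetical helper): locks are taken in
 * decreasing priority order, so none of the "lower priority" assertions
 * above fires. */
static inline void example_lock_nesting(txn_mgr *mgr, txn_atom *atom)
{
	spin_lock_txnmgr(mgr);	/* ok: no other reiser4 locks held yet */
	spin_lock_atom(atom);	/* ok: the atom lock ranks below the txnmgr lock */
	/* ... inspect or modify the atom under both locks ... */
	spin_unlock_atom(atom);
	spin_unlock_txnmgr(mgr);
}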
74348 +
74349 +typedef enum {
74350 + FQ_IN_USE = 0x1
74351 +} flush_queue_state_t;
74352 +
74353 +typedef struct flush_queue flush_queue_t;
74354 +
74355 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
74356 + is filled by the jnode_flush() routine, and written to disk under memory
74357 + pressure or at atom commit time. */
74358 +/* LOCKING: fq state and fq->atom are protected by the guard spinlock; the
74359 +   fq->nr_queued field and the fq->prepped list may be modified only if the
74360 +   atom is spin-locked and the fq object is in the "in-use" state. For
74361 +   read-only traversal of the fq->prepped list and reading of the
74362 +   fq->nr_queued field it is enough to keep the fq "in-use" or only have the atom spin-locked. */
74363 +struct flush_queue {
74364 + /* linkage element is the first in this structure to make debugging
74365 + easier. See field in atom struct for description of list. */
74366 + struct list_head alink;
74367 + /* A spinlock to protect changes of fq state and fq->atom pointer */
74368 + spinlock_t guard;
74369 + /* flush_queue state: [in_use | ready] */
74370 + flush_queue_state_t state;
74371 + /* A list which contains queued nodes, queued nodes are removed from any
74372 + * atom's list and put on this ->prepped one. */
74373 + struct list_head prepped;
74374 + /* number of submitted i/o requests */
74375 + atomic_t nr_submitted;
74376 + /* number of i/o errors */
74377 + atomic_t nr_errors;
74378 + /* An atom this flush queue is attached to */
74379 + txn_atom *atom;
74380 + /* A wait queue head to wait on i/o completion */
74381 + wait_queue_head_t wait;
74382 +#if REISER4_DEBUG
74383 + /* A thread which took this fq in exclusive use, NULL if fq is free,
74384 + * used for debugging. */
74385 + struct task_struct *owner;
74386 +#endif
74387 +};
74388 +
74389 +extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
74390 +extern void reiser4_fq_put_nolock(flush_queue_t *);
74391 +extern void reiser4_fq_put(flush_queue_t *);
74392 +extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
74393 +extern void queue_jnode(flush_queue_t *, jnode *);
74394 +
74395 +extern int reiser4_write_fq(flush_queue_t *, long *, int);
74396 +extern int current_atom_finish_all_fq(void);
74397 +extern void init_atom_fq_parts(txn_atom *);
74398 +
74399 +extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
74400 +
74401 +extern void znode_make_dirty(znode * node);
74402 +extern void jnode_make_dirty_locked(jnode * node);
74403 +
74404 +extern int reiser4_sync_atom(txn_atom * atom);
74405 +
74406 +#if REISER4_DEBUG
74407 +extern int atom_fq_parts_are_clean(txn_atom *);
74408 +#endif
74409 +
74410 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
74411 +extern flush_queue_t *get_fq_for_current_atom(void);
74412 +
74413 +void reiser4_invalidate_list(struct list_head * head);
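/* An illustrative flush queue lifecycle built only from the declarations
 * above; example_flush_one_node is hypothetical, and the locking
 * preconditions (e.g. whether the atom must be spin-locked around
 * reiser4_fq_by_atom() and queue_jnode(), and the meaning of the 0 flags
 * value) are assumptions, not guarantees. */
static inline int example_flush_one_node(txn_atom *atom, jnode *node)
{
	flush_queue_t *fq;
	long nr_submitted = 0;
	int ret;

	ret = reiser4_fq_by_atom(atom, &fq);	/* obtain a queue in "in-use" state */
	if (ret)
		return ret;
	queue_jnode(fq, node);			/* move the node onto fq->prepped */
	ret = reiser4_write_fq(fq, &nr_submitted, 0);	/* submit queued i/o */
	reiser4_fq_put(fq);			/* drop exclusive use of the queue */
	return ret;
}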
74414 +
74415 +# endif /* __REISER4_TXNMGR_H__ */
74416 +
74417 +/* Make Linus happy.
74418 + Local variables:
74419 + c-indentation-style: "K&R"
74420 + mode-name: "LC"
74421 + c-basic-offset: 8
74422 + tab-width: 8
74423 + fill-column: 120
74424 + End:
74425 +*/
74426 diff -urN linux-2.6.33.orig/fs/reiser4/type_safe_hash.h linux-2.6.33/fs/reiser4/type_safe_hash.h
74427 --- linux-2.6.33.orig/fs/reiser4/type_safe_hash.h 1970-01-01 01:00:00.000000000 +0100
74428 +++ linux-2.6.33/fs/reiser4/type_safe_hash.h 2010-03-04 19:33:22.000000000 +0100
74429 @@ -0,0 +1,320 @@
74430 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74431 + * reiser4/README */
74432 +
74433 +/* A hash table class that uses hash chains (singly-linked) and is
74434 + parametrized to provide type safety. */
74435 +
74436 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
74437 +#define __REISER4_TYPE_SAFE_HASH_H__
74438 +
74439 +#include "debug.h"
74440 +
74441 +#include <asm/errno.h>
74442 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
74443 + based on the object type. You need to declare the item type before
74444 + this definition, define it after this definition. */
74445 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
74446 + \
74447 +typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
74448 +typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
74449 + \
74450 +struct PREFIX##_hash_table_ \
74451 +{ \
74452 + ITEM_TYPE **_table; \
74453 + __u32 _buckets; \
74454 +}; \
74455 + \
74456 +struct PREFIX##_hash_link_ \
74457 +{ \
74458 + ITEM_TYPE *_next; \
74459 +}
74460 +
74461 +/* Step 2: Define the object type of the hash: give it a field of type
74462 + PREFIX_hash_link. */
74463 +
74464 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
74465 + the type and field name from step 2 (see the usage sketch after the macro). The arguments are:
74466 +
74467 + ITEM_TYPE The item type being hashed
74468 + KEY_TYPE The type of key being hashed
74469 + KEY_NAME The name of the key field within the item
74470 + LINK_NAME The name of the link field within the item (which you must make of type PREFIX_hash_link)
74471 + HASH_FUNC The name of the hash function (or macro; takes the table and a const pointer to the key)
74472 + EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
74473 +
74474 + It implements these functions:
74475 +
74476 + prefix_hash_init Initialize the table given its size.
74477 + prefix_hash_insert Insert an item
74478 + prefix_hash_insert_index Insert an item w/ precomputed hash_index
74479 + prefix_hash_find Find an item by key
74480 + prefix_hash_find_index Find an item w/ precomputed hash_index
74481 + prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
74482 + prefix_hash_remove_index Remove an item w/ precomputed hash_index
74483 +
74484 + If you'd like something to be done differently, feel free to ask me
74485 + for modifications. Additional features that could be added but
74486 + have not been:
74487 +
74488 + prefix_hash_remove_key Find and remove an item by key
74489 + prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
74490 +
74491 + The hash function currently receives the table and the key as
74492 + arguments, so it can take the number of buckets from the table. If
74493 + this interface is a problem let me know.
74494 +
74495 + This hash table uses a singly-linked hash chain. This means
74496 + insertion is fast but deletion requires searching the chain.
74497 +
74498 + There is also the doubly-linked hash chain approach, under which
74499 + deletion requires no search but the code is longer and it takes two
74500 + pointers per item.
74501 +
74502 + The circularly-linked approach has the shortest code but requires
74503 + two pointers per bucket, doubling the size of the bucket array (in
74504 + addition to two pointers per item).
74505 +*/
74506 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
74507 + \
74508 +static __inline__ void \
74509 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
74510 + __u32 hash UNUSED_ARG) \
74511 +{ \
74512 + assert("nikita-2780", hash < table->_buckets); \
74513 +} \
74514 + \
74515 +static __inline__ int \
74516 +PREFIX##_hash_init (PREFIX##_hash_table *hash, \
74517 + __u32 buckets) \
74518 +{ \
74519 + hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
74520 + hash->_buckets = buckets; \
74521 + if (hash->_table == NULL) \
74522 + { \
74523 + return RETERR(-ENOMEM); \
74524 + } \
74525 + memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
74526 + ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
74527 + return 0; \
74528 +} \
74529 + \
74530 +static __inline__ void \
74531 +PREFIX##_hash_done (PREFIX##_hash_table *hash) \
74532 +{ \
74533 + if (REISER4_DEBUG && hash->_table != NULL) { \
74534 + __u32 i; \
74535 + for (i = 0 ; i < hash->_buckets ; ++ i) \
74536 + assert("nikita-2905", hash->_table[i] == NULL); \
74537 + } \
74538 + if (hash->_table != NULL) \
74539 + KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
74540 + hash->_table = NULL; \
74541 +} \
74542 + \
74543 +static __inline__ void \
74544 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
74545 +{ \
74546 + prefetch(item->LINK_NAME._next); \
74547 +} \
74548 + \
74549 +static __inline__ void \
74550 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
74551 + __u32 index) \
74552 +{ \
74553 + prefetch(hash->_table[index]); \
74554 +} \
74555 + \
74556 +static __inline__ ITEM_TYPE* \
74557 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
74558 + __u32 hash_index, \
74559 + KEY_TYPE const *find_key) \
74560 +{ \
74561 + ITEM_TYPE *item; \
74562 + \
74563 + PREFIX##_check_hash(hash, hash_index); \
74564 + \
74565 + for (item = hash->_table[hash_index]; \
74566 + item != NULL; \
74567 + item = item->LINK_NAME._next) \
74568 + { \
74569 + prefetch(item->LINK_NAME._next); \
74570 + prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
74571 + if (EQ_FUNC (& item->KEY_NAME, find_key)) \
74572 + { \
74573 + return item; \
74574 + } \
74575 + } \
74576 + \
74577 + return NULL; \
74578 +} \
74579 + \
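/* the _lru variant below performs move-to-front on a hit: the found item is \
   unlinked from its chain and re-linked at the bucket head, so recently     \
   looked-up items are found faster next time */                             \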
74580 +static __inline__ ITEM_TYPE* \
74581 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
74582 + __u32 hash_index, \
74583 + KEY_TYPE const *find_key) \
74584 +{ \
74585 + ITEM_TYPE ** item = &hash->_table[hash_index]; \
74586 + \
74587 + PREFIX##_check_hash(hash, hash_index); \
74588 + \
74589 + while (*item != NULL) { \
74590 + prefetch(&(*item)->LINK_NAME._next); \
74591 + if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
74592 + ITEM_TYPE *found; \
74593 + \
74594 + found = *item; \
74595 + *item = found->LINK_NAME._next; \
74596 + found->LINK_NAME._next = hash->_table[hash_index]; \
74597 + hash->_table[hash_index] = found; \
74598 + return found; \
74599 + } \
74600 + item = &(*item)->LINK_NAME._next; \
74601 + } \
74602 + return NULL; \
74603 +} \
74604 + \
74605 +static __inline__ int \
74606 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
74607 + __u32 hash_index, \
74608 + ITEM_TYPE *del_item) \
74609 +{ \
74610 + ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
74611 + \
74612 + PREFIX##_check_hash(hash, hash_index); \
74613 + \
74614 + while (*hash_item_p != NULL) { \
74615 + prefetch(&(*hash_item_p)->LINK_NAME._next); \
74616 + if (*hash_item_p == del_item) { \
74617 + *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
74618 + return 1; \
74619 + } \
74620 + hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
74621 + } \
74622 + return 0; \
74623 +} \
74624 + \
74625 +static __inline__ void \
74626 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
74627 + __u32 hash_index, \
74628 + ITEM_TYPE *ins_item) \
74629 +{ \
74630 + PREFIX##_check_hash(hash, hash_index); \
74631 + \
74632 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
74633 + hash->_table[hash_index] = ins_item; \
74634 +} \
74635 + \
74636 +static __inline__ void \
74637 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
74638 + __u32 hash_index, \
74639 + ITEM_TYPE *ins_item) \
74640 +{ \
74641 + PREFIX##_check_hash(hash, hash_index); \
74642 + \
74643 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
74644 + smp_wmb(); \
74645 + hash->_table[hash_index] = ins_item; \
74646 +} \
74647 + \
74648 +static __inline__ ITEM_TYPE* \
74649 +PREFIX##_hash_find (PREFIX##_hash_table *hash, \
74650 + KEY_TYPE const *find_key) \
74651 +{ \
74652 + return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
74653 +} \
74654 + \
74655 +static __inline__ ITEM_TYPE* \
74656 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
74657 + KEY_TYPE const *find_key) \
74658 +{ \
74659 + return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
74660 +} \
74661 + \
74662 +static __inline__ int \
74663 +PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
74664 + ITEM_TYPE *del_item) \
74665 +{ \
74666 + return PREFIX##_hash_remove_index (hash, \
74667 + HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
74668 +} \
74669 + \
74670 +static __inline__ int \
74671 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
74672 + ITEM_TYPE *del_item) \
74673 +{ \
74674 + return PREFIX##_hash_remove (hash, del_item); \
74675 +} \
74676 + \
74677 +static __inline__ void \
74678 +PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
74679 + ITEM_TYPE *ins_item) \
74680 +{ \
74681 + return PREFIX##_hash_insert_index (hash, \
74682 + HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
74683 +} \
74684 + \
74685 +static __inline__ void \
74686 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
74687 + ITEM_TYPE *ins_item) \
74688 +{ \
74689 + return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
74690 + ins_item); \
74691 +} \
74692 + \
74693 +static __inline__ ITEM_TYPE * \
74694 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
74695 +{ \
74696 + ITEM_TYPE *first; \
74697 + \
74698 + for (first = NULL; ind < hash->_buckets; ++ ind) { \
74699 + first = hash->_table[ind]; \
74700 + if (first != NULL) \
74701 + break; \
74702 + } \
74703 + return first; \
74704 +} \
74705 + \
74706 +static __inline__ ITEM_TYPE * \
74707 +PREFIX##_hash_next (PREFIX##_hash_table *hash, \
74708 + ITEM_TYPE *item) \
74709 +{ \
74710 + ITEM_TYPE *next; \
74711 + \
74712 + if (item == NULL) \
74713 + return NULL; \
74714 + next = item->LINK_NAME._next; \
74715 + if (next == NULL) \
74716 + next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
74717 + return next; \
74718 +} \
74719 + \
74720 +typedef struct {} PREFIX##_hash_dummy
74721 +
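/* A hypothetical end-to-end example of the three steps described before the
 * macro above; foo_t, foo_hash() and foo_eq() are invented for illustration
 * and do not appear elsewhere in reiser4. */
typedef struct foo foo_t;

TYPE_SAFE_HASH_DECLARE(foo, foo_t);		/* step 1: table/link types */

struct foo {					/* step 2: item carries the link */
	__u32 key;
	foo_hash_link link;
};

static __inline__ __u32 foo_hash(foo_hash_table *table, const __u32 *key)
{
	return *key % table->_buckets;		/* trivial hash for the sketch */
}

static __inline__ int foo_eq(const __u32 *a, const __u32 *b)
{
	return *a == *b;
}

/* step 3: generates foo_hash_init/_find/_insert/_remove and friends */
TYPE_SAFE_HASH_DEFINE(foo, foo_t, __u32, key, link, foo_hash, foo_eq);

/* a table would then be set up with foo_hash_init(&table, 128) and queried
 * with foo_hash_find(&table, &key) */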
74722 +#define for_all_ht_buckets(table, head) \
74723 +for ((head) = &(table) -> _table[ 0 ] ; \
74724 + (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
74725 +
74726 +#define for_all_in_bucket(bucket, item, next, field) \
74727 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
74728 + (item) != NULL ; \
74729 + (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
74730 +
74731 +#define for_all_in_htable(table, prefix, item, next) \
74732 +for ((item) = prefix ## _hash_first ((table), 0), \
74733 + (next) = prefix ## _hash_next ((table), (item)) ; \
74734 + (item) != NULL ; \
74735 + (item) = (next), \
74736 + (next) = prefix ## _hash_next ((table), (item)))
74737 +
74738 +/* __REISER4_TYPE_SAFE_HASH_H__ */
74739 +#endif
74740 +
74741 +/* Make Linus happy.
74742 + Local variables:
74743 + c-indentation-style: "K&R"
74744 + mode-name: "LC"
74745 + c-basic-offset: 8
74746 + tab-width: 8
74747 + fill-column: 120
74748 + End:
74749 +*/
74750 diff -urN linux-2.6.33.orig/fs/reiser4/vfs_ops.c linux-2.6.33/fs/reiser4/vfs_ops.c
74751 --- linux-2.6.33.orig/fs/reiser4/vfs_ops.c 1970-01-01 01:00:00.000000000 +0100
74752 +++ linux-2.6.33/fs/reiser4/vfs_ops.c 2010-03-04 19:33:22.000000000 +0100
74753 @@ -0,0 +1,267 @@
74754 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74755 + * reiser4/README */
74756 +
74757 +/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
74758 + here. */
74759 +
74760 +#include "forward.h"
74761 +#include "debug.h"
74762 +#include "dformat.h"
74763 +#include "coord.h"
74764 +#include "plugin/item/item.h"
74765 +#include "plugin/file/file.h"
74766 +#include "plugin/security/perm.h"
74767 +#include "plugin/disk_format/disk_format.h"
74768 +#include "plugin/plugin.h"
74769 +#include "plugin/plugin_set.h"
74770 +#include "plugin/object.h"
74771 +#include "txnmgr.h"
74772 +#include "jnode.h"
74773 +#include "znode.h"
74774 +#include "block_alloc.h"
74775 +#include "tree.h"
74776 +#include "vfs_ops.h"
74777 +#include "inode.h"
74778 +#include "page_cache.h"
74779 +#include "ktxnmgrd.h"
74780 +#include "super.h"
74781 +#include "reiser4.h"
74782 +#include "entd.h"
74783 +#include "status_flags.h"
74784 +#include "flush.h"
74785 +#include "dscale.h"
74786 +
74787 +#include <linux/profile.h>
74788 +#include <linux/types.h>
74789 +#include <linux/mount.h>
74790 +#include <linux/vfs.h>
74791 +#include <linux/mm.h>
74792 +#include <linux/buffer_head.h>
74793 +#include <linux/dcache.h>
74794 +#include <linux/list.h>
74795 +#include <linux/pagemap.h>
74796 +#include <linux/slab.h>
74797 +#include <linux/seq_file.h>
74798 +#include <linux/init.h>
74799 +#include <linux/module.h>
74800 +#include <linux/writeback.h>
74801 +#include <linux/blkdev.h>
74802 +#include <linux/quotaops.h>
74803 +#include <linux/security.h>
74804 +#include <linux/reboot.h>
74805 +#include <linux/rcupdate.h>
74806 +
74807 +/* update inode stat-data by calling plugin */
74808 +int reiser4_update_sd(struct inode *object)
74809 +{
74810 + file_plugin *fplug;
74811 +
74812 + assert("nikita-2338", object != NULL);
74813 + /* check for read-only file system. */
74814 + if (IS_RDONLY(object))
74815 + return 0;
74816 +
74817 + fplug = inode_file_plugin(object);
74818 + assert("nikita-2339", fplug != NULL);
74819 + return fplug->write_sd_by_inode(object);
74820 +}
74821 +
74822 +/* helper function: increase inode nlink count and call plugin method to save
74823 + updated stat-data.
74824 +
74825 + Used by link/create and during creation of dot and dotdot in mkdir
74826 +*/
74827 +int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
74828 + struct inode *parent /* parent where new entry will be */
74829 + ,
74830 + int write_sd_p /* true if stat-data has to be
74831 + * updated */ )
74832 +{
74833 + file_plugin *fplug;
74834 + int result;
74835 +
74836 + assert("nikita-1351", object != NULL);
74837 +
74838 + fplug = inode_file_plugin(object);
74839 + assert("nikita-1445", fplug != NULL);
74840 +
74841 + /* ask plugin whether it can add yet another link to this
74842 + object */
74843 + if (!fplug->can_add_link(object))
74844 + return RETERR(-EMLINK);
74845 +
74846 + assert("nikita-2211", fplug->add_link != NULL);
74847 + /* call plugin to do actual addition of link */
74848 + result = fplug->add_link(object, parent);
74849 +
74850 + /* optionally update stat data */
74851 + if (result == 0 && write_sd_p)
74852 + result = fplug->write_sd_by_inode(object);
74853 + return result;
74854 +}
74855 +
74856 +/* helper function: decrease inode nlink count and call plugin method to save
74857 + updated stat-data.
74858 +
74859 + Used by unlink/create
74860 +*/
74861 +int reiser4_del_nlink(struct inode *object /* object from which link is
74862 + * removed */ ,
74863 + struct inode *parent /* parent where entry was */ ,
74864 + int write_sd_p /* true if stat-data has to be
74865 + * updated */ )
74866 +{
74867 + file_plugin *fplug;
74868 + int result;
74869 +
74870 + assert("nikita-1349", object != NULL);
74871 +
74872 + fplug = inode_file_plugin(object);
74873 + assert("nikita-1350", fplug != NULL);
74874 + assert("nikita-1446", object->i_nlink > 0);
74875 + assert("nikita-2210", fplug->rem_link != NULL);
74876 +
74877 + /* call plugin to do actual deletion of link */
74878 + result = fplug->rem_link(object, parent);
74879 +
74880 + /* optionally update stat data */
74881 + if (result == 0 && write_sd_p)
74882 + result = fplug->write_sd_by_inode(object);
74883 + return result;
74884 +}
74885 +
74886 +/* Release reiser4 dentry. This is d_op->d_release() method. */
74887 +static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
74888 +{
74889 + reiser4_free_dentry_fsdata(dentry);
74890 +}
74891 +
74892 +/*
74893 + * Called by reiser4_sync_inodes(), during speculative write-back (through
74894 + * pdflush, or balance_dirty_pages()).
74895 + */
74896 +void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
74897 +{
74898 + long written = 0;
74899 + int repeats = 0;
74900 + int result;
74901 + struct address_space *mapping;
74902 +
74903 + /*
74904 + * Performs early flushing, trying to free some memory. If there is
74905 + * nothing to flush, commits some atoms.
74906 + */
74907 +
74908 + /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
74909 + sys_fsync(). */
74910 + if (wbc->sync_mode != WB_SYNC_NONE) {
74911 + txnmgr_force_commit_all(sb, 0);
74912 + return;
74913 + }
74914 +
74915 + BUG_ON(reiser4_get_super_fake(sb) == NULL);
74916 + mapping = reiser4_get_super_fake(sb)->i_mapping;
74917 + do {
74918 + long nr_submitted = 0;
74919 + jnode *node = NULL;
74920 +
74921 + /* do not submit more requests when the write queue is congested */
74922 + if (wbc->nonblocking &&
74923 + bdi_write_congested(mapping->backing_dev_info)) {
74924 + blk_run_address_space(mapping);
74925 + wbc->encountered_congestion = 1;
74926 + break;
74927 + }
74928 + repeats++;
74929 + BUG_ON(wbc->nr_to_write <= 0);
74930 +
74931 + if (get_current_context()->entd) {
74932 + entd_context *ent = get_entd_context(sb);
74933 +
74934 + if (ent->cur_request->node)
74935 + /*
74936 + * this is the ent thread and it managed to capture
74937 + * the requested page itself - start flush from
74938 + * that page
74939 + */
74940 + node = ent->cur_request->node;
74941 + }
74942 +
74943 + result = flush_some_atom(node, &nr_submitted, wbc,
74944 + JNODE_FLUSH_WRITE_BLOCKS);
74945 + if (result != 0)
74946 + warning("nikita-31001", "Flush failed: %i", result);
74947 + if (node)
74948 + /* drop the reference acquired
74949 + in find_or_create_extent() */
74950 + jput(node);
74951 + if (!nr_submitted)
74952 + break;
74953 +
74954 + wbc->nr_to_write -= nr_submitted;
74955 + written += nr_submitted;
74956 + } while (wbc->nr_to_write > 0);
74957 +}
74958 +
74959 +/* tell VM how many pages were dirtied */
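/* Inferred explanation of the pattern below: journal_info is cleared so that
   writeback code invoked from balance_dirty_pages_ratelimited_nr() does not
   mistake this thread for one running inside a reiser4 transaction, and is
   restored afterwards. */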
74960 +void reiser4_throttle_write(struct inode *inode, int nrpages)
74961 +{
74962 + reiser4_context *ctx;
74963 +
74964 + ctx = get_current_context();
74965 + reiser4_txn_restart(ctx);
74966 + current->journal_info = NULL;
74967 + balance_dirty_pages_ratelimited_nr(inode->i_mapping, nrpages);
74968 + current->journal_info = ctx;
74969 +}
74970 +
74971 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
74972 +const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
74973 + * beginning of device */
74974 +
74975 +/*
74976 + * Reiser4 initialization/shutdown.
74977 + *
74978 + * Code below performs global reiser4 initialization that is done either as
74979 + * part of kernel initialization (when reiser4 is statically built-in), or
74980 + * during reiser4 module load (when compiled as module).
74981 + */
74982 +
74983 +void reiser4_handle_error(void)
74984 +{
74985 + struct super_block *sb = reiser4_get_current_sb();
74986 +
74987 + if (!sb)
74988 + return;
74989 + reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
74990 + "Filesystem error occurred");
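	/* onerror semantics as inferred from the cases below:
	 * 0 - panic, anything else - remount the filesystem read-only */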
74991 + switch (get_super_private(sb)->onerror) {
74992 + case 0:
74993 + reiser4_panic("foobar-42", "Filesystem error occurred\n");
74994 + case 1:
74995 + default:
74996 + if (sb->s_flags & MS_RDONLY)
74997 + return;
74998 + sb->s_flags |= MS_RDONLY;
74999 + break;
75000 + }
75001 +}
75002 +
75003 +struct dentry_operations reiser4_dentry_operations = {
75004 + .d_revalidate = NULL,
75005 + .d_hash = NULL,
75006 + .d_compare = NULL,
75007 + .d_delete = NULL,
75008 + .d_release = reiser4_d_release,
75009 + .d_iput = NULL,
75010 +};
75011 +
75012 +/* Make Linus happy.
75013 + Local variables:
75014 + c-indentation-style: "K&R"
75015 + mode-name: "LC"
75016 + c-basic-offset: 8
75017 + tab-width: 8
75018 + fill-column: 120
75019 + End:
75020 +*/
75021 diff -urN linux-2.6.33.orig/fs/reiser4/vfs_ops.h linux-2.6.33/fs/reiser4/vfs_ops.h
75022 --- linux-2.6.33.orig/fs/reiser4/vfs_ops.h 1970-01-01 01:00:00.000000000 +0100
75023 +++ linux-2.6.33/fs/reiser4/vfs_ops.h 2010-03-04 19:33:22.000000000 +0100
75024 @@ -0,0 +1,53 @@
75025 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75026 + * reiser4/README */
75027 +
75028 +/* vfs_ops.c's exported symbols */
75029 +
75030 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
75031 +#define __FS_REISER4_VFS_OPS_H__
75032 +
75033 +#include "forward.h"
75034 +#include "coord.h"
75035 +#include "seal.h"
75036 +#include "plugin/file/file.h"
75037 +#include "super.h"
75038 +#include "readahead.h"
75039 +
75040 +#include <linux/types.h> /* for loff_t */
75041 +#include <linux/fs.h> /* for struct address_space */
75042 +#include <linux/dcache.h> /* for struct dentry */
75043 +#include <linux/mm.h>
75044 +#include <linux/backing-dev.h>
75045 +
75046 +/* address space operations */
75047 +int reiser4_writepage(struct page *, struct writeback_control *);
75048 +int reiser4_set_page_dirty(struct page *);
75049 +void reiser4_invalidatepage(struct page *, unsigned long offset);
75050 +int reiser4_releasepage(struct page *, gfp_t);
75051 +
75052 +extern int reiser4_update_sd(struct inode *);
75053 +extern int reiser4_add_nlink(struct inode *, struct inode *, int);
75054 +extern int reiser4_del_nlink(struct inode *, struct inode *, int);
75055 +
75056 +extern int reiser4_start_up_io(struct page *page);
75057 +extern void reiser4_throttle_write(struct inode *, int nrpages);
75058 +extern int jnode_is_releasable(jnode *);
75059 +
75060 +#define CAPTURE_APAGE_BURST (1024l)
75061 +void reiser4_writeout(struct super_block *, struct writeback_control *);
75062 +
75063 +extern void reiser4_handle_error(void);
75064 +
75065 +/* __FS_REISER4_VFS_OPS_H__ */
75066 +#endif
75067 +
75068 +/* Make Linus happy.
75069 + Local variables:
75070 + c-indentation-style: "K&R"
75071 + mode-name: "LC"
75072 + c-basic-offset: 8
75073 + tab-width: 8
75074 + fill-column: 120
75075 + scroll-step: 1
75076 + End:
75077 +*/
75078 diff -urN linux-2.6.33.orig/fs/reiser4/wander.c linux-2.6.33/fs/reiser4/wander.c
75079 --- linux-2.6.33.orig/fs/reiser4/wander.c 1970-01-01 01:00:00.000000000 +0100
75080 +++ linux-2.6.33/fs/reiser4/wander.c 2010-03-04 19:33:22.000000000 +0100
75081 @@ -0,0 +1,1798 @@
75082 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75083 + * reiser4/README */
75084 +
75085 +/* Reiser4 Wandering Log */
75086 +
75087 +/* You should read http://www.namesys.com/txn-doc.html
75088 +
75089 + That describes how filesystem operations are performed as atomic
75090 + transactions, and how we try to arrange it so that we can write most of the
75091 + data only once while performing the operation atomically.
75092 +
75093 + For the purposes of this code, it is enough to understand that it is
75094 + told whether a given block should be written once or twice (if twice,
75095 + then once to the wandered location and once to the real location).
75096 +
75097 + This code guarantees that those blocks that are defined to be part of an
75098 + atom either all take effect or none of them take effect.
75099 +
75100 + The "relocate set" of nodes are submitted to write by the jnode_flush()
75101 + routine, and the "overwrite set" is submitted by reiser4_write_log().
75102 + This is because with the overwrite set we seek to optimize writes, and
75103 + with the relocate set we seek to cause disk order to correlate with the
75104 + "parent first order" (preorder).
75105 +
75106 + reiser4_write_log() allocates and writes wandered blocks and maintains
75107 + additional on-disk structures of the atom as wander records (each wander
75108 + record occupies one block) for storing the "wandered map" (a table which
75109 + contains the mapping between wandered and real block numbers) and other
75110 + information which might be needed at transaction recovery time.
75111 +
75112 + The wander records are unidirectionally linked into a circle: each wander
75113 + record contains the block number of the next wander record; the last wander
75114 + record points to the first one.
75115 +
75116 + One wander record (named "tx head" in this file) has a format which is
75117 + different from the other wander records. The "tx head" has a reference to the
75118 + "tx head" block of the previously committed atom. Also, "tx head" contains
75119 + fs information (the free blocks counter, and the oid allocator state) which
75120 + is logged in a special way.
75121 +
75122 + There are two journal control blocks, named journal header and journal
75123 + footer which have fixed on-disk locations. The journal header has a
75124 + reference to the "tx head" block of the last committed atom. The journal
75125 + footer points to the "tx head" of the last flushed atom. The atom is
75126 + "played" when all blocks from its overwrite set are written to disk the
75127 + second time (i.e. written to their real locations).
75128 +
75129 + NOTE: People who know reiserfs internals and its journal structure might be
75130 + confused by the terms journal footer and journal header. There is a table
75131 + with terms of similar semantics in reiserfs (reiser3) and reiser4:
75132 +
75133 + REISER3 TERM | REISER4 TERM | DESCRIPTION
75134 + --------------------+-----------------------+----------------------------
75135 + commit record | journal header | atomic write of this record
75136 + | | ends transaction commit
75137 + --------------------+-----------------------+----------------------------
75138 + journal header | journal footer | atomic write of this record
75139 + | | ends post-commit writes.
75140 + | | After successful writing
75141 + | | of this record, journal
75142 + | | blocks (in reiser3) or
75143 + | | wandered blocks/records
75144 + | | (in reiser4) are free for re-use.
75145 + --------------------+-----------------------+----------------------------
75146 +
75147 + The atom commit process is the following:
75148 +
75149 + 1. The overwrite set is taken from atom's clean list, and its size is
75150 + counted.
75151 +
75152 + 2. The number of necessary wander records (including tx head) is calculated,
75153 + and the wander record blocks are allocated.
75154 +
75155 + 3. Allocate wandered blocks and populate the wander records with the wandered map.
75156 +
75157 + 4. Submit write requests for wander records and wandered blocks.
75158 +
75159 + 5. Wait until submitted write requests complete.
75160 +
75161 + 6. Update the journal header: change the pointer to the block number of the
75162 + just written tx head, submit i/o for the modified journal header block and
75163 + wait for i/o completion.
75164 +
75165 + NOTE: The special logging for bitmap blocks and some reiser4 super block
75166 + fields makes processes of atom commit, flush and recovering a bit more
75167 + complex (see comments in the source code for details).
75168 +
75169 + The atom playing process is the following:
75170 +
75171 + 1. Write atom's overwrite set in-place.
75172 +
75173 + 2. Wait on i/o.
75174 +
75175 + 3. Update the journal footer: change the pointer to the block number of the
75176 + tx head block of the atom we are currently flushing, submit an i/o, wait on
75177 + i/o completion.
75178 +
75179 + 4. Free disk space which was used for wandered blocks and wander records.
75180 +
75181 + After the wandered blocks and wander records are freed, the journal footer
75182 + points to an on-disk structure which might be overwritten soon. Neither the
75183 + log writer nor the journal recovery procedure uses that pointer for
75184 + accessing the data. When the journal recovery procedure looks for the
75185 + oldest transaction it compares the journal footer pointer value with the
75186 + "prev_tx" pointer value in each tx head; if the values are equal, the
75187 + oldest not yet flushed transaction has been found.
75188 +
75189 + NOTE on disk space leakage: the information about which blocks and how many
75190 + blocks are allocated for wandered blocks and wander records is not written
75191 + to the disk, because of the special logging for bitmaps and some super
75192 + block counters. After a system crash reiser4 does not remember those
75193 + allocations, so there is no disk space leakage of this kind.
75194 +*/
75195 +
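/* A simplified sketch of the oldest-transaction search described above. The
 * helper example_read_prev_tx() is hypothetical: it stands for loading a tx
 * head block from disk and returning its "prev_tx" field. */
static reiser4_block_nr example_read_prev_tx(reiser4_block_nr tx_head_block);

static reiser4_block_nr example_find_oldest_tx(reiser4_block_nr last_committed,
					       reiser4_block_nr last_flushed)
{
	reiser4_block_nr cur = last_committed;

	/* walk the prev_tx chain backwards; the tx head whose prev_tx equals
	 * the journal footer pointer is the oldest not yet flushed one */
	while (example_read_prev_tx(cur) != last_flushed)
		cur = example_read_prev_tx(cur);
	return cur;
}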
75196 +/* Special logging of reiser4 super block fields. */
75197 +
75198 +/* There are some reiser4 super block fields (the free block count and the OID
75199 + allocator state, i.e. number of files and next free OID) which are logged
75200 + separately from the super block to avoid unnecessary atom fusion.
75201 +
75202 + So, the reiser4 super block need not be captured by a transaction which
75203 + allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
75204 + the reiser4 on-disk super block is not touched when such a transaction is
75205 + committed and flushed. Those "counters logged specially" are logged in "tx
75206 + head" blocks and in the journal footer block.
75207 +
75208 + A step-by-step description of special logging:
75209 +
75210 + 0. The per-atom information about deleted or created files and allocated or
75211 + freed blocks is collected during the transaction. The atom's
75212 + ->nr_objects_created and ->nr_objects_deleted are for object
75213 + deletion/creation tracking, the numbers of allocated and freed blocks are
75214 + calculated using atom's delete set and atom's capture list -- all new and
75215 + relocated nodes should be on atom's clean list and should have JNODE_RELOC
75216 + bit set.
75217 +
75218 + 1. The "logged specially" reiser4 super block fields have their "committed"
75219 + versions in the reiser4 in-memory super block. They get modified only at
75220 + atom commit time. The atom's commit thread has exclusive access to those
75221 + "committed" fields because the log writer implementation supports only one
75222 + atom commit at a time (there is a per-fs "commit" mutex). At
75223 + that time the "committed" counters are modified using per-atom information
75224 + collected during the transaction. These counters are stored on disk as
75225 + part of the tx head block when the atom is committed.
75226 +
75227 + 2. When the atom is flushed the free block counter value and the OID
75228 + allocator state are written to the journal footer block. A special journal
75229 + procedure (journal_recover_sb_data()) takes those values from the journal
75230 + footer and updates the reiser4 in-memory super block.
75231 +
75232 + NOTE: That means free block count and OID allocator state are logged
75233 + separately from the reiser4 super block regardless of the fact that the
75234 + reiser4 super block has fields to store both the free block counter and the
75235 + OID allocator.
75236 +
75237 + Writing the whole super block at commit time requires knowing the true
75238 + values of all its fields without the changes made by not yet committed
75239 + transactions. That would be possible by keeping a "committed" version of
75240 + the super block, just as the reiser4 bitmap blocks have "committed" and
75241 + "working" versions. However, another scheme was implemented, which stores
75242 + the specially logged values in the unused free space inside the transaction
75243 + head block. In my opinion it has the advantage of not writing the whole
75244 + super block when only part of it was modified. */
75245 +
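/* A minimal sketch of the recovery step (2) described above: read the
 * specially logged counters back from the journal footer and install them in
 * the in-memory super block. The in-memory field name blocks_free_committed
 * and the handling of nr_files/next_oid are assumptions for illustration. */
static void example_recover_sb_counters(struct super_block *s,
					struct journal_footer *footer)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);

	/* inverse of the put_unaligned(cpu_to_le64(...)) stores used at
	 * commit time elsewhere in this file */
	sbinfo->blocks_free_committed =
		le64_to_cpu(get_unaligned(&footer->free_blocks));
	/* nr_files and next_oid would be handed to the OID allocator here */
}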
75246 +#include "debug.h"
75247 +#include "dformat.h"
75248 +#include "txnmgr.h"
75249 +#include "jnode.h"
75250 +#include "znode.h"
75251 +#include "block_alloc.h"
75252 +#include "page_cache.h"
75253 +#include "wander.h"
75254 +#include "reiser4.h"
75255 +#include "super.h"
75256 +#include "vfs_ops.h"
75257 +#include "writeout.h"
75258 +#include "inode.h"
75259 +#include "entd.h"
75260 +
75261 +#include <linux/types.h>
75262 +#include <linux/fs.h> /* for struct super_block */
75263 +#include <linux/mm.h> /* for struct page */
75264 +#include <linux/pagemap.h>
75265 +#include <linux/bio.h> /* for struct bio */
75266 +#include <linux/blkdev.h>
75267 +
75268 +static int write_jnodes_to_disk_extent(
75269 + jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
75270 +
75271 +/* The commit_handle is a container for objects needed at atom commit time */
75272 +struct commit_handle {
75273 + /* A pointer to atom's list of OVRWR nodes */
75274 + struct list_head *overwrite_set;
75275 + /* atom's overwrite set size */
75276 + int overwrite_set_size;
75277 + /* jnodes for wander record blocks */
75278 + struct list_head tx_list;
75279 + /* number of wander records */
75280 + __u32 tx_size;
75281 + /* 'committed' sb counters are saved here until atom is completely
75282 + flushed */
75283 + __u64 free_blocks;
75284 + __u64 nr_files;
75285 + __u64 next_oid;
75286 + /* A pointer to the atom which is being committed */
75287 + txn_atom *atom;
75288 + /* A pointer to current super block */
75289 + struct super_block *super;
75290 + /* The counter of modified bitmaps */
75291 + reiser4_block_nr nr_bitmap;
75292 +};
75293 +
75294 +static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
75295 +{
75296 + memset(ch, 0, sizeof(struct commit_handle));
75297 + INIT_LIST_HEAD(&ch->tx_list);
75298 +
75299 + ch->atom = atom;
75300 + ch->super = reiser4_get_current_sb();
75301 +}
75302 +
75303 +static void done_commit_handle(struct commit_handle *ch)
75304 +{
75305 + assert("zam-690", list_empty(&ch->tx_list));
75306 +}
75307 +
75308 +static inline int reiser4_use_write_barrier(struct super_block * s)
75309 +{
75310 + return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
75311 +}
75312 +
75313 +static void disable_write_barrier(struct super_block * s)
75314 +{
75315 + notice("zam-1055", "%s does not support write barriers,"
75316 + " using synchronous write instead.", s->s_id);
75317 + set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
75318 +}
75319 +
75320 +/* fill journal header block data */
75321 +static void format_journal_header(struct commit_handle *ch)
75322 +{
75323 + struct reiser4_super_info_data *sbinfo;
75324 + struct journal_header *header;
75325 + jnode *txhead;
75326 +
75327 + sbinfo = get_super_private(ch->super);
75328 + assert("zam-479", sbinfo != NULL);
75329 + assert("zam-480", sbinfo->journal_header != NULL);
75330 +
75331 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
75332 +
75333 + jload(sbinfo->journal_header);
75334 +
75335 + header = (struct journal_header *)jdata(sbinfo->journal_header);
75336 + assert("zam-484", header != NULL);
75337 +
75338 + put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
75339 + &header->last_committed_tx);
75340 +
75341 + jrelse(sbinfo->journal_header);
75342 +}
75343 +
75344 +/* fill journal footer block data */
75345 +static void format_journal_footer(struct commit_handle *ch)
75346 +{
75347 + struct reiser4_super_info_data *sbinfo;
75348 + struct journal_footer *footer;
75349 + jnode *tx_head;
75350 +
75351 + sbinfo = get_super_private(ch->super);
75352 +
75353 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
75354 +
75355 + assert("zam-493", sbinfo != NULL);
75356 + assert("zam-494", sbinfo->journal_header != NULL);
75357 +
75358 + check_me("zam-691", jload(sbinfo->journal_footer) == 0);
75359 +
75360 + footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
75361 + assert("zam-495", footer != NULL);
75362 +
75363 + put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
75364 + &footer->last_flushed_tx);
75365 + put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
75366 +
75367 + put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
75368 + put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
75369 +
75370 + jrelse(sbinfo->journal_footer);
75371 +}
75372 +
75373 +/* wander record capacity depends on current block size */
75374 +static int wander_record_capacity(const struct super_block *super)
75375 +{
75376 + return (super->s_blocksize -
75377 + sizeof(struct wander_record_header)) /
75378 + sizeof(struct wander_entry);
75379 +}
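/* Worked example (the struct sizes here are assumptions, not taken from
 * wander.h): with a 4096-byte block, a 32-byte wander_record_header and
 * 16-byte wander_entry pairs, capacity = (4096 - 32) / 16 = 254 entries per
 * wander record. */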
75380 +
75381 +/* Fill the first wander record (tx head) in accordance with the given data */
75382 +static void format_tx_head(struct commit_handle *ch)
75383 +{
75384 + jnode *tx_head;
75385 + jnode *next;
75386 + struct tx_header *header;
75387 +
75388 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
75389 + assert("zam-692", &ch->tx_list != &tx_head->capture_link);
75390 +
75391 + next = list_entry(tx_head->capture_link.next, jnode, capture_link);
75392 + if (&ch->tx_list == &next->capture_link)
75393 + next = tx_head;
75394 +
75395 + header = (struct tx_header *)jdata(tx_head);
75396 +
75397 + assert("zam-460", header != NULL);
75398 + assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
75399 +
75400 + memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
75401 + memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
75402 +
75403 + put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
75404 + put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
75405 + &header->prev_tx);
75406 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
75407 + put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
75408 + put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
75409 + put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
75410 +}
75411 +
75412 +/* prepare ordinary wander record block (fill all service fields) */
75413 +static void
75414 +format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
75415 +{
75416 + struct wander_record_header *LRH;
75417 + jnode *next;
75418 +
75419 + assert("zam-464", node != NULL);
75420 +
75421 + LRH = (struct wander_record_header *)jdata(node);
75422 + next = list_entry(node->capture_link.next, jnode, capture_link);
75423 +
75424 + if (&ch->tx_list == &next->capture_link)
75425 + next = list_entry(ch->tx_list.next, jnode, capture_link);
75426 +
75427 + assert("zam-465", LRH != NULL);
75428 + assert("zam-463",
75429 + ch->super->s_blocksize > sizeof(struct wander_record_header));
75430 +
75431 + memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
75432 + memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
75433 +
75434 + put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
75435 + put_unaligned(cpu_to_le32(serial), &LRH->serial);
75436 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
75437 +}
75438 +
75439 +/* add one wandered map entry to formatted wander record */
75440 +static void
75441 +store_entry(jnode * node, int index, const reiser4_block_nr * a,
75442 + const reiser4_block_nr * b)
75443 +{
75444 + char *data;
75445 + struct wander_entry *pairs;
75446 +
75447 + data = jdata(node);
75448 + assert("zam-451", data != NULL);
75449 +
75450 + pairs =
75451 + (struct wander_entry *)(data + sizeof(struct wander_record_header));
75452 +
75453 + put_unaligned(cpu_to_le64(*a), &pairs[index].original);
75454 + put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
75455 +}
75456 +
75457 +/* currently, wander records contain only the wandered map, whose size depends
75458 + on the overwrite set size */
75459 +static void get_tx_size(struct commit_handle *ch)
75460 +{
75461 + assert("zam-440", ch->overwrite_set_size != 0);
75462 + assert("zam-695", ch->tx_size == 0);
75463 +
75464 + /* count all ordinary wander records
75465 + (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
75466 + for tx head block */
75467 + ch->tx_size =
75468 + (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
75469 + 2;
75470 +}
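/* Worked example, using the assumed capacity of 254 entries per record from
 * the earlier note: an overwrite set of 300 blocks needs
 * (300 - 1) / 254 + 1 = 2 ordinary wander records, plus 1 for the tx head,
 * so ch->tx_size = 3. */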
75471 +
75472 +/* A special structure used in store_wmap_actor() for saving its state
75473 + between calls */
75474 +struct store_wmap_params {
75475 + jnode *cur; /* jnode of current wander record to fill */
75476 + int idx; /* free element index in wander record */
75477 + int capacity; /* capacity */
75478 +
75479 +#if REISER4_DEBUG
75480 + struct list_head *tx_list;
75481 +#endif
75482 +};
75483 +
75484 +/* an actor for use in the blocknr_set_iterator() routine which populates the
75485 + list of pre-formatted wander records with wandered map info */
75486 +static int
75487 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
75488 + const reiser4_block_nr * b, void *data)
75489 +{
75490 + struct store_wmap_params *params = data;
75491 +
75492 + if (params->idx >= params->capacity) {
75493 + /* a new wander record should be taken from the tx_list */
75494 + params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
75495 + assert("zam-454",
75496 + params->tx_list != &params->cur->capture_link);
75497 +
75498 + params->idx = 0;
75499 + }
75500 +
75501 + store_entry(params->cur, params->idx, a, b);
75502 + params->idx++;
75503 +
75504 + return 0;
75505 +}
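/* An illustrative sketch of how the actor above might be driven
 * (example_store_wandered_map is hypothetical); it mirrors the
 * blocknr_set_iterator() call used by dealloc_wmap() below. Starting
 * params.cur at the record after the tx head, and passing 0 ("do not delete
 * entries") as the last argument, are assumptions. */
static void example_store_wandered_map(struct commit_handle *ch)
{
	struct store_wmap_params params;

	/* start filling entries right after the tx head record */
	params.cur = list_entry(ch->tx_list.next->next, jnode, capture_link);
	params.idx = 0;
	params.capacity = wander_record_capacity(ch->super);
	ON_DEBUG(params.tx_list = &ch->tx_list);

	blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
			     store_wmap_actor, &params, 0);
}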
75506 +
75507 +/* This function is called after the relocate set gets written to disk, the
75508 + overwrite set is written to wandered locations and all wander records are
75509 + written as well. The updated journal header block contains a pointer (block
75510 + number) to the first wander record of the just written transaction */
75511 +static int update_journal_header(struct commit_handle *ch, int use_barrier)
75512 +{
75513 + struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75514 + jnode *jh = sbinfo->journal_header;
75515 + jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
75516 + int ret;
75517 +
75518 + format_journal_header(ch);
75519 +
75520 + ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
75521 + use_barrier ? WRITEOUT_BARRIER : 0);
75522 + if (ret)
75523 + return ret;
75524 +
75525 + /* blk_run_address_space(sbinfo->fake->i_mapping);
75526 + * blk_run_queues(); */
75527 +
75528 + ret = jwait_io(jh, WRITE);
75529 +
75530 + if (ret)
75531 + return ret;
75532 +
75533 + sbinfo->last_committed_tx = *jnode_get_block(head);
75534 +
75535 + return 0;
75536 +}
75537 +
75538 +/* This function is called after write-back is finished. We update journal
75539 + footer block and free blocks which were occupied by wandered blocks and
75540 + transaction wander records */
75541 +static int update_journal_footer(struct commit_handle *ch, int use_barrier)
75542 +{
75543 + reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75544 +
75545 + jnode *jf = sbinfo->journal_footer;
75546 +
75547 + int ret;
75548 +
75549 + format_journal_footer(ch);
75550 +
75551 + ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
75552 + use_barrier ? WRITEOUT_BARRIER : 0);
75553 + if (ret)
75554 + return ret;
75555 +
75556 + /* blk_run_address_space(sbinfo->fake->i_mapping);
75557 + * blk_run_queue(); */
75558 +
75559 + ret = jwait_io(jf, WRITE);
75560 + if (ret)
75561 + return ret;
75562 +
75563 + return 0;
75564 +}
75565 +
75566 +/* free block numbers of wander records of an already written-in-place transaction */
75567 +static void dealloc_tx_list(struct commit_handle *ch)
75568 +{
75569 + while (!list_empty(&ch->tx_list)) {
75570 + jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
75571 + list_del(&cur->capture_link);
75572 + ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
75573 + reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
75574 + BA_FORMATTED);
75575 +
75576 + unpin_jnode_data(cur);
75577 + reiser4_drop_io_head(cur);
75578 + }
75579 +}
75580 +
75581 +/* An actor for use in the blocknr_set_iterator() routine which frees wandered
75582 + blocks from atom's overwrite set. */
75583 +static int
75584 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
75585 + const reiser4_block_nr * a UNUSED_ARG,
75586 + const reiser4_block_nr * b, void *data UNUSED_ARG)
75587 +{
75588 +
75589 + assert("zam-499", b != NULL);
75590 + assert("zam-500", *b != 0);
75591 + assert("zam-501", !reiser4_blocknr_is_fake(b));
75592 +
75593 + reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
75594 + return 0;
75595 +}
75596 +
75597 +/* free wandered block locations of an already written-in-place transaction */
75598 +static void dealloc_wmap(struct commit_handle *ch)
75599 +{
75600 + assert("zam-696", ch->atom != NULL);
75601 +
75602 + blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
75603 + dealloc_wmap_actor, NULL, 1);
75604 +}
75605 +
75606 +/* helper function for wandered block allocation, which refills the set of
75607 + block numbers needed for wandered blocks */
75608 +static int
75609 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
75610 +{
75611 + reiser4_blocknr_hint hint;
75612 + int ret;
75613 +
75614 + reiser4_block_nr wide_len = count;
75615 +
75616 + /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
75617 + ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
75618 + reserved allocation area so as to get the best qualities of fixed
75619 + journals? */
75620 + reiser4_blocknr_hint_init(&hint);
75621 + hint.block_stage = BLOCK_GRABBED;
75622 +
75623 + ret = reiser4_alloc_blocks(&hint, start, &wide_len,
75624 + BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
75625 + *len = (int)wide_len;
75626 +
75627 + return ret;
75628 +}
75629 +
75630 +/*
75631 + * roll back changes made before issuing BIO in the case of IO error.
75632 + */
75633 +static void undo_bio(struct bio *bio)
75634 +{
75635 + int i;
75636 +
75637 + for (i = 0; i < bio->bi_vcnt; ++i) {
75638 + struct page *pg;
75639 + jnode *node;
75640 +
75641 + pg = bio->bi_io_vec[i].bv_page;
75642 + end_page_writeback(pg);
75643 + node = jprivate(pg);
75644 + spin_lock_jnode(node);
75645 + JF_CLR(node, JNODE_WRITEBACK);
75646 + JF_SET(node, JNODE_DIRTY);
75647 + spin_unlock_jnode(node);
75648 + }
75649 + bio_put(bio);
75650 +}
75651 +
75652 +/* put overwrite set back to atom's clean list */
75653 +static void put_overwrite_set(struct commit_handle *ch)
75654 +{
75655 + jnode *cur;
75656 +
75657 + list_for_each_entry(cur, ch->overwrite_set, capture_link)
75658 + jrelse_tail(cur);
75659 +}
75660 +
75661 +/* Count overwrite set size, grab disk space for wandered block allocation.
75662 + Since we have a separate list for the atom's overwrite set we just scan the
75663 + list and count bitmap and other non-leaf nodes for whose wandered block
75664 + allocation we have to grab space. */
75665 +static int get_overwrite_set(struct commit_handle *ch)
75666 +{
75667 + int ret;
75668 + jnode *cur;
75669 + __u64 nr_not_leaves = 0;
75670 +#if REISER4_DEBUG
75671 + __u64 nr_formatted_leaves = 0;
75672 + __u64 nr_unformatted_leaves = 0;
75673 +#endif
75674 +
75675 + assert("zam-697", ch->overwrite_set_size == 0);
75676 +
75677 + ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
75678 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75679 +
75680 + while (ch->overwrite_set != &cur->capture_link) {
75681 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
75682 +
75683 + /* Count bitmap blocks to get correct statistics on how many
75684 + * blocks were cleared by the transaction commit. */
75685 + if (jnode_get_type(cur) == JNODE_BITMAP)
75686 + ch->nr_bitmap++;
75687 +
75688 + assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
75689 + || jnode_get_type(cur) == JNODE_BITMAP);
75690 +
75691 + if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
75692 + /* we replace fake znode by another (real)
75693 + znode which is suggested by disk_layout
75694 + plugin */
75695 +
75696 + /* FIXME: it looks like fake znode should be
75697 + replaced by jnode supplied by
75698 + disk_layout. */
75699 +
75700 + struct super_block *s = reiser4_get_current_sb();
75701 + reiser4_super_info_data *sbinfo =
75702 + get_current_super_private();
75703 +
75704 + if (sbinfo->df_plug->log_super) {
75705 + jnode *sj = sbinfo->df_plug->log_super(s);
75706 +
75707 + assert("zam-593", sj != NULL);
75708 +
75709 + if (IS_ERR(sj))
75710 + return PTR_ERR(sj);
75711 +
75712 + spin_lock_jnode(sj);
75713 + JF_SET(sj, JNODE_OVRWR);
75714 + insert_into_atom_ovrwr_list(ch->atom, sj);
75715 + spin_unlock_jnode(sj);
75716 +
75717 + /* jload it as the rest of overwrite set */
75718 + jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
75719 +
75720 + ch->overwrite_set_size++;
75721 + }
75722 + spin_lock_jnode(cur);
75723 + reiser4_uncapture_block(cur);
75724 + jput(cur);
75725 +
75726 + } else {
75727 + int ret;
75728 + ch->overwrite_set_size++;
75729 + ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
75730 + if (ret)
75731 + reiser4_panic("zam-783",
75732 + "cannot load e-flushed jnode back (ret = %d)\n",
75733 + ret);
75734 + }
75735 +
75736 + /* Count non-leaf nodes here because we have to grab disk space
75737 + * for wandered blocks. They were not counted as "flush
75738 + * reserved". Counting should be done _after_ nodes are pinned
75739 + * into memory by jload(). */
75740 + if (!jnode_is_leaf(cur))
75741 + nr_not_leaves++;
75742 + else {
75743 +#if REISER4_DEBUG
75744 + /* at this point @cur either has JNODE_FLUSH_RESERVED
75745 + * or is eflushed. Locking is not strong enough to
75746 + * write an assertion checking for this. */
75747 + if (jnode_is_znode(cur))
75748 + nr_formatted_leaves++;
75749 + else
75750 + nr_unformatted_leaves++;
75751 +#endif
75752 + JF_CLR(cur, JNODE_FLUSH_RESERVED);
75753 + }
75754 +
75755 + cur = next;
75756 + }
75757 +
75758 + /* Grab space for writing (wandered blocks of) the non-leaf nodes
75759 + * found in the overwrite set. */
75760 + ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
75761 + if (ret)
75762 + return ret;
75763 +
75764 + /* Disk space for allocation of wandered blocks of leaf nodes already
75765 + * reserved as "flush reserved", move it to grabbed space counter. */
75766 + spin_lock_atom(ch->atom);
75767 + assert("zam-940",
75768 + nr_formatted_leaves + nr_unformatted_leaves <=
75769 + ch->atom->flush_reserved);
75770 + flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
75771 + spin_unlock_atom(ch->atom);
75772 +
75773 + return ch->overwrite_set_size;
75774 +}
75775 +
75776 +/**
75777 + * write_jnodes_to_disk_extent - submit write request
75778 + * @first: first jnode of the list
75779 + * @nr: number of jnodes on the list
75780 + * @block_p: starting block number of the disk region to write to
75781 + * @fq: flush queue to account the submitted jnodes to, or NULL
75782 + * @flags: used to decide whether page is to get PG_reclaim flag
75783 + *
75784 + * Submits a write request for @nr jnodes beginning from @first; the
75785 + * other jnodes follow @first on the doubly linked "capture" list. All
75786 + * jnodes will be written to the disk region of @nr blocks starting with
75787 + * @block_p block
75788 + * number. If @fq is not NULL it means that waiting for i/o completion will be
75789 + * done more efficiently by using flush_queue_t objects.
75790 + * This function is the one which writes list of jnodes in batch mode. It does
75791 + * all low-level things as bio construction and page states manipulation.
75792 + *
75793 + * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
75794 + * aggregated in this function instead of being left to the layers below
75795 + *
75796 + * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
75797 + * Why that layer needed? Why BIOs cannot be constructed here?
75798 + */
75799 +static int write_jnodes_to_disk_extent(
75800 + jnode *first, int nr, const reiser4_block_nr *block_p,
75801 + flush_queue_t *fq, int flags)
75802 +{
75803 + struct super_block *super = reiser4_get_current_sb();
75804 + int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
75805 + int max_blocks;
75806 + jnode *cur = first;
75807 + reiser4_block_nr block;
75808 +
75809 + assert("zam-571", first != NULL);
75810 + assert("zam-572", block_p != NULL);
75811 + assert("zam-570", nr > 0);
75812 +
75813 + block = *block_p;
75814 + max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
75815 +
75816 + while (nr > 0) {
75817 + struct bio *bio;
75818 + int nr_blocks = min(nr, max_blocks);
75819 + int i;
75820 + int nr_used;
75821 +
75822 + bio = bio_alloc(GFP_NOIO, nr_blocks);
75823 + if (!bio)
75824 + return RETERR(-ENOMEM);
75825 +
75826 + bio->bi_bdev = super->s_bdev;
75827 + bio->bi_sector = block * (super->s_blocksize >> 9);
75828 + for (nr_used = 0, i = 0; i < nr_blocks; i++) {
75829 + struct page *pg;
75830 +
75831 + pg = jnode_page(cur);
75832 + assert("zam-573", pg != NULL);
75833 +
75834 + page_cache_get(pg);
75835 +
75836 + lock_and_wait_page_writeback(pg);
75837 +
75838 + if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
75839 + /*
75840 + * underlying device is satiated. Stop adding
75841 + * pages to the bio.
75842 + */
75843 + unlock_page(pg);
75844 + page_cache_release(pg);
75845 + break;
75846 + }
75847 +
75848 + spin_lock_jnode(cur);
75849 + assert("nikita-3166",
75850 + pg->mapping == jnode_get_mapping(cur));
75851 + assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
75852 +#if REISER4_DEBUG
75853 + spin_lock(&cur->load);
75854 + assert("nikita-3165", !jnode_is_releasable(cur));
75855 + spin_unlock(&cur->load);
75856 +#endif
75857 + JF_SET(cur, JNODE_WRITEBACK);
75858 + JF_CLR(cur, JNODE_DIRTY);
75859 + ON_DEBUG(cur->written++);
75860 + spin_unlock_jnode(cur);
75861 +
75862 + ClearPageError(pg);
75863 + set_page_writeback(pg);
75864 +
75865 + if (get_current_context()->entd) {
75866 + /* this is ent thread */
75867 + entd_context *ent = get_entd_context(super);
75868 + struct wbq *rq, *next;
75869 +
75870 + spin_lock(&ent->guard);
75871 +
75872 + if (pg == ent->cur_request->page) {
75873 + /*
75874 + * entd is called for this page. This
75875 +				 * request is not in the todo list
75876 + */
75877 + ent->cur_request->written = 1;
75878 + } else {
75879 + /*
75880 +				 * if we have written a page for which writepage
75881 +				 * was called, move the request to another list.
75882 + */
75883 + list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
75884 + assert("", rq->magic == WBQ_MAGIC);
75885 + if (pg == rq->page) {
75886 + /*
75887 + * remove request from
75888 + * entd's queue, but do
75889 + * not wake up a thread
75890 + * which put this
75891 + * request
75892 + */
75893 + list_del_init(&rq->link);
75894 + ent->nr_todo_reqs --;
75895 + list_add_tail(&rq->link, &ent->done_list);
75896 + ent->nr_done_reqs ++;
75897 + rq->written = 1;
75898 + break;
75899 + }
75900 + }
75901 + }
75902 + spin_unlock(&ent->guard);
75903 + }
75904 +
75905 + clear_page_dirty_for_io(pg);
75906 +
75907 + unlock_page(pg);
75908 +
75909 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
75910 + nr_used++;
75911 + }
75912 + if (nr_used > 0) {
75913 + assert("nikita-3453",
75914 + bio->bi_size == super->s_blocksize * nr_used);
75915 + assert("nikita-3454", bio->bi_vcnt == nr_used);
75916 +
75917 + /* Check if we are allowed to write at all */
75918 + if (super->s_flags & MS_RDONLY)
75919 + undo_bio(bio);
75920 + else {
75921 + int not_supported;
75922 +
75923 + add_fq_to_bio(fq, bio);
75924 + bio_get(bio);
75925 + reiser4_submit_bio(write_op, bio);
75926 + not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
75927 + bio_put(bio);
75928 + if (not_supported)
75929 + return -EOPNOTSUPP;
75930 + }
75931 +
75932 + block += nr_used - 1;
75933 + update_blocknr_hint_default(super, &block);
75934 + block += 1;
75935 + } else {
75936 + bio_put(bio);
75937 + }
75938 + nr -= nr_used;
75939 + }
75940 +
75941 + return 0;
75942 +}
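The chunking above is independent of the bio machinery itself: each pass submits at
most max_blocks blocks and advances the start block until the extent is exhausted.
A minimal userspace sketch of that loop, where submit_chunk() is an invented
stand-in for bio construction and submission, not a kernel call:

        #include <stdio.h>

        /* hypothetical stand-in for building and submitting one bio */
        static void submit_chunk(unsigned long start_block, int nr_blocks)
        {
                printf("bio: blocks %lu..%lu\n", start_block,
                       start_block + nr_blocks - 1);
        }

        /* write `nr` consecutive blocks starting at `block`, at most
         * `max_blocks` per bio -- the same loop shape as
         * write_jnodes_to_disk_extent() */
        static void write_extent(unsigned long block, int nr, int max_blocks)
        {
                while (nr > 0) {
                        int chunk = nr < max_blocks ? nr : max_blocks;

                        submit_chunk(block, chunk);
                        block += chunk;
                        nr -= chunk;
                }
        }

        int main(void)
        {
                write_extent(1000, 10, 4);      /* 3 bios: 4 + 4 + 2 blocks */
                return 0;
        }

The kernel version additionally drops pages a saturated device refused
(bio_add_page() failure) back into the next iteration via nr_used.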
75943 +
75944 +/* This is a procedure which recovers contiguous sequences of disk block
75945 +   numbers in the given list of jnodes and submits write requests on a
75946 +   per-sequence basis */
75947 +int
75948 +write_jnode_list(struct list_head *head, flush_queue_t *fq,
75949 + long *nr_submitted, int flags)
75950 +{
75951 + int ret;
75952 + jnode *beg = list_entry(head->next, jnode, capture_link);
75953 +
75954 + while (head != &beg->capture_link) {
75955 + int nr = 1;
75956 + jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
75957 +
75958 + while (head != &cur->capture_link) {
75959 + if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
75960 + break;
75961 + ++nr;
75962 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
75963 + }
75964 +
75965 + ret = write_jnodes_to_disk_extent(
75966 + beg, nr, jnode_get_block(beg), fq, flags);
75967 + if (ret)
75968 + return ret;
75969 +
75970 + if (nr_submitted)
75971 + *nr_submitted += nr;
75972 +
75973 + beg = cur;
75974 + }
75975 +
75976 + return 0;
75977 +}
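The run detection above grows a run while the next jnode's block number is
exactly one past the previous, then submits that run as a single extent. The
same logic over a plain array of block numbers, a self-contained sketch
(assuming the numbers arrive in capture-list order, as they do above):

        #include <stdio.h>

        /* split a sequence of block numbers into maximal contiguous runs,
         * mirroring the inner loop of write_jnode_list() */
        static void submit_runs(const unsigned long *blocks, int count)
        {
                int i = 0;

                while (i < count) {
                        int nr = 1;

                        while (i + nr < count &&
                               blocks[i + nr] == blocks[i] + nr)
                                ++nr;
                        printf("extent: start=%lu len=%d\n", blocks[i], nr);
                        i += nr;
                }
        }

        int main(void)
        {
                unsigned long blocks[] = { 7, 8, 9, 20, 21, 40 };

                submit_runs(blocks, 6); /* runs: 7..9, 20..21, 40 */
                return 0;
        }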
75978 +
75979 +/* add given wandered mapping to atom's wandered map */
75980 +static int
75981 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
75982 +{
75983 + int ret;
75984 + blocknr_set_entry *new_bsep = NULL;
75985 + reiser4_block_nr block;
75986 +
75987 + txn_atom *atom;
75988 +
75989 + assert("zam-568", block_p != NULL);
75990 + block = *block_p;
75991 + assert("zam-569", len > 0);
75992 +
75993 + while ((len--) > 0) {
75994 + do {
75995 + atom = get_current_atom_locked();
75996 + assert("zam-536",
75997 + !reiser4_blocknr_is_fake(jnode_get_block(cur)));
75998 + ret =
75999 + blocknr_set_add_pair(atom, &atom->wandered_map,
76000 + &new_bsep,
76001 + jnode_get_block(cur), &block);
76002 + } while (ret == -E_REPEAT);
76003 +
76004 + if (ret) {
76005 + /* deallocate blocks which were not added to wandered
76006 + map */
76007 + reiser4_block_nr wide_len = len;
76008 +
76009 + reiser4_dealloc_blocks(&block, &wide_len,
76010 + BLOCK_NOT_COUNTED,
76011 + BA_FORMATTED
76012 + /* formatted, without defer */ );
76013 +
76014 + return ret;
76015 + }
76016 +
76017 + spin_unlock_atom(atom);
76018 +
76019 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
76020 + ++block;
76021 + }
76022 +
76023 + return 0;
76024 +}
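Conceptually the wandered map built here is just a set of (original, wandered)
pairs: for a region of len blocks starting at `block`, the i-th overwrite-set
jnode keeps its original location and is assigned `block + i` as its wandered
location. A toy model of that pairing; the fixed-size array is an illustrative
stand-in for the atom's blocknr_set, not the real structure:

        #include <stdio.h>

        struct wmap_pair {
                unsigned long original; /* block to be overwritten in place */
                unsigned long wandered; /* temporary location written at commit */
        };

        /* record a contiguous wandered region for `len` original blocks,
         * as add_region_to_wmap() does one pair at a time */
        static int map_region(struct wmap_pair *map, int idx,
                              const unsigned long *originals, int len,
                              unsigned long wandered_start)
        {
                int i;

                for (i = 0; i < len; i++) {
                        map[idx].original = originals[i];
                        map[idx].wandered = wandered_start + i;
                        idx++;
                }
                return idx;
        }

        int main(void)
        {
                struct wmap_pair map[8];
                unsigned long originals[] = { 130, 42, 977 };
                int n = map_region(map, 0, originals, 3, 5000);

                for (int i = 0; i < n; i++)
                        printf("%lu -> %lu\n", map[i].original, map[i].wandered);
                return 0;
        }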
76025 +
76026 +/* Allocate wandered blocks for the current atom's OVERWRITE SET and
76027 +   immediately submit IO for them. We assume the current atom is in a stage
76028 +   where atom fusion is impossible, so it is safe to leave the atom unlocked. */
76029 +static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
76030 +{
76031 + reiser4_block_nr block;
76032 +
76033 + int rest;
76034 + int len;
76035 + int ret;
76036 +
76037 + jnode *cur;
76038 +
76039 + assert("zam-534", ch->overwrite_set_size > 0);
76040 +
76041 + rest = ch->overwrite_set_size;
76042 +
76043 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
76044 + while (ch->overwrite_set != &cur->capture_link) {
76045 + assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
76046 +
76047 + ret = get_more_wandered_blocks(rest, &block, &len);
76048 + if (ret)
76049 + return ret;
76050 +
76051 + rest -= len;
76052 +
76053 + ret = add_region_to_wmap(cur, len, &block);
76054 + if (ret)
76055 + return ret;
76056 +
76057 + ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
76058 + if (ret)
76059 + return ret;
76060 +
76061 + while ((len--) > 0) {
76062 + assert("zam-604",
76063 + ch->overwrite_set != &cur->capture_link);
76064 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
76065 + }
76066 + }
76067 +
76068 + return 0;
76069 +}
76070 +
76071 +/* allocate the given number of nodes over the journal area and link them
76072 +   into the commit handle's tx_list */
76073 +static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
76074 +{
76075 + reiser4_blocknr_hint hint;
76076 + reiser4_block_nr allocated = 0;
76077 + reiser4_block_nr first, len;
76078 + jnode *cur;
76079 + jnode *txhead;
76080 + int ret;
76081 + reiser4_context *ctx;
76082 + reiser4_super_info_data *sbinfo;
76083 +
76084 + assert("zam-698", ch->tx_size > 0);
76085 + assert("zam-699", list_empty_careful(&ch->tx_list));
76086 +
76087 + ctx = get_current_context();
76088 + sbinfo = get_super_private(ctx->super);
76089 +
76090 + while (allocated < (unsigned)ch->tx_size) {
76091 + len = (ch->tx_size - allocated);
76092 +
76093 + reiser4_blocknr_hint_init(&hint);
76094 +
76095 + hint.block_stage = BLOCK_GRABBED;
76096 +
76097 + /* FIXME: there should be some block allocation policy for
76098 + nodes which contain wander records */
76099 +
76100 +		/* We assume that disk space for wander record blocks can be
76101 +		 * taken from the reserved area. */
76102 + ret = reiser4_alloc_blocks(&hint, &first, &len,
76103 + BA_FORMATTED | BA_RESERVED |
76104 + BA_USE_DEFAULT_SEARCH_START);
76105 + reiser4_blocknr_hint_done(&hint);
76106 +
76107 + if (ret)
76108 + return ret;
76109 +
76110 + allocated += len;
76111 +
76112 + /* create jnodes for all wander records */
76113 + while (len--) {
76114 + cur = reiser4_alloc_io_head(&first);
76115 +
76116 + if (cur == NULL) {
76117 + ret = RETERR(-ENOMEM);
76118 + goto free_not_assigned;
76119 + }
76120 +
76121 + ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
76122 +
76123 + if (ret != 0) {
76124 + jfree(cur);
76125 + goto free_not_assigned;
76126 + }
76127 +
76128 + pin_jnode_data(cur);
76129 +
76130 + list_add_tail(&cur->capture_link, &ch->tx_list);
76131 +
76132 + first++;
76133 + }
76134 + }
76135 +
76136 +	{ /* format an on-disk linked list of wander records */
76137 + int serial = 1;
76138 +
76139 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
76140 + format_tx_head(ch);
76141 +
76142 + cur = list_entry(txhead->capture_link.next, jnode, capture_link);
76143 + while (&ch->tx_list != &cur->capture_link) {
76144 + format_wander_record(ch, cur, serial++);
76145 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
76146 + }
76147 + }
76148 +
76149 + { /* Fill wander records with Wandered Set */
76150 + struct store_wmap_params params;
76151 + txn_atom *atom;
76152 +
76153 + params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
76154 +
76155 + params.idx = 0;
76156 + params.capacity =
76157 + wander_record_capacity(reiser4_get_current_sb());
76158 +
76159 + atom = get_current_atom_locked();
76160 + blocknr_set_iterator(atom, &atom->wandered_map,
76161 + &store_wmap_actor, &params, 0);
76162 + spin_unlock_atom(atom);
76163 + }
76164 +
76165 +	{ /* jrelse all jnodes from tx_list */
76166 + cur = list_entry(ch->tx_list.next, jnode, capture_link);
76167 + while (&ch->tx_list != &cur->capture_link) {
76168 + jrelse(cur);
76169 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
76170 + }
76171 + }
76172 +
76173 + ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
76174 +
76175 + return ret;
76176 +
76177 + free_not_assigned:
76178 +	/* We deallocate blocks not yet assigned to jnodes on tx_list. The
76179 +	   caller takes care of invalidating the tx list */
76180 + reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
76181 +
76182 + return ret;
76183 +}
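alloc_tx() formats the transaction as an on-disk singly linked list: the tx
head first, then each wander record stamped with an increasing serial number
and the block number of its successor. Judging by the replay loop in
replay_transaction() below, which stops when next_block comes back around to
the tx head's block, the final record links back to the head. A toy sketch of
that numbering-and-linking pass; the block numbers are invented:

        #include <stdio.h>

        struct rec {
                int serial;             /* position in the transaction */
                unsigned long next;     /* block number of the next record */
        };

        /* number a transaction's wander records and chain them, with the
         * last record pointing back at the tx head block */
        static void format_tx(unsigned long head_block,
                              const unsigned long *rec_blocks,
                              struct rec *recs, int nr)
        {
                for (int i = 0; i < nr; i++) {
                        recs[i].serial = i + 1;
                        recs[i].next = (i + 1 < nr) ? rec_blocks[i + 1]
                                                    : head_block;
                }
        }

        int main(void)
        {
                unsigned long head = 500, blocks[] = { 501, 502, 503 };
                struct rec recs[3];

                format_tx(head, blocks, recs, 3);
                for (int i = 0; i < 3; i++)
                        printf("record %d at %lu -> next %lu\n",
                               recs[i].serial, blocks[i], recs[i].next);
                return 0;
        }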
76184 +
76185 +static int commit_tx(struct commit_handle *ch)
76186 +{
76187 + flush_queue_t *fq;
76188 + int barrier;
76189 + int ret;
76190 +
76191 + /* Grab more space for wandered records. */
76192 + ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
76193 + if (ret)
76194 + return ret;
76195 +
76196 + fq = get_fq_for_current_atom();
76197 + if (IS_ERR(fq))
76198 + return PTR_ERR(fq);
76199 +
76200 + spin_unlock_atom(fq->atom);
76201 + do {
76202 + ret = alloc_wandered_blocks(ch, fq);
76203 + if (ret)
76204 + break;
76205 + ret = alloc_tx(ch, fq);
76206 + if (ret)
76207 + break;
76208 + } while (0);
76209 +
76210 + reiser4_fq_put(fq);
76211 + if (ret)
76212 + return ret;
76213 + repeat_wo_barrier:
76214 + barrier = reiser4_use_write_barrier(ch->super);
76215 + if (!barrier) {
76216 + ret = current_atom_finish_all_fq();
76217 + if (ret)
76218 + return ret;
76219 + }
76220 + ret = update_journal_header(ch, barrier);
76221 + if (barrier) {
76222 + if (ret) {
76223 + if (ret == -EOPNOTSUPP) {
76224 + disable_write_barrier(ch->super);
76225 + goto repeat_wo_barrier;
76226 + }
76227 + return ret;
76228 + }
76229 + ret = current_atom_finish_all_fq();
76230 + }
76231 + return ret;
76232 +}
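commit_tx() and write_tx_back() share the same barrier fallback shape: attempt
the journal update with a write barrier, and if the device reports
-EOPNOTSUPP, disable barriers for this super block and redo the update the
ordinary wait-for-all-flush-queues way. The retry pattern in isolation, a
minimal sketch where do_update() is an invented placeholder for the journal
write:

        #include <errno.h>
        #include <stdio.h>

        static int barriers_enabled = 1;

        /* pretend the device rejects barrier writes */
        static int do_update(int barrier)
        {
                return barrier ? -EOPNOTSUPP : 0;
        }

        /* same control flow as the repeat_wo_barrier: label in commit_tx() */
        static int update_with_fallback(void)
        {
                int ret;

                for (;;) {
                        int barrier = barriers_enabled;

                        ret = do_update(barrier);
                        if (barrier && ret == -EOPNOTSUPP) {
                                barriers_enabled = 0;   /* disable and retry */
                                continue;
                        }
                        return ret;
                }
        }

        int main(void)
        {
                printf("ret=%d barriers=%d\n", update_with_fallback(),
                       barriers_enabled);
                return 0;
        }

The kernel code additionally finishes all flush queues before (non-barrier
case) or after (barrier case) the journal block write; the sketch omits that.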
76233 +
76234 +static int write_tx_back(struct commit_handle * ch)
76235 +{
76236 + flush_queue_t *fq;
76237 + int ret;
76238 + int barrier;
76239 +
76240 + reiser4_post_commit_hook();
76241 + fq = get_fq_for_current_atom();
76242 + if (IS_ERR(fq))
76243 + return PTR_ERR(fq);
76244 + spin_unlock_atom(fq->atom);
76245 + ret = write_jnode_list(
76246 + ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
76247 + reiser4_fq_put(fq);
76248 + if (ret)
76249 + return ret;
76250 + repeat_wo_barrier:
76251 + barrier = reiser4_use_write_barrier(ch->super);
76252 + if (!barrier) {
76253 + ret = current_atom_finish_all_fq();
76254 + if (ret)
76255 + return ret;
76256 + }
76257 + ret = update_journal_footer(ch, barrier);
76258 + if (barrier) {
76259 + if (ret) {
76260 + if (ret == -EOPNOTSUPP) {
76261 + disable_write_barrier(ch->super);
76262 + goto repeat_wo_barrier;
76263 + }
76264 + return ret;
76265 + }
76266 + ret = current_atom_finish_all_fq();
76267 + }
76268 + if (ret)
76269 + return ret;
76270 + reiser4_post_write_back_hook();
76271 + return 0;
76272 +}
76273 +
76274 +/* We assume that at this moment all captured blocks are marked as RELOC or
76275 +   WANDER (belong to the Relocate or Overwrite set), and all nodes from the
76276 +   Relocate set have been submitted for writing.
76277 +*/
76278 +
76279 +int reiser4_write_logs(long *nr_submitted)
76280 +{
76281 + txn_atom *atom;
76282 + struct super_block *super = reiser4_get_current_sb();
76283 + reiser4_super_info_data *sbinfo = get_super_private(super);
76284 + struct commit_handle ch;
76285 + int ret;
76286 +
76287 + writeout_mode_enable();
76288 +
76289 + /* block allocator may add j-nodes to the clean_list */
76290 + ret = reiser4_pre_commit_hook();
76291 + if (ret)
76292 + return ret;
76293 +
76294 +	/* No locks are required if we take an atom whose stage is >=
76295 +	 * ASTAGE_PRE_COMMIT */
76296 + atom = get_current_context()->trans->atom;
76297 + assert("zam-965", atom != NULL);
76298 +
76299 + /* relocate set is on the atom->clean_nodes list after
76300 + * current_atom_complete_writes() finishes. It can be safely
76301 + * uncaptured after commit_mutex is locked, because any atom that
76302 + * captures these nodes is guaranteed to commit after current one.
76303 + *
76304 + * This can only be done after reiser4_pre_commit_hook(), because it is where
76305 + * early flushed jnodes with CREATED bit are transferred to the
76306 + * overwrite list. */
76307 + reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
76308 + spin_lock_atom(atom);
76309 + /* There might be waiters for the relocate nodes which we have
76310 + * released, wake them up. */
76311 + reiser4_atom_send_event(atom);
76312 + spin_unlock_atom(atom);
76313 +
76314 + if (REISER4_DEBUG) {
76315 + int level;
76316 +
76317 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
76318 + assert("nikita-3352",
76319 + list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
76320 + }
76321 +
76322 + sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
76323 + sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
76324 +
76325 + init_commit_handle(&ch, atom);
76326 +
76327 + ch.free_blocks = sbinfo->blocks_free_committed;
76328 + ch.nr_files = sbinfo->nr_files_committed;
76329 + /* ZAM-FIXME-HANS: email me what the contention level is for the super
76330 + * lock. */
76331 + ch.next_oid = oid_next(super);
76332 +
76333 + /* count overwrite set and place it in a separate list */
76334 + ret = get_overwrite_set(&ch);
76335 +
76336 + if (ret <= 0) {
76337 +		/* It is possible that the overwrite set is empty here; it means
76338 +		   all captured nodes are clean */
76339 + goto up_and_ret;
76340 + }
76341 +
76342 +	/* Inform the caller how many dirty pages will be
76343 +	 * submitted to disk. */
76344 + *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
76345 +
76346 + /* count all records needed for storing of the wandered set */
76347 + get_tx_size(&ch);
76348 +
76349 + ret = commit_tx(&ch);
76350 + if (ret)
76351 + goto up_and_ret;
76352 +
76353 + spin_lock_atom(atom);
76354 + reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
76355 + spin_unlock_atom(atom);
76356 +
76357 + ret = write_tx_back(&ch);
76358 + reiser4_post_write_back_hook();
76359 +
76360 + up_and_ret:
76361 + if (ret) {
76362 + /* there could be fq attached to current atom; the only way to
76363 + remove them is: */
76364 + current_atom_finish_all_fq();
76365 + }
76366 +
76367 + /* free blocks of flushed transaction */
76368 + dealloc_tx_list(&ch);
76369 + dealloc_wmap(&ch);
76370 +
76371 + put_overwrite_set(&ch);
76372 +
76373 + done_commit_handle(&ch);
76374 +
76375 + writeout_mode_disable();
76376 +
76377 + return ret;
76378 +}
76379 +
76380 +/* consistency checks for journal data/control blocks: header, footer, log
76381 +   records, transaction head blocks. All functions return zero on success. */
76382 +
76383 +static int check_journal_header(const jnode * node UNUSED_ARG)
76384 +{
76385 + /* FIXME: journal header has no magic field yet. */
76386 + return 0;
76387 +}
76388 +
76389 +/* wait for write completion for all jnodes from given list */
76390 +static int wait_on_jnode_list(struct list_head *head)
76391 +{
76392 + jnode *scan;
76393 + int ret = 0;
76394 +
76395 + list_for_each_entry(scan, head, capture_link) {
76396 + struct page *pg = jnode_page(scan);
76397 +
76398 + if (pg) {
76399 + if (PageWriteback(pg))
76400 + wait_on_page_writeback(pg);
76401 +
76402 + if (PageError(pg))
76403 + ret++;
76404 + }
76405 + }
76406 +
76407 + return ret;
76408 +}
76409 +
76410 +static int check_journal_footer(const jnode * node UNUSED_ARG)
76411 +{
76412 + /* FIXME: journal footer has no magic field yet. */
76413 + return 0;
76414 +}
76415 +
76416 +static int check_tx_head(const jnode * node)
76417 +{
76418 + struct tx_header *header = (struct tx_header *)jdata(node);
76419 +
76420 + if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
76421 + warning("zam-627", "tx head at block %s corrupted\n",
76422 + sprint_address(jnode_get_block(node)));
76423 + return RETERR(-EIO);
76424 + }
76425 +
76426 + return 0;
76427 +}
76428 +
76429 +static int check_wander_record(const jnode * node)
76430 +{
76431 + struct wander_record_header *RH =
76432 + (struct wander_record_header *)jdata(node);
76433 +
76434 + if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
76435 + 0) {
76436 + warning("zam-628", "wander record at block %s corrupted\n",
76437 + sprint_address(jnode_get_block(node)));
76438 + return RETERR(-EIO);
76439 + }
76440 +
76441 + return 0;
76442 +}
76443 +
76444 +/* fill the commit_handle structure with everything needed for update_journal_footer */
76445 +static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
76446 +{
76447 + struct tx_header *TXH;
76448 + int ret;
76449 +
76450 + ret = jload(tx_head);
76451 + if (ret)
76452 + return ret;
76453 +
76454 + TXH = (struct tx_header *)jdata(tx_head);
76455 +
76456 + ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
76457 + ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
76458 + ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
76459 +
76460 + jrelse(tx_head);
76461 +
76462 + list_add(&tx_head->capture_link, &ch->tx_list);
76463 +
76464 + return 0;
76465 +}
76466 +
76467 +/* replay one transaction: restore and write overwrite set in place */
76468 +static int replay_transaction(const struct super_block *s,
76469 + jnode * tx_head,
76470 + const reiser4_block_nr * log_rec_block_p,
76471 + const reiser4_block_nr * end_block,
76472 + unsigned int nr_wander_records)
76473 +{
76474 + reiser4_block_nr log_rec_block = *log_rec_block_p;
76475 + struct commit_handle ch;
76476 + LIST_HEAD(overwrite_set);
76477 + jnode *log;
76478 + int ret;
76479 +
76480 + init_commit_handle(&ch, NULL);
76481 + ch.overwrite_set = &overwrite_set;
76482 +
76483 + restore_commit_handle(&ch, tx_head);
76484 +
76485 + while (log_rec_block != *end_block) {
76486 + struct wander_record_header *header;
76487 + struct wander_entry *entry;
76488 +
76489 + int i;
76490 +
76491 + if (nr_wander_records == 0) {
76492 + warning("zam-631",
76493 + "number of wander records in the linked list"
76494 + " greater than number stored in tx head.\n");
76495 + ret = RETERR(-EIO);
76496 + goto free_ow_set;
76497 + }
76498 +
76499 + log = reiser4_alloc_io_head(&log_rec_block);
76500 + if (log == NULL)
76501 + return RETERR(-ENOMEM);
76502 +
76503 + ret = jload(log);
76504 + if (ret < 0) {
76505 + reiser4_drop_io_head(log);
76506 + return ret;
76507 + }
76508 +
76509 + ret = check_wander_record(log);
76510 + if (ret) {
76511 + jrelse(log);
76512 + reiser4_drop_io_head(log);
76513 + return ret;
76514 + }
76515 +
76516 + header = (struct wander_record_header *)jdata(log);
76517 + log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
76518 +
76519 + entry = (struct wander_entry *)(header + 1);
76520 +
76521 + /* restore overwrite set from wander record content */
76522 + for (i = 0; i < wander_record_capacity(s); i++) {
76523 + reiser4_block_nr block;
76524 + jnode *node;
76525 +
76526 + block = le64_to_cpu(get_unaligned(&entry->wandered));
76527 + if (block == 0)
76528 + break;
76529 +
76530 + node = reiser4_alloc_io_head(&block);
76531 + if (node == NULL) {
76532 + ret = RETERR(-ENOMEM);
76533 + /*
76534 + * FIXME-VS:???
76535 + */
76536 + jrelse(log);
76537 + reiser4_drop_io_head(log);
76538 + goto free_ow_set;
76539 + }
76540 +
76541 + ret = jload(node);
76542 +
76543 + if (ret < 0) {
76544 + reiser4_drop_io_head(node);
76545 + /*
76546 + * FIXME-VS:???
76547 + */
76548 + jrelse(log);
76549 + reiser4_drop_io_head(log);
76550 + goto free_ow_set;
76551 + }
76552 +
76553 + block = le64_to_cpu(get_unaligned(&entry->original));
76554 +
76555 + assert("zam-603", block != 0);
76556 +
76557 + jnode_set_block(node, &block);
76558 +
76559 + list_add_tail(&node->capture_link, ch.overwrite_set);
76560 +
76561 + ++entry;
76562 + }
76563 +
76564 + jrelse(log);
76565 + reiser4_drop_io_head(log);
76566 +
76567 + --nr_wander_records;
76568 + }
76569 +
76570 + if (nr_wander_records != 0) {
76571 + warning("zam-632", "number of wander records in the linked list"
76572 + " less than number stored in tx head.\n");
76573 + ret = RETERR(-EIO);
76574 + goto free_ow_set;
76575 + }
76576 +
76577 + { /* write wandered set in place */
76578 + write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
76579 + ret = wait_on_jnode_list(ch.overwrite_set);
76580 +
76581 + if (ret) {
76582 + ret = RETERR(-EIO);
76583 + goto free_ow_set;
76584 + }
76585 + }
76586 +
76587 + ret = update_journal_footer(&ch, 0);
76588 +
76589 + free_ow_set:
76590 +
76591 + while (!list_empty(ch.overwrite_set)) {
76592 + jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
76593 + list_del_init(&cur->capture_link);
76594 + jrelse(cur);
76595 + reiser4_drop_io_head(cur);
76596 + }
76597 +
76598 + list_del_init(&tx_head->capture_link);
76599 +
76600 + done_commit_handle(&ch);
76601 +
76602 + return ret;
76603 +}
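Within one wander record, the replay scans the entry array until it reaches
the record's capacity or hits a zero `wandered` field (unused space is
zero-filled), and arranges for each wandered block's contents to end up back
at the original location. A sketch of just that scan, with a byte-array "disk"
standing in for jnode I/O and a made-up 16-byte block size:

        #include <stdio.h>
        #include <string.h>

        struct entry {
                unsigned long original;
                unsigned long wandered;
        };

        #define BLK 16                  /* toy block size */

        /* copy each wandered block back over its original location; a zero
         * `wandered` terminates the record, as in replay_transaction() */
        static void replay_record(unsigned char *disk,
                                  const struct entry *e, int capacity)
        {
                for (int i = 0; i < capacity && e[i].wandered != 0; i++)
                        memcpy(disk + e[i].original * BLK,
                               disk + e[i].wandered * BLK, BLK);
        }

        int main(void)
        {
                unsigned char disk[8 * BLK] = { 0 };
                struct entry rec[4] = { { 1, 5 }, { 2, 6 }, { 0, 0 } };

                memcpy(disk + 5 * BLK, "new-block-1", 12);
                memcpy(disk + 6 * BLK, "new-block-2", 12);
                replay_record(disk, rec, 4);
                printf("%s %s\n", disk + 1 * BLK, disk + 2 * BLK);
                return 0;
        }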
76604 +
76605 +/* Find the oldest committed but not yet replayed transaction and replay it.
76606 + * Such a transaction was committed and the journal header block was updated,
76607 + * but writing the atom's overwrite set in place and updating the journal
76608 + * footer block were not completed. This function completes the process by
76609 + * recovering the atom's overwrite set from its wandered locations, writing
76610 + * it in place and updating the journal footer. */
76611 +static int replay_oldest_transaction(struct super_block *s)
76612 +{
76613 + reiser4_super_info_data *sbinfo = get_super_private(s);
76614 + jnode *jf = sbinfo->journal_footer;
76615 + unsigned int total;
76616 + struct journal_footer *F;
76617 + struct tx_header *T;
76618 +
76619 + reiser4_block_nr prev_tx;
76620 + reiser4_block_nr last_flushed_tx;
76621 + reiser4_block_nr log_rec_block = 0;
76622 +
76623 + jnode *tx_head;
76624 +
76625 + int ret;
76626 +
76627 + if ((ret = jload(jf)) < 0)
76628 + return ret;
76629 +
76630 + F = (struct journal_footer *)jdata(jf);
76631 +
76632 + last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
76633 +
76634 + jrelse(jf);
76635 +
76636 + if (sbinfo->last_committed_tx == last_flushed_tx) {
76637 + /* all transactions are replayed */
76638 + return 0;
76639 + }
76640 +
76641 + prev_tx = sbinfo->last_committed_tx;
76642 +
76643 + /* searching for oldest not flushed transaction */
76644 + while (1) {
76645 + tx_head = reiser4_alloc_io_head(&prev_tx);
76646 + if (!tx_head)
76647 + return RETERR(-ENOMEM);
76648 +
76649 + ret = jload(tx_head);
76650 + if (ret < 0) {
76651 + reiser4_drop_io_head(tx_head);
76652 + return ret;
76653 + }
76654 +
76655 + ret = check_tx_head(tx_head);
76656 + if (ret) {
76657 + jrelse(tx_head);
76658 + reiser4_drop_io_head(tx_head);
76659 + return ret;
76660 + }
76661 +
76662 + T = (struct tx_header *)jdata(tx_head);
76663 +
76664 + prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
76665 +
76666 + if (prev_tx == last_flushed_tx)
76667 + break;
76668 +
76669 + jrelse(tx_head);
76670 + reiser4_drop_io_head(tx_head);
76671 + }
76672 +
76673 + total = le32_to_cpu(get_unaligned(&T->total));
76674 + log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
76675 +
76676 + pin_jnode_data(tx_head);
76677 + jrelse(tx_head);
76678 +
76679 + ret =
76680 + replay_transaction(s, tx_head, &log_rec_block,
76681 + jnode_get_block(tx_head), total - 1);
76682 +
76683 + unpin_jnode_data(tx_head);
76684 + reiser4_drop_io_head(tx_head);
76685 +
76686 + if (ret)
76687 + return ret;
76688 + return -E_REPEAT;
76689 +}
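The search loop above walks the prev_tx chain backwards from the journal
header's last committed transaction until the predecessor equals the footer's
last flushed transaction; that tx head is the oldest unreplayed one, and the
-E_REPEAT return makes the caller loop until header and footer agree. The walk
modeled over an array-indexed "disk" of tx heads (block numbers are invented):

        #include <stdio.h>

        /* prev_tx pointers indexed by block number */
        static unsigned long prev_tx_of[16];

        /* walk backwards from `last_committed` until the transaction whose
         * predecessor is `last_flushed`; that is the oldest unreplayed tx */
        static unsigned long find_oldest(unsigned long last_committed,
                                         unsigned long last_flushed)
        {
                unsigned long tx = last_committed;

                while (prev_tx_of[tx] != last_flushed)
                        tx = prev_tx_of[tx];
                return tx;
        }

        int main(void)
        {
                /* committed chain: 3 <- 7 <- 9, footer says 3 is flushed */
                prev_tx_of[7] = 3;
                prev_tx_of[9] = 7;
                printf("replay first: %lu\n", find_oldest(9, 3)); /* 7 */
                return 0;
        }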
76690 +
76691 +/* The current implementation of the reiser4 journal is optimized not to
76692 +   capture the super block when only certain super block fields are modified.
76693 +   Currently, that set is (<free block count>, <OID allocator>). These fields
76694 +   are logged in a special way, which includes storing them in each transaction
76695 +   head block at atom commit time and writing that information to the journal
76696 +   footer block at atom flush time. To get this info from the journal footer
76697 +   block into the in-memory super block, there is a special function,
76698 +   reiser4_journal_recover_sb_data(), which should be called after the disk
76699 +   format plugin re-reads the super block after journal replay.
76700 +*/
76701 +
76702 +/* copy the information from the journal footer into the in-memory super block */
76703 +int reiser4_journal_recover_sb_data(struct super_block *s)
76704 +{
76705 + reiser4_super_info_data *sbinfo = get_super_private(s);
76706 + struct journal_footer *jf;
76707 + int ret;
76708 +
76709 + assert("zam-673", sbinfo->journal_footer != NULL);
76710 +
76711 + ret = jload(sbinfo->journal_footer);
76712 + if (ret != 0)
76713 + return ret;
76714 +
76715 + ret = check_journal_footer(sbinfo->journal_footer);
76716 + if (ret != 0)
76717 + goto out;
76718 +
76719 + jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
76720 +
76721 + /* was there at least one flushed transaction? */
76722 + if (jf->last_flushed_tx) {
76723 +
76724 + /* restore free block counter logged in this transaction */
76725 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
76726 +
76727 + /* restore oid allocator state */
76728 + oid_init_allocator(s,
76729 + le64_to_cpu(get_unaligned(&jf->nr_files)),
76730 + le64_to_cpu(get_unaligned(&jf->next_oid)));
76731 + }
76732 + out:
76733 + jrelse(sbinfo->journal_footer);
76734 + return ret;
76735 +}
76736 +
76737 +/* reiser4 journal replay procedure */
76738 +int reiser4_journal_replay(struct super_block *s)
76739 +{
76740 + reiser4_super_info_data *sbinfo = get_super_private(s);
76741 + jnode *jh, *jf;
76742 + struct journal_header *header;
76743 + int nr_tx_replayed = 0;
76744 + int ret;
76745 +
76746 + assert("zam-582", sbinfo != NULL);
76747 +
76748 + jh = sbinfo->journal_header;
76749 + jf = sbinfo->journal_footer;
76750 +
76751 + if (!jh || !jf) {
76752 + /* it is possible that disk layout does not support journal
76753 + structures, we just warn about this */
76754 + warning("zam-583",
76755 + "journal control blocks were not loaded by disk layout plugin. "
76756 + "journal replaying is not possible.\n");
76757 + return 0;
76758 + }
76759 +
76760 +	/* Take the free block count from the journal footer block. The free
76761 +	   block counter value corresponds to the last flushed transaction state */
76762 + ret = jload(jf);
76763 + if (ret < 0)
76764 + return ret;
76765 +
76766 + ret = check_journal_footer(jf);
76767 + if (ret) {
76768 + jrelse(jf);
76769 + return ret;
76770 + }
76771 +
76772 + jrelse(jf);
76773 +
76774 + /* store last committed transaction info in reiser4 in-memory super
76775 + block */
76776 + ret = jload(jh);
76777 + if (ret < 0)
76778 + return ret;
76779 +
76780 + ret = check_journal_header(jh);
76781 + if (ret) {
76782 + jrelse(jh);
76783 + return ret;
76784 + }
76785 +
76786 + header = (struct journal_header *)jdata(jh);
76787 + sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
76788 +
76789 + jrelse(jh);
76790 +
76791 + /* replay committed transactions */
76792 + while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
76793 + nr_tx_replayed++;
76794 +
76795 + return ret;
76796 +}
76797 +
76798 +/* load journal control block (either journal header or journal footer block) */
76799 +static int
76800 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
76801 +{
76802 + int ret;
76803 +
76804 + *node = reiser4_alloc_io_head(block);
76805 + if (!(*node))
76806 + return RETERR(-ENOMEM);
76807 +
76808 + ret = jload(*node);
76809 +
76810 + if (ret) {
76811 + reiser4_drop_io_head(*node);
76812 + *node = NULL;
76813 + return ret;
76814 + }
76815 +
76816 + pin_jnode_data(*node);
76817 + jrelse(*node);
76818 +
76819 + return 0;
76820 +}
76821 +
76822 +/* unload journal header or footer and free jnode */
76823 +static void unload_journal_control_block(jnode ** node)
76824 +{
76825 + if (*node) {
76826 + unpin_jnode_data(*node);
76827 + reiser4_drop_io_head(*node);
76828 + *node = NULL;
76829 + }
76830 +}
76831 +
76832 +/* release journal control blocks */
76833 +void reiser4_done_journal_info(struct super_block *s)
76834 +{
76835 + reiser4_super_info_data *sbinfo = get_super_private(s);
76836 +
76837 + assert("zam-476", sbinfo != NULL);
76838 +
76839 + unload_journal_control_block(&sbinfo->journal_header);
76840 + unload_journal_control_block(&sbinfo->journal_footer);
76841 + rcu_barrier();
76842 +}
76843 +
76844 +/* load journal control blocks */
76845 +int reiser4_init_journal_info(struct super_block *s)
76846 +{
76847 + reiser4_super_info_data *sbinfo = get_super_private(s);
76848 + journal_location *loc;
76849 + int ret;
76850 +
76851 + loc = &sbinfo->jloc;
76852 +
76853 + assert("zam-651", loc != NULL);
76854 + assert("zam-652", loc->header != 0);
76855 + assert("zam-653", loc->footer != 0);
76856 +
76857 + ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
76858 +
76859 + if (ret)
76860 + return ret;
76861 +
76862 + ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
76863 +
76864 + if (ret) {
76865 + unload_journal_control_block(&sbinfo->journal_header);
76866 + }
76867 +
76868 + return ret;
76869 +}
76870 +
76871 +/* Make Linus happy.
76872 + Local variables:
76873 + c-indentation-style: "K&R"
76874 + mode-name: "LC"
76875 + c-basic-offset: 8
76876 + tab-width: 8
76877 + fill-column: 80
76878 + End:
76879 +*/
76880 diff -urN linux-2.6.33.orig/fs/reiser4/wander.h linux-2.6.33/fs/reiser4/wander.h
76881 --- linux-2.6.33.orig/fs/reiser4/wander.h 1970-01-01 01:00:00.000000000 +0100
76882 +++ linux-2.6.33/fs/reiser4/wander.h 2010-03-04 19:33:22.000000000 +0100
76883 @@ -0,0 +1,135 @@
76884 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
76885 +
76886 +#if !defined (__FS_REISER4_WANDER_H__)
76887 +#define __FS_REISER4_WANDER_H__
76888 +
76889 +#include "dformat.h"
76890 +
76891 +#include <linux/fs.h> /* for struct super_block */
76892 +
76893 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
76894 +
76895 +#define TX_HEADER_MAGIC "TxMagic4"
76896 +#define WANDER_RECORD_MAGIC "LogMagc4"
76897 +
76898 +#define TX_HEADER_MAGIC_SIZE (8)
76899 +#define WANDER_RECORD_MAGIC_SIZE (8)
76900 +
76901 +/* journal header block format */
76902 +struct journal_header {
76903 + /* last written transaction head location */
76904 + d64 last_committed_tx;
76905 +};
76906 +
76907 +typedef struct journal_location {
76908 + reiser4_block_nr footer;
76909 + reiser4_block_nr header;
76910 +} journal_location;
76911 +
76912 +/* The wander.c head comment describes the usage and semantics of all these structures */
76913 +/* journal footer block format */
76914 +struct journal_footer {
76915 + /* last flushed transaction location. */
76916 +	/* This block number is no longer valid after the transaction it points
76917 +	   to gets flushed; it is used only at journal replay time to detect
76918 +	   the end of the on-disk list of committed transactions which were
76919 +	   not flushed completely */
76920 + d64 last_flushed_tx;
76921 +
76922 +	/* the free block counter is written to the journal footer at
76923 +	   transaction flush time, not to the super block, because the free
76924 +	   block counter is logged differently than other super block fields
76925 +	   (the root pointer, for example). */
76926 + d64 free_blocks;
76927 +
76928 + /* number of used OIDs and maximal used OID are logged separately from
76929 + super block */
76930 + d64 nr_files;
76931 + d64 next_oid;
76932 +};
76933 +
76934 +/* Each wander record (except the first one) has unified format with wander
76935 + record header followed by an array of log entries */
76936 +struct wander_record_header {
76937 + /* when there is no predefined location for wander records, this magic
76938 + string should help reiser4fsck. */
76939 + char magic[WANDER_RECORD_MAGIC_SIZE];
76940 +
76941 + /* transaction id */
76942 + d64 id;
76943 +
76944 + /* total number of wander records in current transaction */
76945 + d32 total;
76946 +
76947 + /* this block number in transaction */
76948 + d32 serial;
76949 +
76950 +	/* block number of the next wander record in the commit */
76951 + d64 next_block;
76952 +};
76953 +
76954 +/* The first wander record (transaction head) of written transaction has the
76955 + special format */
76956 +struct tx_header {
76957 + /* magic string makes first block in transaction different from other
76958 + logged blocks, it should help fsck. */
76959 + char magic[TX_HEADER_MAGIC_SIZE];
76960 +
76961 + /* transaction id */
76962 + d64 id;
76963 +
76964 + /* total number of records (including this first tx head) in the
76965 + transaction */
76966 + d32 total;
76967 +
76968 + /* align next field to 8-byte boundary; this field always is zero */
76969 + d32 padding;
76970 +
76971 + /* block number of previous transaction head */
76972 + d64 prev_tx;
76973 +
76974 + /* next wander record location */
76975 + d64 next_block;
76976 +
76977 + /* committed versions of free blocks counter */
76978 + d64 free_blocks;
76979 +
76980 + /* number of used OIDs (nr_files) and maximal used OID are logged
76981 + separately from super block */
76982 + d64 nr_files;
76983 + d64 next_oid;
76984 +};
76985 +
76986 +/* A transaction gets written to disk as a set of wander records (each wander
76987 +   record is one fs block in size) */
76988 +
76989 +/* The rest of a wander record, after its header, is filled with these log
76990 +   entries; unused space is filled with zeroes */
76991 +struct wander_entry {
76992 + d64 original; /* block original location */
76993 + d64 wandered; /* block wandered location */
76994 +};
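Given these layouts, the number of entries per record computed by
wander_record_capacity() follows directly from the block size:
entries = (block_size - sizeof(wander_record_header)) / sizeof(wander_entry).
A quick check with mirrored structs, under the assumptions that the fs block
size is 4 KiB and that d32/d64 are plain fixed-width integers that produce no
compiler padding here (fields are naturally aligned):

        #include <stdint.h>
        #include <stdio.h>

        /* mirrors of the on-disk structs above, uintN_t standing in for dN */
        struct wander_record_header {
                char magic[8];
                uint64_t id;
                uint32_t total;
                uint32_t serial;
                uint64_t next_block;
        };

        struct wander_entry {
                uint64_t original;
                uint64_t wandered;
        };

        int main(void)
        {
                unsigned block_size = 4096;     /* assumed fs block size */
                unsigned capacity =
                        (block_size - sizeof(struct wander_record_header))
                        / sizeof(struct wander_entry);

                /* (4096 - 32) / 16 = 254 entries per wander record */
                printf("capacity = %u\n", capacity);
                return 0;
        }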
76995 +
76996 +/* REISER4 JOURNAL WRITER FUNCTIONS */
76997 +
76998 +extern int reiser4_write_logs(long *);
76999 +extern int reiser4_journal_replay(struct super_block *);
77000 +extern int reiser4_journal_recover_sb_data(struct super_block *);
77001 +
77002 +extern int reiser4_init_journal_info(struct super_block *);
77003 +extern void reiser4_done_journal_info(struct super_block *);
77004 +
77005 +extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
77006 +
77007 +#endif /* __FS_REISER4_WANDER_H__ */
77008 +
77009 +/* Make Linus happy.
77010 + Local variables:
77011 + c-indentation-style: "K&R"
77012 + mode-name: "LC"
77013 + c-basic-offset: 8
77014 + tab-width: 8
77015 + fill-column: 80
77016 + scroll-step: 1
77017 + End:
77018 +*/
77019 diff -urN linux-2.6.33.orig/fs/reiser4/writeout.h linux-2.6.33/fs/reiser4/writeout.h
77020 --- linux-2.6.33.orig/fs/reiser4/writeout.h 1970-01-01 01:00:00.000000000 +0100
77021 +++ linux-2.6.33/fs/reiser4/writeout.h 2010-03-04 19:33:22.000000000 +0100
77022 @@ -0,0 +1,21 @@
77023 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
77024 +
77025 +#if !defined (__FS_REISER4_WRITEOUT_H__)
77026 +
77027 +#define WRITEOUT_SINGLE_STREAM (0x1)
77028 +#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
77029 +#define WRITEOUT_BARRIER (0x4)
77030 +
77031 +extern int reiser4_get_writeout_flags(void);
77032 +
77033 +#endif /* __FS_REISER4_WRITEOUT_H__ */
77034 +
77035 +/* Make Linus happy.
77036 + Local variables:
77037 + c-indentation-style: "K&R"
77038 + mode-name: "LC"
77039 + c-basic-offset: 8
77040 + tab-width: 8
77041 + fill-column: 80
77042 + End:
77043 +*/
77044 diff -urN linux-2.6.33.orig/fs/reiser4/znode.c linux-2.6.33/fs/reiser4/znode.c
77045 --- linux-2.6.33.orig/fs/reiser4/znode.c 1970-01-01 01:00:00.000000000 +0100
77046 +++ linux-2.6.33/fs/reiser4/znode.c 2010-03-04 19:33:22.000000000 +0100
77047 @@ -0,0 +1,1029 @@
77048 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77049 + * reiser4/README */
77050 +/* Znode manipulation functions. */
77051 +/* Znode is the in-memory header for a tree node. It is stored
77052 + separately from the node itself so that it does not get written to
77053 + disk. In this respect znode is like buffer head or page head. We
77054 + also use znodes for additional reiser4 specific purposes:
77055 +
77056 + . they are organized into tree structure which is a part of whole
77057 + reiser4 tree.
77058 + . they are used to implement node grained locking
77059 + . they are used to keep additional state associated with a
77060 + node
77061 + . they contain links to lists used by the transaction manager
77062 +
77063 + Znode is attached to some variable "block number" which is instance of
77064 + fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
77065 + appropriate node being actually loaded in memory. Existence of znode itself
77066 + is regulated by reference count (->x_count) in it. Each time thread
77067 + acquires reference to znode through call to zget(), ->x_count is
77068 + incremented and decremented on call to zput(). Data (content of node) are
77069 + brought in memory through call to zload(), which also increments ->d_count
77070 + reference counter. zload can block waiting on IO. Call to zrelse()
77071 + decreases this counter. Also, ->c_count keeps track of number of child
77072 + znodes and prevents parent znode from being recycled until all of its
77073 + children are. ->c_count is decremented whenever child goes out of existence
77074 + (being actually recycled in zdestroy()) which can be some time after last
77075 + reference to this child dies if we support some form of LRU cache for
77076 + znodes.
77077 +
77078 +*/
77079 +/* EVERY ZNODE'S STORY
77080 +
77081 + 1. His infancy.
77082 +
77083 + Once upon a time, the znode was born deep inside of zget() by call to
77084 + zalloc(). At the return from zget() znode had:
77085 +
77086 + . reference counter (x_count) of 1
77087 + . assigned block number, marked as used in bitmap
77088 + . pointer to parent znode. Root znode parent pointer points
77089 + to its father: "fake" znode. This, in turn, has NULL parent pointer.
77090 + . hash table linkage
77091 + . no data loaded from disk
77092 + . no node plugin
77093 + . no sibling linkage
77094 +
77095 + 2. His childhood
77096 +
77097 + Each node is either brought into memory as a result of tree traversal, or
77098 + created afresh, creation of the root being a special case of the latter. In
77099 + either case it's inserted into sibling list. This will typically require
77100 + some ancillary tree traversing, but ultimately both sibling pointers will
77101 + exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
77102 + zjnode.state.
77103 +
77104 + 3. His youth.
77105 +
77106 + If znode is bound to already existing node in a tree, its content is read
77107 + from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
77108 + in zjnode.state and zdata() function starts to return non null for this
77109 + znode. zload() further calls zparse() that determines which node layout
77110 + this node is rendered in, and sets ->nplug on success.
77111 +
77112 + If znode is for new node just created, memory for it is allocated and
77113 + zinit_new() function is called to initialise data, according to selected
77114 + node layout.
77115 +
77116 + 4. His maturity.
77117 +
77118 + After this point, znode lingers in memory for some time. Threads can
77119 + acquire references to znode either by blocknr through call to zget(), or by
77120 + following a pointer to unallocated znode from internal item. Each time
77121 + reference to znode is obtained, x_count is increased. Thread can read/write
77122 + lock znode. Znode data can be loaded through calls to zload(), d_count will
77123 + be increased appropriately. If all references to znode are released
77124 + (x_count drops to 0), znode is not recycled immediately. Rather, it is
77125 + still cached in the hash table in the hope that it will be accessed
77126 + shortly.
77127 +
77128 + There are two ways in which znode existence can be terminated:
77129 +
77130 + . sudden death: node bound to this znode is removed from the tree
77131 + . overpopulation: znode is purged out of memory due to memory pressure
77132 +
77133 + 5. His death.
77134 +
77135 +   Death is a complex process.
77136 +
77137 + When we irrevocably commit ourselves to decision to remove node from the
77138 + tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
77139 + znode. This is done either in ->kill_hook() of internal item or in
77140 + reiser4_kill_root() function when tree root is removed.
77141 +
77142 + At this moment znode still has:
77143 +
77144 +   . locks held on it, necessarily write ones
77145 + . references to it
77146 + . disk block assigned to it
77147 + . data loaded from the disk
77148 + . pending requests for lock
77149 +
77150 +   But once the JNODE_HEARD_BANSHEE bit is set, the last call to unlock_znode()
77151 +   performs node deletion. Node deletion includes two phases. First, all ways
77152 +   to get references to that znode (sibling and parent links and hash lookup
77153 +   using the block number stored in the parent node) are deleted -- this is
77154 +   done through sibling_list_remove(); also we assume that nobody uses the
77155 +   down link from the parent node due to its nonexistence or proper parent
77156 +   node locking, and nobody uses parent pointers from children due to their
77157 +   absence. Second, we invalidate all pending lock requests which are still
77158 +   on the znode's lock request queue; this is done by
77159 +   reiser4_invalidate_lock(). Another znode status bit, JNODE_IS_DYING, is
77160 +   used to invalidate pending lock requests. Once it is set, all requesters
77161 +   are forced to return -EINVAL from longterm_lock_znode(). Future locking
77162 +   attempts are not possible because all ways to get references to that znode
77163 +   have already been removed. Last, the node is uncaptured from its transaction.
77164 +
77165 + When last reference to the dying znode is just about to be released,
77166 + block number for this lock is released and znode is removed from the
77167 + hash table.
77168 +
77169 + Now znode can be recycled.
77170 +
77171 + [it's possible to free bitmap block and remove znode from the hash
77172 + table when last lock is released. This will result in having
77173 + referenced but completely orphaned znode]
77174 +
77175 + 6. Limbo
77176 +
77177 + As have been mentioned above znodes with reference counter 0 are
77178 + still cached in a hash table. Once memory pressure increases they are
77179 + purged out of there [this requires something like LRU list for
77180 + efficient implementation. LRU list would also greatly simplify
77181 + implementation of coord cache that would in this case morph to just
77182 + scanning some initial segment of LRU list]. Data loaded into
77183 + unreferenced znode are flushed back to the durable storage if
77184 + necessary and memory is freed. Znodes themselves can be recycled at
77185 + this point too.
77186 +
77187 +*/
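The two-counter protocol described above (x_count for the znode header's
existence, d_count for its loaded data) can be modeled with a toy object in
userspace. This is purely illustrative: the names and the eager teardown
policy below are assumptions of the sketch, not the kernel implementation:

        #include <stdio.h>
        #include <stdlib.h>

        /* toy model of a znode's two counters */
        struct toy_znode {
                int x_count;    /* references to the header (zget/zput) */
                int d_count;    /* references to loaded data (zload/zrelse) */
                char *data;     /* "node content", NULL until loaded */
        };

        static void toy_zload(struct toy_znode *z)
        {
                if (z->d_count++ == 0)
                        z->data = malloc(4096); /* first user brings data in */
        }

        static void toy_zrelse(struct toy_znode *z)
        {
                if (--z->d_count == 0) {        /* last user; data may go */
                        free(z->data);
                        z->data = NULL;
                }
        }

        int main(void)
        {
                struct toy_znode z = { .x_count = 1 }; /* as if from zget() */

                toy_zload(&z);
                printf("loaded: %p d_count=%d\n", (void *)z.data, z.d_count);
                toy_zrelse(&z);
                printf("released: %p d_count=%d\n", (void *)z.data, z.d_count);
                /* dropping x_count to 0 would make the header recyclable */
                return 0;
        }

The real teardown is lazier: as the "Limbo" stage above notes, a znode whose
x_count drops to zero stays hashed until memory pressure evicts it.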
77188 +
77189 +#include "debug.h"
77190 +#include "dformat.h"
77191 +#include "key.h"
77192 +#include "coord.h"
77193 +#include "plugin/plugin_header.h"
77194 +#include "plugin/node/node.h"
77195 +#include "plugin/plugin.h"
77196 +#include "txnmgr.h"
77197 +#include "jnode.h"
77198 +#include "znode.h"
77199 +#include "block_alloc.h"
77200 +#include "tree.h"
77201 +#include "tree_walk.h"
77202 +#include "super.h"
77203 +#include "reiser4.h"
77204 +
77205 +#include <linux/pagemap.h>
77206 +#include <linux/spinlock.h>
77207 +#include <linux/slab.h>
77208 +#include <linux/err.h>
77209 +
77210 +static z_hash_table *get_htable(reiser4_tree *,
77211 + const reiser4_block_nr * const blocknr);
77212 +static z_hash_table *znode_get_htable(const znode *);
77213 +static void zdrop(znode *);
77214 +
77215 +/* hash table support */
77216 +
77217 +/* compare two block numbers for equality. Used by hash-table macros */
77218 +static inline int
77219 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
77220 +{
77221 + assert("nikita-534", b1 != NULL);
77222 + assert("nikita-535", b2 != NULL);
77223 +
77224 + return *b1 == *b2;
77225 +}
77226 +
77227 +/* Hash znode by block number. Used by hash-table macros */
77228 +/* Audited by: umka (2002.06.11) */
77229 +static inline __u32
77230 +blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
77231 +{
77232 + assert("nikita-536", b != NULL);
77233 +
77234 + return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
77235 +}
77236 +
77237 +/* The hash table definition */
77238 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
77239 +#define KFREE(ptr, size) kfree(ptr)
77240 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
77241 + blknrhashfn, blknreq);
77242 +#undef KFREE
77243 +#undef KMALLOC
77244 +
77245 +/* slab for znodes */
77246 +static struct kmem_cache *znode_cache;
77247 +
77248 +int znode_shift_order;
77249 +
77250 +/**
77251 + * init_znodes - create znode cache
77252 + *
77253 + * Initializes slab cache of znodes. It is part of reiser4 module initialization.
77254 + */
77255 +int init_znodes(void)
77256 +{
77257 + znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
77258 + SLAB_HWCACHE_ALIGN |
77259 + SLAB_RECLAIM_ACCOUNT, NULL);
77260 + if (znode_cache == NULL)
77261 + return RETERR(-ENOMEM);
77262 +
77263 +	/* find the largest order such that (1 << order) < sizeof(znode) */
77264 +	for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
77265 +	     ++znode_shift_order);
77266 +	--znode_shift_order;
77266 + return 0;
77267 +}
77268 +
77269 +/**
77270 + * done_znodes - delete znode cache
77271 + *
77272 + * This is called on reiser4 module unloading or system shutdown.
77273 + */
77274 +void done_znodes(void)
77275 +{
77276 + destroy_reiser4_cache(&znode_cache);
77277 +}
77278 +
77279 +/* call this to initialise tree of znodes */
77280 +int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
77281 +{
77282 + int result;
77283 + assert("umka-050", tree != NULL);
77284 +
77285 + rwlock_init(&tree->dk_lock);
77286 +
77287 + result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
77288 + if (result != 0)
77289 + return result;
77290 + result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
77291 + return result;
77292 +}
77293 +
77294 +/* free this znode */
77295 +void zfree(znode * node /* znode to free */ )
77296 +{
77297 + assert("nikita-465", node != NULL);
77298 + assert("nikita-2120", znode_page(node) == NULL);
77299 + assert("nikita-2301", list_empty_careful(&node->lock.owners));
77300 + assert("nikita-2302", list_empty_careful(&node->lock.requestors));
77301 + assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
77302 + NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
77303 + assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
77304 + assert("nikita-3293", !znode_is_right_connected(node));
77305 + assert("nikita-3294", !znode_is_left_connected(node));
77306 + assert("nikita-3295", node->left == NULL);
77307 + assert("nikita-3296", node->right == NULL);
77308 +
77309 + /* not yet phash_jnode_destroy(ZJNODE(node)); */
77310 +
77311 + kmem_cache_free(znode_cache, node);
77312 +}
77313 +
77314 +/* call this to free tree of znodes */
77315 +void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
77316 +{
77317 + znode *node;
77318 + znode *next;
77319 + z_hash_table *ztable;
77320 +
77321 + /* scan znode hash-tables and kill all znodes, then free hash tables
77322 + * themselves. */
77323 +
77324 + assert("nikita-795", tree != NULL);
77325 +
77326 + ztable = &tree->zhash_table;
77327 +
77328 + if (ztable->_table != NULL) {
77329 + for_all_in_htable(ztable, z, node, next) {
77330 + node->c_count = 0;
77331 + node->in_parent.node = NULL;
77332 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
77333 + zdrop(node);
77334 + }
77335 +
77336 + z_hash_done(&tree->zhash_table);
77337 + }
77338 +
77339 + ztable = &tree->zfake_table;
77340 +
77341 + if (ztable->_table != NULL) {
77342 + for_all_in_htable(ztable, z, node, next) {
77343 + node->c_count = 0;
77344 + node->in_parent.node = NULL;
77345 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
77346 + zdrop(node);
77347 + }
77348 +
77349 + z_hash_done(&tree->zfake_table);
77350 + }
77351 +}
77352 +
77353 +/* ZNODE STRUCTURES */
77354 +
77355 +/* allocate fresh znode */
77356 +znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
77357 +{
77358 + znode *node;
77359 +
77360 + node = kmem_cache_alloc(znode_cache, gfp_flag);
77361 + return node;
77362 +}
77363 +
77364 +/* Initialize fields of znode
77365 + @node: znode to initialize;
77366 + @parent: parent znode;
77367 + @tree: tree we are in. */
77368 +void zinit(znode * node, const znode * parent, reiser4_tree * tree)
77369 +{
77370 + assert("nikita-466", node != NULL);
77371 + assert("umka-268", current_tree != NULL);
77372 +
77373 + memset(node, 0, sizeof *node);
77374 +
77375 + assert("umka-051", tree != NULL);
77376 +
77377 + jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
77378 + reiser4_init_lock(&node->lock);
77379 + init_parent_coord(&node->in_parent, parent);
77380 +}
77381 +
77382 +/*
77383 + * remove znode from indices. This is called by jput() when the last
77384 + * reference to the znode is released.
77385 + */
77386 +void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
77387 +{
77388 + assert("nikita-2108", node != NULL);
77389 + assert("nikita-470", node->c_count == 0);
77390 + assert_rw_write_locked(&(tree->tree_lock));
77391 +
77392 + /* remove reference to this znode from cbk cache */
77393 + cbk_cache_invalidate(node, tree);
77394 +
77395 + /* update c_count of parent */
77396 + if (znode_parent(node) != NULL) {
77397 + assert("nikita-472", znode_parent(node)->c_count > 0);
77398 + /* father, onto your hands I forward my spirit... */
77399 + znode_parent(node)->c_count--;
77400 + node->in_parent.node = NULL;
77401 + } else {
77402 + /* orphaned znode?! Root? */
77403 + }
77404 +
77405 + /* remove znode from hash-table */
77406 + z_hash_remove_rcu(znode_get_htable(node), node);
77407 +}
77408 +
77409 +/* zdrop() -- Remove znode from the tree.
77410 +
77411 + This is called when znode is removed from the memory. */
77412 +static void zdrop(znode * node /* znode to finish with */ )
77413 +{
77414 + jdrop(ZJNODE(node));
77415 +}
77416 +
77417 +/*
77418 + * put znode into right place in the hash table. This is called by relocate
77419 + * code.
77420 + */
77421 +int znode_rehash(znode * node /* node to rehash */ ,
77422 + const reiser4_block_nr * new_block_nr /* new block number */ )
77423 +{
77424 + z_hash_table *oldtable;
77425 + z_hash_table *newtable;
77426 + reiser4_tree *tree;
77427 +
77428 + assert("nikita-2018", node != NULL);
77429 +
77430 + tree = znode_get_tree(node);
77431 + oldtable = znode_get_htable(node);
77432 + newtable = get_htable(tree, new_block_nr);
77433 +
77434 + write_lock_tree(tree);
77435 + /* remove znode from hash-table */
77436 + z_hash_remove_rcu(oldtable, node);
77437 +
77438 + /* assertion no longer valid due to RCU */
77439 + /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
77440 +
77441 + /* update blocknr */
77442 + znode_set_block(node, new_block_nr);
77443 + node->zjnode.key.z = *new_block_nr;
77444 +
77445 + /* insert it into hash */
77446 + z_hash_insert_rcu(newtable, node);
77447 + write_unlock_tree(tree);
77448 + return 0;
77449 +}
77450 +
77451 +/* ZNODE LOOKUP, GET, PUT */
77452 +
77453 +/* zlook() - get znode with given block_nr in a hash table or return NULL
77454 +
77455 +   If the result is non-NULL then the znode's x_count is incremented. The
77456 +   internal version accepts a pre-computed hash index. The hash table is
77457 +   accessed under rcu_read_lock().
77458 +*/
77459 +znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
77460 +{
77461 + znode *result;
77462 + __u32 hash;
77463 + z_hash_table *htable;
77464 +
77465 + assert("jmacd-506", tree != NULL);
77466 + assert("jmacd-507", blocknr != NULL);
77467 +
77468 + htable = get_htable(tree, blocknr);
77469 + hash = blknrhashfn(htable, blocknr);
77470 +
77471 + rcu_read_lock();
77472 + result = z_hash_find_index(htable, hash, blocknr);
77473 +
77474 + if (result != NULL) {
77475 + add_x_ref(ZJNODE(result));
77476 + result = znode_rip_check(tree, result);
77477 + }
77478 + rcu_read_unlock();
77479 +
77480 + return result;
77481 +}
77482 +
77483 +/* return hash table where znode with block @blocknr is (or should be)
77484 + * stored */
77485 +static z_hash_table *get_htable(reiser4_tree * tree,
77486 + const reiser4_block_nr * const blocknr)
77487 +{
77488 + z_hash_table *table;
77489 + if (is_disk_addr_unallocated(blocknr))
77490 + table = &tree->zfake_table;
77491 + else
77492 + table = &tree->zhash_table;
77493 + return table;
77494 +}
77495 +
77496 +/* return hash table where znode @node is (or should be) stored */
77497 +static z_hash_table *znode_get_htable(const znode * node)
77498 +{
77499 + return get_htable(znode_get_tree(node), znode_get_block(node));
77500 +}
77501 +
77502 +/* zget() - get znode from hash table, allocating it if necessary.
77503 +
77504 +   First a call to zlook locates an x-referenced znode if one
77505 +   exists. If no znode is found, a new one is allocated and returned. The
77506 +   result is returned with its x_count reference increased.
77507 +
77508 + LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
77509 + LOCK ORDERING: NONE
77510 +*/
77511 +znode *zget(reiser4_tree * tree,
77512 + const reiser4_block_nr * const blocknr,
77513 + znode * parent, tree_level level, gfp_t gfp_flag)
77514 +{
77515 + znode *result;
77516 + __u32 hashi;
77517 +
77518 + z_hash_table *zth;
77519 +
77520 + assert("jmacd-512", tree != NULL);
77521 + assert("jmacd-513", blocknr != NULL);
77522 + assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
77523 +
77524 + zth = get_htable(tree, blocknr);
77525 + hashi = blknrhashfn(zth, blocknr);
77526 +
77527 + /* NOTE-NIKITA address-as-unallocated-blocknr still is not
77528 + implemented. */
77529 +
77530 + z_hash_prefetch_bucket(zth, hashi);
77531 +
77532 + rcu_read_lock();
77533 + /* Find a matching BLOCKNR in the hash table. If the znode is found,
77534 +	   we obtain a reference (x_count) but the znode remains unlocked.
77535 + Have to worry about race conditions later. */
77536 + result = z_hash_find_index(zth, hashi, blocknr);
77537 + /* According to the current design, the hash table lock protects new
77538 + znode references. */
77539 + if (result != NULL) {
77540 + add_x_ref(ZJNODE(result));
77541 + /* NOTE-NIKITA it should be so, but special case during
77542 + creation of new root makes such assertion highly
77543 + complicated. */
77544 + assert("nikita-2131", 1 || znode_parent(result) == parent ||
77545 + (ZF_ISSET(result, JNODE_ORPHAN)
77546 + && (znode_parent(result) == NULL)));
77547 + result = znode_rip_check(tree, result);
77548 + }
77549 +
77550 + rcu_read_unlock();
77551 +
77552 + if (!result) {
77553 + znode *shadow;
77554 +
77555 + result = zalloc(gfp_flag);
77556 + if (!result) {
77557 + return ERR_PTR(RETERR(-ENOMEM));
77558 + }
77559 +
77560 + zinit(result, parent, tree);
77561 + ZJNODE(result)->blocknr = *blocknr;
77562 + ZJNODE(result)->key.z = *blocknr;
77563 + result->level = level;
77564 +
77565 + write_lock_tree(tree);
77566 +
77567 + shadow = z_hash_find_index(zth, hashi, blocknr);
77568 + if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
77569 + jnode_list_remove(ZJNODE(result));
77570 + zfree(result);
77571 + result = shadow;
77572 + } else {
77573 + result->version = znode_build_version(tree);
77574 + z_hash_insert_index_rcu(zth, hashi, result);
77575 +
77576 + if (parent != NULL)
77577 + ++parent->c_count;
77578 + }
77579 +
77580 + add_x_ref(ZJNODE(result));
77581 +
77582 + write_unlock_tree(tree);
77583 + }
77584 +#if REISER4_DEBUG
77585 + if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
77586 + reiser4_check_block(blocknr, 1);
77587 +#endif
77588 + /* Check for invalid tree level, return -EIO */
77589 + if (unlikely(znode_get_level(result) != level)) {
77590 + warning("jmacd-504",
77591 + "Wrong level for cached block %llu: %i expecting %i",
77592 + (unsigned long long)(*blocknr), znode_get_level(result),
77593 + level);
77594 + zput(result);
77595 + return ERR_PTR(RETERR(-EIO));
77596 + }
77597 +
77598 + assert("nikita-1227", znode_invariant(result));
77599 +
77600 + return result;
77601 +}
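+
+/* Illustrative sketch, not part of the original reiser4 code: a typical
+ * zget()/zput() pairing. zget() can return ERR_PTR(), so the result must
+ * be checked with IS_ERR() before use; the x_count reference taken by
+ * zget() pins the znode until zput(). GFP_KERNEL is used here for
+ * illustration only; real callers pass a context-appropriate gfp mask. */
+static int example_pin_node(reiser4_tree * tree,
+			    const reiser4_block_nr * blocknr,
+			    znode * parent, tree_level level)
+{
+	znode *node;
+
+	node = zget(tree, blocknr, parent, level, GFP_KERNEL);
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+	/* ... zload()/zrelse() and actual work would go here ... */
+	zput(node);
+	return 0;
+}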
77602 +
77603 +/* ZNODE PLUGINS/DATA */
77604 +
77605 +/* "guess" plugin for a node loaded from disk. The plugin id of the node
77606 +   plugin is stored at a fixed offset from the beginning of the node. */
77607 +static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
77608 + * plugin of */ )
77609 +{
77610 + reiser4_tree *tree;
77611 +
77612 + assert("nikita-1053", node != NULL);
77613 + assert("nikita-1055", zdata(node) != NULL);
77614 +
77615 + tree = znode_get_tree(node);
77616 + assert("umka-053", tree != NULL);
77617 +
77618 + if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
77619 + return tree->nplug;
77620 + } else {
77621 + return node_plugin_by_disk_id
77622 + (tree, &((common_node_header *) zdata(node))->plugin_id);
77623 +#ifdef GUESS_EXISTS
77624 + reiser4_plugin *plugin;
77625 +
77626 + /* NOTE-NIKITA add locking here when dynamic plugins will be
77627 + * implemented */
77628 + for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
77629 + if ((plugin->u.node.guess != NULL)
77630 + && plugin->u.node.guess(node))
77631 + return plugin;
77632 + }
77633 + warning("nikita-1057", "Cannot guess node plugin");
77634 + print_znode("node", node);
77635 + return NULL;
77636 +#endif
77637 + }
77638 +}
77639 +
77640 +/* parse node header and install ->node_plugin */
77641 +int zparse(znode * node /* znode to parse */ )
77642 +{
77643 + int result;
77644 +
77645 + assert("nikita-1233", node != NULL);
77646 + assert("nikita-2370", zdata(node) != NULL);
77647 +
77648 + if (node->nplug == NULL) {
77649 + node_plugin *nplug;
77650 +
77651 + nplug = znode_guess_plugin(node);
77652 + if (likely(nplug != NULL)) {
77653 + result = nplug->parse(node);
77654 + if (likely(result == 0))
77655 + node->nplug = nplug;
77656 + } else {
77657 + result = RETERR(-EIO);
77658 + }
77659 + } else
77660 + result = 0;
77661 + return result;
77662 +}
77663 +
77664 +/* zload with readahead */
77665 +int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
77666 +{
77667 + int result;
77668 +
77669 + assert("nikita-484", node != NULL);
77670 + assert("nikita-1377", znode_invariant(node));
77671 + assert("jmacd-7771", !znode_above_root(node));
77672 + assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
77673 + assert("nikita-3016", reiser4_schedulable());
77674 +
77675 + if (info)
77676 + formatted_readahead(node, info);
77677 +
77678 + result = jload(ZJNODE(node));
77679 + assert("nikita-1378", znode_invariant(node));
77680 + return result;
77681 +}
77682 +
77683 +/* load content of node into memory */
77684 +int zload(znode * node)
77685 +{
77686 + return zload_ra(node, NULL);
77687 +}
77688 +
77689 +/* call node plugin to initialise newly allocated node. */
77690 +int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
77691 +{
77692 + return jinit_new(ZJNODE(node), gfp_flags);
77693 +}
77694 +
77695 +/* drop reference to node data. When the last reference is dropped, the
77696 +   data are unloaded. */
77697 +void zrelse(znode * node /* znode to release references to */ )
77698 +{
77699 + assert("nikita-1381", znode_invariant(node));
77700 +
77701 + jrelse(ZJNODE(node));
77702 +}
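+
+/* Illustrative sketch, not part of the original reiser4 code: every
+ * successful zload() must be balanced by exactly one zrelse(). When
+ * zload() fails the data was never pinned, so zrelse() must not be
+ * called; zdata() is only valid between the two calls. */
+static int example_inspect(znode * node)
+{
+	int result;
+
+	result = zload(node);
+	if (result != 0)
+		return result;
+	/* ... zdata(node) may safely be dereferenced here ... */
+	zrelse(node);
+	return 0;
+}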
77703 +
77704 +/* returns free space in node */
77705 +unsigned znode_free_space(znode * node /* znode to query */ )
77706 +{
77707 + assert("nikita-852", node != NULL);
77708 + return node_plugin_by_node(node)->free_space(node);
77709 +}
77710 +
77711 +/* right delimiting key of znode */
77712 +reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
77713 +{
77714 + assert("nikita-958", node != NULL);
77715 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77716 + assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
77717 + assert("nikita-30671", node->rd_key_version != 0);
77718 + return &node->rd_key;
77719 +}
77720 +
77721 +/* left delimiting key of znode */
77722 +reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
77723 +{
77724 + assert("nikita-974", node != NULL);
77725 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77726 + assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
77727 + assert("nikita-30681", node->ld_key_version != 0);
77728 + return &node->ld_key;
77729 +}
77730 +
77731 +ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
77732 + )
77733 +
77734 +/* update right-delimiting key of @node */
77735 +reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
77736 +{
77737 + assert("nikita-2937", node != NULL);
77738 + assert("nikita-2939", key != NULL);
77739 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77740 + assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
77741 + assert("nikita-2944",
77742 + znode_is_any_locked(node) ||
77743 + znode_get_level(node) != LEAF_LEVEL ||
77744 + keyge(key, &node->rd_key) ||
77745 + keyeq(&node->rd_key, reiser4_min_key()) ||
77746 + ZF_ISSET(node, JNODE_HEARD_BANSHEE));
77747 +
77748 + node->rd_key = *key;
77749 + ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
77750 + return &node->rd_key;
77751 +}
77752 +
77753 +/* update left-delimiting key of @node */
77754 +reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
77755 +{
77756 + assert("nikita-2940", node != NULL);
77757 + assert("nikita-2941", key != NULL);
77758 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77759 + assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
77760 + assert("nikita-2943",
77761 + znode_is_any_locked(node) || keyeq(&node->ld_key,
77762 + reiser4_min_key()));
77763 +
77764 + node->ld_key = *key;
77765 + ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
77766 + return &node->ld_key;
77767 +}
77768 +
77769 +/* true if @key is inside key range for @node */
77770 +int znode_contains_key(znode * node /* znode to look in */ ,
77771 + const reiser4_key * key /* key to look for */ )
77772 +{
77773 + assert("nikita-1237", node != NULL);
77774 + assert("nikita-1238", key != NULL);
77775 +
77776 + /* left_delimiting_key <= key <= right_delimiting_key */
77777 + return keyle(znode_get_ld_key(node), key)
77778 + && keyle(key, znode_get_rd_key(node));
77779 +}
77780 +
77781 +/* same as znode_contains_key(), but takes the dk lock */
77782 +int znode_contains_key_lock(znode * node /* znode to look in */ ,
77783 + const reiser4_key * key /* key to look for */ )
77784 +{
77785 + int result;
77786 +
77787 + assert("umka-056", node != NULL);
77788 + assert("umka-057", key != NULL);
77789 +
77790 + read_lock_dk(znode_get_tree(node));
77791 + result = znode_contains_key(node, key);
77792 + read_unlock_dk(znode_get_tree(node));
77793 + return result;
77794 +}
77795 +
77796 +/* get parent pointer, assuming tree is not locked */
77797 +znode *znode_parent_nolock(const znode * node /* child znode */ )
77798 +{
77799 + assert("nikita-1444", node != NULL);
77800 + return node->in_parent.node;
77801 +}
77802 +
77803 +/* get parent pointer of znode */
77804 +znode *znode_parent(const znode * node /* child znode */ )
77805 +{
77806 + assert("nikita-1226", node != NULL);
77807 + assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
77808 + return znode_parent_nolock(node);
77809 +}
77810 +
77811 +/* detect uber znode used to protect in-superblock tree root pointer */
77812 +int znode_above_root(const znode * node /* znode to query */ )
77813 +{
77814 + assert("umka-059", node != NULL);
77815 +
77816 + return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
77817 +}
77818 +
77819 +/* check that @node is the true root, i.e., that its block number is
77820 +   recorded in the tree as that of the root node */
77821 +#if REISER4_DEBUG
77822 +static int znode_is_true_root(const znode * node /* znode to query */ )
77823 +{
77824 + assert("umka-060", node != NULL);
77825 + assert("umka-061", current_tree != NULL);
77826 +
77827 + return disk_addr_eq(znode_get_block(node),
77828 + &znode_get_tree(node)->root_block);
77829 +}
77830 +#endif
77831 +
77832 +/* check that @node is root */
77833 +int znode_is_root(const znode * node /* znode to query */ )
77834 +{
77835 + assert("nikita-1206", node != NULL);
77836 +
77837 + return znode_get_level(node) == znode_get_tree(node)->height;
77838 +}
77839 +
77840 +/* Returns true if @node was just created by zget() and was never loaded
77841 +   into memory. */
77842 +/* NIKITA-HANS: yes */
77843 +int znode_just_created(const znode * node)
77844 +{
77845 + assert("nikita-2188", node != NULL);
77846 + return (znode_page(node) == NULL);
77847 +}
77848 +
77849 +/* obtain updated ->znode_epoch. See seal.c for description. */
77850 +__u64 znode_build_version(reiser4_tree * tree)
77851 +{
77852 + __u64 result;
77853 +
77854 + spin_lock(&tree->epoch_lock);
77855 + result = ++tree->znode_epoch;
77856 + spin_unlock(&tree->epoch_lock);
77857 + return result;
77858 +}
77859 +
77860 +void init_load_count(load_count * dh)
77861 +{
77862 + assert("nikita-2105", dh != NULL);
77863 + memset(dh, 0, sizeof *dh);
77864 +}
77865 +
77866 +void done_load_count(load_count * dh)
77867 +{
77868 + assert("nikita-2106", dh != NULL);
77869 + if (dh->node != NULL) {
77870 + for (; dh->d_ref > 0; --dh->d_ref)
77871 + zrelse(dh->node);
77872 + dh->node = NULL;
77873 + }
77874 +}
77875 +
77876 +static int incr_load_count(load_count * dh)
77877 +{
77878 + int result;
77879 +
77880 + assert("nikita-2110", dh != NULL);
77881 + assert("nikita-2111", dh->node != NULL);
77882 +
77883 + result = zload(dh->node);
77884 + if (result == 0)
77885 + ++dh->d_ref;
77886 + return result;
77887 +}
77888 +
77889 +int incr_load_count_znode(load_count * dh, znode * node)
77890 +{
77891 + assert("nikita-2107", dh != NULL);
77892 + assert("nikita-2158", node != NULL);
77893 + assert("nikita-2109",
77894 + ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
77895 +
77896 + dh->node = node;
77897 + return incr_load_count(dh);
77898 +}
77899 +
77900 +int incr_load_count_jnode(load_count * dh, jnode * node)
77901 +{
77902 + if (jnode_is_znode(node)) {
77903 + return incr_load_count_znode(dh, JZNODE(node));
77904 + }
77905 + return 0;
77906 +}
77907 +
77908 +void copy_load_count(load_count * new, load_count * old)
77909 +{
77910 + int ret = 0;
77911 + done_load_count(new);
77912 + new->node = old->node;
77913 + new->d_ref = 0;
77914 +
77915 + while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
77916 + }
77917 +
77918 + assert("jmacd-87589", ret == 0);
77919 +}
77920 +
77921 +void move_load_count(load_count * new, load_count * old)
77922 +{
77923 + done_load_count(new);
77924 + new->node = old->node;
77925 + new->d_ref = old->d_ref;
77926 + old->node = NULL;
77927 + old->d_ref = 0;
77928 +}
77929 +
77930 +/* convert parent pointer into coord */
77931 +void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
77932 +{
77933 + assert("nikita-3204", pcoord != NULL);
77934 + assert("nikita-3205", coord != NULL);
77935 +
77936 + coord_init_first_unit_nocheck(coord, pcoord->node);
77937 + coord_set_item_pos(coord, pcoord->item_pos);
77938 + coord->between = AT_UNIT;
77939 +}
77940 +
77941 +/* pack coord into parent_coord_t */
77942 +void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
77943 +{
77944 + assert("nikita-3206", pcoord != NULL);
77945 + assert("nikita-3207", coord != NULL);
77946 +
77947 + pcoord->node = coord->node;
77948 + pcoord->item_pos = coord->item_pos;
77949 +}
77950 +
77951 +/* Initialize a parent hint pointer. (The parent hint pointer is a field
77952 +   in znode; see the comments there.) */
77953 +void init_parent_coord(parent_coord_t * pcoord, const znode * node)
77954 +{
77955 + pcoord->node = (znode *) node;
77956 + pcoord->item_pos = (unsigned short)~0;
77957 +}
77958 +
77959 +#if REISER4_DEBUG
77960 +
77961 +/* debugging aid: znode invariant */
77962 +static int znode_invariant_f(const znode * node /* znode to check */ ,
77963 + char const **msg /* where to store error
77964 + * message, if any */ )
77965 +{
77966 +#define _ergo(ant, con) \
77967 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
77968 +
77969 +#define _equi(e1, e2) \
77970 + ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
77971 +
77972 +#define _check(exp) ((*msg) = #exp, (exp))
77973 +
77974 + return jnode_invariant_f(ZJNODE(node), msg) &&
77975 + /* [znode-fake] invariant */
77976 + /* fake znode doesn't have a parent, and */
77977 + _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
77978 + /* there is another way to express this very check, and */
77979 + _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
77980 + /* it has special block number, and */
77981 + _ergo(znode_get_level(node) == 0,
77982 + disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77983 + /* it is the only znode with such block number, and */
77984 + _ergo(!znode_above_root(node) && znode_is_loaded(node),
77985 + !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77986 + /* it is parent of the tree root node */
77987 + _ergo(znode_is_true_root(node),
77988 + znode_above_root(znode_parent(node))) &&
77989 + /* [znode-level] invariant */
77990 + /* level of parent znode is one larger than that of child,
77991 + except for the fake znode, and */
77992 + _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
77993 + znode_get_level(znode_parent(node)) ==
77994 + znode_get_level(node) + 1) &&
77995 + /* left neighbor is at the same level, and */
77996 + _ergo(znode_is_left_connected(node) && node->left != NULL,
77997 + znode_get_level(node) == znode_get_level(node->left)) &&
77998 + /* right neighbor is at the same level */
77999 + _ergo(znode_is_right_connected(node) && node->right != NULL,
78000 + znode_get_level(node) == znode_get_level(node->right)) &&
78001 + /* [znode-connected] invariant */
78002 + _ergo(node->left != NULL, znode_is_left_connected(node)) &&
78003 + _ergo(node->right != NULL, znode_is_right_connected(node)) &&
78004 + _ergo(!znode_is_root(node) && node->left != NULL,
78005 + znode_is_right_connected(node->left) &&
78006 + node->left->right == node) &&
78007 + _ergo(!znode_is_root(node) && node->right != NULL,
78008 + znode_is_left_connected(node->right) &&
78009 + node->right->left == node) &&
78010 + /* [znode-c_count] invariant */
78011 + /* for any znode, c_count of its parent is greater than 0 */
78012 + _ergo(znode_parent(node) != NULL &&
78013 + !znode_above_root(znode_parent(node)),
78014 + znode_parent(node)->c_count > 0) &&
78015 + /* leaves don't have children */
78016 + _ergo(znode_get_level(node) == LEAF_LEVEL,
78017 + node->c_count == 0) &&
78018 + _check(node->zjnode.jnodes.prev != NULL) &&
78019 + _check(node->zjnode.jnodes.next != NULL) &&
78020 + /* orphan doesn't have a parent */
78021 + _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
78022 + /* [znode-modify] invariant */
78023 + /* if znode is not write-locked, its checksum remains
78024 + * invariant */
78025 + /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
78026 + * cannot check this. */
78027 + /* [znode-refs] invariant */
78028 + /* only referenced znode can be long-term locked */
78029 + _ergo(znode_is_locked(node),
78030 + atomic_read(&ZJNODE(node)->x_count) != 0);
78031 +}
78032 +
78033 +/* debugging aid: check znode invariant and panic if it doesn't hold */
78034 +int znode_invariant(znode * node /* znode to check */ )
78035 +{
78036 + char const *failed_msg;
78037 + int result;
78038 +
78039 + assert("umka-063", node != NULL);
78040 + assert("umka-064", current_tree != NULL);
78041 +
78042 + spin_lock_znode(node);
78043 + read_lock_tree(znode_get_tree(node));
78044 + result = znode_invariant_f(node, &failed_msg);
78045 + if (!result) {
78046 + /* print_znode("corrupted node", node); */
78047 + warning("jmacd-555", "Condition %s failed", failed_msg);
78048 + }
78049 + read_unlock_tree(znode_get_tree(node));
78050 + spin_unlock_znode(node);
78051 + return result;
78052 +}
78053 +
78054 +/* return non-0 iff data are loaded into znode */
78055 +int znode_is_loaded(const znode * node /* znode to query */ )
78056 +{
78057 + assert("nikita-497", node != NULL);
78058 + return jnode_is_loaded(ZJNODE(node));
78059 +}
78060 +
78061 +unsigned long znode_times_locked(const znode * z)
78062 +{
78063 + return z->times_locked;
78064 +}
78065 +
78066 +#endif /* REISER4_DEBUG */
78067 +
78068 +/* Make Linus happy.
78069 + Local variables:
78070 + c-indentation-style: "K&R"
78071 + mode-name: "LC"
78072 + c-basic-offset: 8
78073 + tab-width: 8
78074 + fill-column: 120
78075 + End:
78076 +*/
78077 diff -urN linux-2.6.33.orig/fs/reiser4/znode.h linux-2.6.33/fs/reiser4/znode.h
78078 --- linux-2.6.33.orig/fs/reiser4/znode.h 1970-01-01 01:00:00.000000000 +0100
78079 +++ linux-2.6.33/fs/reiser4/znode.h 2010-03-04 19:33:22.000000000 +0100
78080 @@ -0,0 +1,433 @@
78081 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
78082 + * reiser4/README */
78083 +
78084 +/* Declaration of znode (Zam's node). See znode.c for more details. */
78085 +
78086 +#ifndef __ZNODE_H__
78087 +#define __ZNODE_H__
78088 +
78089 +#include "forward.h"
78090 +#include "debug.h"
78091 +#include "dformat.h"
78092 +#include "key.h"
78093 +#include "coord.h"
78094 +#include "plugin/node/node.h"
78095 +#include "jnode.h"
78096 +#include "lock.h"
78097 +#include "readahead.h"
78098 +
78099 +#include <linux/types.h>
78100 +#include <linux/spinlock.h>
78101 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
78102 +#include <asm/atomic.h>
78103 +
78104 +/* znode tracks its position within its parent (the internal item in the
78105 + * parent node that contains the znode's block number). */
78106 +typedef struct parent_coord {
78107 + znode *node;
78108 + pos_in_node_t item_pos;
78109 +} parent_coord_t;
78110 +
78111 +/* &znode - node in a reiser4 tree.
78112 +
78113 + NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
78114 + cacheline pressure.
78115 +
78116 + Locking:
78117 +
78118 +   Long term: data in a disk node attached to this znode are protected
78119 +   by the long term, deadlock aware lock ->lock;
78120 +
78121 + Spin lock: the following fields are protected by the spin lock:
78122 +
78123 + ->lock
78124 +
78125 + Following fields are protected by the global tree lock:
78126 +
78127 + ->left
78128 + ->right
78129 + ->in_parent
78130 + ->c_count
78131 +
78132 + Following fields are protected by the global delimiting key lock (dk_lock):
78133 +
78134 + ->ld_key (to update ->ld_key long-term lock on the node is also required)
78135 + ->rd_key
78136 +
78137 + Following fields are protected by the long term lock:
78138 +
78139 + ->nr_items
78140 +
78141 +   ->node_plugin is never changed once set. This means that once code has
78142 +   made sure the field is valid, it can be accessed without any additional
78143 +   locking.
78144 +
78145 + ->level is immutable.
78146 +
78147 + Invariants involving this data-type:
78148 +
78149 + [znode-fake]
78150 + [znode-level]
78151 + [znode-connected]
78152 + [znode-c_count]
78153 + [znode-refs]
78154 + [jnode-refs]
78155 + [jnode-queued]
78156 + [znode-modify]
78157 +
78158 +   For this to be made into a clustering or NUMA filesystem, we would want
78159 +   to eliminate all of the global locks. Suggestions for how to do that are desired. */
78160 +struct znode {
78161 + /* Embedded jnode. */
78162 + jnode zjnode;
78163 +
78164 +	/* contains two subfields, node and item_pos.
78165 +
78166 +	   item_pos is only a hint that is cached to speed up lookups during
78167 +	   balancing. It is not required to be up to date. Synched in
78168 +	   find_child_ptr().
78169 +
78170 +	   This value allows us to avoid expensive binary searches.
78171 +
78172 +	   in_parent.node points to the parent of this node, and is NOT a
78173 +	   hint.
78174 +	*/
78175 + parent_coord_t in_parent;
78176 +
78177 + /*
78178 + * sibling list pointers
78179 + */
78180 +
78181 + /* left-neighbor */
78182 + znode *left;
78183 + /* right-neighbor */
78184 + znode *right;
78185 +
78186 + /* long term lock on node content. This lock supports deadlock
78187 + detection. See lock.c
78188 + */
78189 + zlock lock;
78190 +
78191 +	/* You cannot remove from memory a node that has children in
78192 +	   memory. This is because we rely on the fact that the parent of a
78193 +	   given node can always be reached without blocking for I/O. When
78194 +	   reading a node into memory you must increase the c_count of its
78195 +	   parent; when removing it from memory you must decrease the
78196 +	   c_count. This makes the code simpler, and the cases where it is
78197 +	   suboptimal are truly obscure.
78198 +	*/
78199 + int c_count;
78200 +
78201 + /* plugin of node attached to this znode. NULL if znode is not
78202 + loaded. */
78203 + node_plugin *nplug;
78204 +
78205 + /* version of znode data. This is increased on each modification. This
78206 + * is necessary to implement seals (see seal.[ch]) efficiently. */
78207 + __u64 version;
78208 +
78209 + /* left delimiting key. Necessary to efficiently perform
78210 + balancing with node-level locking. Kept in memory only. */
78211 + reiser4_key ld_key;
78212 + /* right delimiting key. */
78213 + reiser4_key rd_key;
78214 +
78215 + /* znode's tree level */
78216 + __u16 level;
78217 + /* number of items in this node. This field is modified by node
78218 + * plugin. */
78219 + __u16 nr_items;
78220 +
78221 +#if REISER4_DEBUG
78222 + void *creator;
78223 + reiser4_key first_key;
78224 + unsigned long times_locked;
78225 + int left_version; /* when node->left was updated */
78226 + int right_version; /* when node->right was updated */
78227 + int ld_key_version; /* when node->ld_key was updated */
78228 + int rd_key_version; /* when node->rd_key was updated */
78229 +#endif
78230 +
78231 +} __attribute__ ((aligned(16)));
78232 +
78233 +ON_DEBUG(extern atomic_t delim_key_version;
78234 + )
78235 +
78236 +/* In general I think these macros should not be exposed. */
78237 +#define znode_is_locked(node) (lock_is_locked(&node->lock))
78238 +#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
78239 +#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
78240 +#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
78241 +#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
78242 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
78243 +/* Macros for accessing the znode state. */
78244 +#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
78245 +#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
78246 +#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
78247 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
78248 + znode * parent, tree_level level, gfp_t gfp_flag);
78249 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
78250 +extern int zload(znode * node);
78251 +extern int zload_ra(znode * node, ra_info_t * info);
78252 +extern int zinit_new(znode * node, gfp_t gfp_flags);
78253 +extern void zrelse(znode * node);
78254 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
78255 +
78256 +/* size of data in znode */
78257 +static inline unsigned
78258 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
78259 +{
78260 + assert("nikita-1416", node != NULL);
78261 + return PAGE_CACHE_SIZE;
78262 +}
78263 +
78264 +extern void parent_coord_to_coord(const parent_coord_t * pcoord,
78265 + coord_t * coord);
78266 +extern void coord_to_parent_coord(const coord_t * coord,
78267 + parent_coord_t * pcoord);
78268 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
78269 +
78270 +extern unsigned znode_free_space(znode * node);
78271 +
78272 +extern reiser4_key *znode_get_rd_key(znode * node);
78273 +extern reiser4_key *znode_get_ld_key(znode * node);
78274 +
78275 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
78276 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
78277 +
78278 +/* `connected' state checks */
78279 +static inline int znode_is_right_connected(const znode * node)
78280 +{
78281 + return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
78282 +}
78283 +
78284 +static inline int znode_is_left_connected(const znode * node)
78285 +{
78286 + return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
78287 +}
78288 +
78289 +static inline int znode_is_connected(const znode * node)
78290 +{
78291 + return znode_is_right_connected(node) && znode_is_left_connected(node);
78292 +}
78293 +
78294 +extern int znode_shift_order;
78295 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
78296 +extern void znode_remove(znode *, reiser4_tree *);
78297 +extern znode *znode_parent(const znode * node);
78298 +extern znode *znode_parent_nolock(const znode * node);
78299 +extern int znode_above_root(const znode * node);
78300 +extern int init_znodes(void);
78301 +extern void done_znodes(void);
78302 +extern int znodes_tree_init(reiser4_tree * ztree);
78303 +extern void znodes_tree_done(reiser4_tree * ztree);
78304 +extern int znode_contains_key(znode * node, const reiser4_key * key);
78305 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
78306 +extern unsigned znode_save_free_space(znode * node);
78307 +extern unsigned znode_recover_free_space(znode * node);
78308 +extern znode *zalloc(gfp_t gfp_flag);
78309 +extern void zinit(znode *, const znode * parent, reiser4_tree *);
78310 +extern int zparse(znode * node);
78311 +
78312 +extern int znode_just_created(const znode * node);
78313 +
78314 +extern void zfree(znode * node);
78315 +
78316 +#if REISER4_DEBUG
78317 +extern void print_znode(const char *prefix, const znode * node);
78318 +#else
78319 +#define print_znode( p, n ) noop
78320 +#endif
78321 +
78322 +/* Make it look like various znode functions exist instead of treating znodes as
78323 + jnodes in znode-specific code. */
78324 +#define znode_page(x) jnode_page ( ZJNODE(x) )
78325 +#define zdata(x) jdata ( ZJNODE(x) )
78326 +#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
78327 +#define znode_created(x) jnode_created ( ZJNODE(x) )
78328 +#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
78329 +#define znode_convertible(x) jnode_convertible (ZJNODE(x))
78330 +#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
78331 +
78332 +#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
78333 +#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
78334 +#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
78335 +#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
78336 +
78337 +#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
78338 +#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
78339 +#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
78340 +#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
78341 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
78342 +
78343 +#if REISER4_DEBUG
78344 +extern int znode_x_count_is_protected(const znode * node);
78345 +extern int znode_invariant(znode * node);
78346 +#endif
78347 +
78348 +/* acquire reference to @node */
78349 +static inline znode *zref(znode * node)
78350 +{
78351 + /* change of x_count from 0 to 1 is protected by tree spin-lock */
78352 + return JZNODE(jref(ZJNODE(node)));
78353 +}
78354 +
78355 +/* release reference to @node */
78356 +static inline void zput(znode * node)
78357 +{
78358 + assert("nikita-3564", znode_invariant(node));
78359 + jput(ZJNODE(node));
78360 +}
78361 +
78362 +/* get the level field for a znode */
78363 +static inline tree_level znode_get_level(const znode * node)
78364 +{
78365 + return node->level;
78366 +}
78367 +
78368 +/* get the level field for a jnode */
78369 +static inline tree_level jnode_get_level(const jnode * node)
78370 +{
78371 + if (jnode_is_znode(node))
78372 + return znode_get_level(JZNODE(node));
78373 + else
78374 + /* unformatted nodes are all at the LEAF_LEVEL and for
78375 + "semi-formatted" nodes like bitmaps, level doesn't matter. */
78376 + return LEAF_LEVEL;
78377 +}
78378 +
78379 +/* true if jnode is on leaf level */
78380 +static inline int jnode_is_leaf(const jnode * node)
78381 +{
78382 + if (jnode_is_znode(node))
78383 + return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
78384 + if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
78385 + return 1;
78386 + return 0;
78387 +}
78388 +
78389 +/* return znode's tree */
78390 +static inline reiser4_tree *znode_get_tree(const znode * node)
78391 +{
78392 + assert("nikita-2692", node != NULL);
78393 + return jnode_get_tree(ZJNODE(node));
78394 +}
78395 +
78396 +/* resolve race with zput */
78397 +static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
78398 +{
78399 + jnode *j;
78400 +
78401 + j = jnode_rip_sync(tree, ZJNODE(node));
78402 + if (likely(j != NULL))
78403 + node = JZNODE(j);
78404 + else
78405 + node = NULL;
78406 + return node;
78407 +}
78408 +
78409 +#if defined(REISER4_DEBUG)
78410 +int znode_is_loaded(const znode * node /* znode to query */ );
78411 +#endif
78412 +
78413 +extern __u64 znode_build_version(reiser4_tree * tree);
78414 +
78415 +/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
78416 +   must load the data for a node in many places. We could do this by simply calling
78417 +   zload() everywhere; the difficulty arises when we must release the loaded data by
78418 +   calling zrelse(). In a function with many possible error/return paths, it requires
78419 +   extra work to figure out which exit paths must call zrelse() and which must not.
78420 +   The data handle automatically calls zrelse() for every zload() that it is
78421 +   responsible for. In that sense, it acts much like a lock_handle.
78422 +*/
78423 +typedef struct load_count {
78424 + znode *node;
78425 + int d_ref;
78426 +} load_count;
78427 +
78428 +extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */
78429 +extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
78430 +extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
78431 +extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
78432 + * incr_load_count_znode, otherwise do nothing (unformatted nodes
78433 + * don't require zload/zrelse treatment). */
78434 +extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
78435 +extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
78436 +
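+/* Illustrative sketch, not part of the original reiser4 code: with a
+ * data handle, every exit path releases whatever was loaded, so no
+ * per-path zrelse() bookkeeping is needed. inspect() is a hypothetical
+ * helper that needs the node's data:
+ *
+ *	static int example(znode * node)
+ *	{
+ *		int result;
+ *		load_count dh;
+ *
+ *		init_load_count(&dh);
+ *		result = incr_load_count_znode(&dh, node);
+ *		if (result == 0)
+ *			result = inspect(node);
+ *		done_load_count(&dh);	// zrelse()-s iff zload() succeeded
+ *		return result;
+ *	}
+ */
+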
78437 +/* Variable initializers for load_count. */
78438 +#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
78439 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
78440 +/* A convenience macro for use in assertions or debug-only code, where loaded
78441 + data is only required to perform the debugging check. This macro
78442 + encapsulates an expression inside a pair of calls to zload()/zrelse(). */
78443 +#define WITH_DATA( node, exp ) \
78444 +({ \
78445 + long __with_dh_result; \
78446 + znode *__with_dh_node; \
78447 + \
78448 + __with_dh_node = ( node ); \
78449 + __with_dh_result = zload( __with_dh_node ); \
78450 + if( __with_dh_result == 0 ) { \
78451 + __with_dh_result = ( long )( exp ); \
78452 + zrelse( __with_dh_node ); \
78453 + } \
78454 + __with_dh_result; \
78455 +})
78456 +
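+/* Illustrative usage, not from the original source: evaluate a
+ * debug-only predicate that needs the node's data in memory, e.g.
+ *
+ *	assert("xxxx-9999", WITH_DATA(node, check_node(node) == 0));
+ *
+ * where check_node() is a hypothetical verifier. Note that if zload()
+ * itself fails, the macro yields zload()'s (nonzero) error code. */
+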
78457 +/* Same as above, but accepts a return value in case zload fails. */
78458 +#define WITH_DATA_RET( node, ret, exp ) \
78459 +({ \
78460 + int __with_dh_result; \
78461 + znode *__with_dh_node; \
78462 + \
78463 + __with_dh_node = ( node ); \
78464 + __with_dh_result = zload( __with_dh_node ); \
78465 + if( __with_dh_result == 0 ) { \
78466 + __with_dh_result = ( int )( exp ); \
78467 + zrelse( __with_dh_node ); \
78468 + } else \
78469 + __with_dh_result = ( ret ); \
78470 + __with_dh_result; \
78471 +})
78472 +
78473 +#define WITH_COORD(coord, exp) \
78474 +({ \
78475 + coord_t *__coord; \
78476 + \
78477 + __coord = (coord); \
78478 + coord_clear_iplug(__coord); \
78479 + WITH_DATA(__coord->node, exp); \
78480 +})
78481 +
78482 +#if REISER4_DEBUG
78483 +#define STORE_COUNTERS \
78484 + reiser4_lock_cnt_info __entry_counters = \
78485 + *reiser4_lock_counters()
78486 +#define CHECK_COUNTERS \
78487 +ON_DEBUG_CONTEXT( \
78488 +({ \
78489 + __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
78490 + __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
78491 + __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
78492 + assert("nikita-2159", \
78493 + !memcmp(&__entry_counters, reiser4_lock_counters(), \
78494 + sizeof __entry_counters)); \
78495 +}) )
78496 +
78497 +#else
78498 +#define STORE_COUNTERS
78499 +#define CHECK_COUNTERS noop
78500 +#endif
78501 +
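+/* Illustrative sketch, not part of the original reiser4 code: a typical
+ * use of the counter checks above. STORE_COUNTERS snapshots the
+ * per-context lock counters on entry; CHECK_COUNTERS verifies on exit
+ * that no locks leaked (x_refs/t_refs/d_refs are copied over first, so
+ * reference counts are deliberately excluded from the comparison).
+ * do_tree_work() is a hypothetical operation:
+ *
+ *	static int example_balanced_op(znode * node)
+ *	{
+ *		int result;
+ *
+ *		STORE_COUNTERS;
+ *		result = do_tree_work(node);
+ *		CHECK_COUNTERS;
+ *		return result;
+ *	}
+ */
+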
78502 +/* __ZNODE_H__ */
78503 +#endif
78504 +
78505 +/* Make Linus happy.
78506 + Local variables:
78507 + c-indentation-style: "K&R"
78508 + mode-name: "LC"
78509 + c-basic-offset: 8
78510 + tab-width: 8
78511 + fill-column: 120
78512 + End:
78513 +*/
78514 diff -urN linux-2.6.33.orig/include/linux/fs.h linux-2.6.33/include/linux/fs.h
78515 --- linux-2.6.33.orig/include/linux/fs.h 2010-02-24 19:52:17.000000000 +0100
78516 +++ linux-2.6.33/include/linux/fs.h 2010-03-04 19:33:22.000000000 +0100
78517 @@ -511,6 +511,7 @@
78518 struct page;
78519 struct address_space;
78520 struct writeback_control;
78521 +struct bdi_writeback;
78522
78523 struct iov_iter {
78524 const struct iovec *iov;
78525 @@ -1567,7 +1568,11 @@
78526 int (*remount_fs) (struct super_block *, int *, char *);
78527 void (*clear_inode) (struct inode *);
78528 void (*umount_begin) (struct super_block *);
78529 -
78530 + int (*writeback_inodes)(struct super_block *sb,
78531 + struct bdi_writeback *wb,
78532 + struct writeback_control *wbc);
78533 + void (*sync_inodes) (struct super_block *sb,
78534 + struct writeback_control *wbc);
78535 int (*show_options)(struct seq_file *, struct vfsmount *);
78536 int (*show_stats)(struct seq_file *, struct vfsmount *);
78537 #ifdef CONFIG_QUOTA
78538 @@ -2074,6 +2079,12 @@
78539 extern int invalidate_inode_pages2_range(struct address_space *mapping,
78540 pgoff_t start, pgoff_t end);
78541 extern int write_inode_now(struct inode *, int);
78542 +extern void writeback_skip_sb_inodes(struct super_block *sb,
78543 + struct bdi_writeback *wb);
78544 +extern void writeback_inodes_wbc(struct writeback_control *wbc);
78545 +extern int generic_writeback_sb_inodes(struct super_block *sb,
78546 + struct bdi_writeback *wb,
78547 + struct writeback_control *wbc);
78548 extern int filemap_fdatawrite(struct address_space *);
78549 extern int filemap_flush(struct address_space *);
78550 extern int filemap_fdatawait(struct address_space *);
78551 diff -urN linux-2.6.33.orig/include/linux/mm.h linux-2.6.33/include/linux/mm.h
78552 --- linux-2.6.33.orig/include/linux/mm.h 2010-02-24 19:52:17.000000000 +0100
78553 +++ linux-2.6.33/include/linux/mm.h 2010-03-04 19:33:22.000000000 +0100
78554 @@ -850,6 +850,7 @@
78555 void account_page_dirtied(struct page *page, struct address_space *mapping);
78556 int set_page_dirty(struct page *page);
78557 int set_page_dirty_lock(struct page *page);
78558 +int set_page_dirty_notag(struct page *page);
78559 int clear_page_dirty_for_io(struct page *page);
78560
78561 extern unsigned long move_page_tables(struct vm_area_struct *vma,
78562 diff -urN linux-2.6.33.orig/include/linux/writeback.h linux-2.6.33/include/linux/writeback.h
78563 --- linux-2.6.33.orig/include/linux/writeback.h 2010-02-24 19:52:17.000000000 +0100
78564 +++ linux-2.6.33/include/linux/writeback.h 2010-03-04 19:33:22.000000000 +0100
78565 @@ -13,6 +13,12 @@
78566 extern struct list_head inode_in_use;
78567 extern struct list_head inode_unused;
78568
78569 +static inline int is_flush_bd_task(struct task_struct *task)
78570 +{
78571 + return task->flags & PF_FLUSHER;
78572 +}
78573 +#define current_is_flush_bd_task() is_flush_bd_task(current)
78574 +
78575 /*
78576 * fs/fs-writeback.c
78577 */
78578 @@ -34,6 +40,9 @@
78579 enum writeback_sync_modes sync_mode;
78580 unsigned long *older_than_this; /* If !NULL, only write back inodes
78581 older than this */
78582 + unsigned long wb_start; /* Time writeback_inodes_wb was
78583 + called. This is needed to avoid
78584 + extra jobs and livelock */
78585 long nr_to_write; /* Write this many pages, and decrement
78586 this for each page written */
78587 long pages_skipped; /* Pages which were not written */
78588 diff -urN linux-2.6.33.orig/mm/filemap.c linux-2.6.33/mm/filemap.c
78589 --- linux-2.6.33.orig/mm/filemap.c 2010-02-24 19:52:17.000000000 +0100
78590 +++ linux-2.6.33/mm/filemap.c 2010-03-04 19:33:22.000000000 +0100
78591 @@ -139,6 +139,7 @@
78592 dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
78593 }
78594 }
78595 +EXPORT_SYMBOL(__remove_from_page_cache);
78596
78597 void remove_from_page_cache(struct page *page)
78598 {
78599 @@ -151,6 +152,7 @@
78600 spin_unlock_irq(&mapping->tree_lock);
78601 mem_cgroup_uncharge_cache_page(page);
78602 }
78603 +EXPORT_SYMBOL(remove_from_page_cache);
78604
78605 static int sync_page(void *word)
78606 {
78607 @@ -948,6 +950,7 @@
78608 {
78609 ra->ra_pages /= 4;
78610 }
78611 +EXPORT_SYMBOL(find_get_pages);
78612
78613 /**
78614 * do_generic_file_read - generic file read routine
78615 diff -urN linux-2.6.33.orig/mm/page-writeback.c linux-2.6.33/mm/page-writeback.c
78616 --- linux-2.6.33.orig/mm/page-writeback.c 2010-02-24 19:52:17.000000000 +0100
78617 +++ linux-2.6.33/mm/page-writeback.c 2010-03-04 19:33:22.000000000 +0100
78618 @@ -1130,6 +1130,32 @@
78619 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
78620
78621 /*
78622 + * set_page_dirty_notag() -- similar to __set_page_dirty_nobuffers()
78623 + * except it doesn't tag the page dirty in the page-cache radix tree.
78624 + * This means that the address space using this cannot use the regular
78625 + * filemap ->writepages() helpers and must provide its own means of
78626 + * tracking and finding non-tagged dirty pages.
78627 + *
78628 + * NOTE: furthermore, this version also doesn't handle truncate races.
78629 + */
78630 +int set_page_dirty_notag(struct page *page)
78631 +{
78632 + struct address_space *mapping = page->mapping;
78633 +
78634 + if (!TestSetPageDirty(page)) {
78635 + unsigned long flags;
78636 + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
78637 + local_irq_save(flags);
78638 + account_page_dirtied(page, mapping);
78639 + local_irq_restore(flags);
78640 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
78641 + return 1;
78642 + }
78643 + return 0;
78644 +}
78645 +EXPORT_SYMBOL(set_page_dirty_notag);
78646 +
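+/* Illustrative note, not part of the original patch: reiser4 is the
+ * intended user of this helper. A filesystem calling it dirties a
+ * locked page roughly as follows, tracking dirty pages through its own
+ * lists instead of the radix-tree dirty tag:
+ *
+ *	lock_page(page);
+ *	set_page_dirty_notag(page);
+ *	unlock_page(page);
+ */
+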
78647 +/*
78648 * When a writepage implementation decides that it doesn't want to write this
78649 * page for some reason, it should redirty the locked page via
78650 * redirty_page_for_writepage() and it should then unlock the page and return 0