]> git.ipfire.org Git - ipfire-2.x.git/blob - src/patches/reiser4-for-2.6.20.patch
Updated vlc to actual version
[ipfire-2.x.git] / src / patches / reiser4-for-2.6.20.patch
1 diff -urN linux-2.6.20.orig/arch/i386/lib/usercopy.c linux-2.6.20/arch/i386/lib/usercopy.c
2 --- linux-2.6.20.orig/arch/i386/lib/usercopy.c 2006-11-30 00:57:37.000000000 +0300
3 +++ linux-2.6.20/arch/i386/lib/usercopy.c 2007-05-06 14:50:43.658963226 +0400
4 @@ -812,6 +812,7 @@
5 #endif
6 return n;
7 }
8 +EXPORT_SYMBOL(__copy_from_user_ll_nocache);
9
10 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
11 unsigned long n)
12 @@ -827,6 +828,7 @@
13 #endif
14 return n;
15 }
16 +EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
17
18 /**
19 * copy_to_user: - Copy a block of data into user space.
20 diff -urN linux-2.6.20.orig/Documentation/Changes linux-2.6.20/Documentation/Changes
21 --- linux-2.6.20.orig/Documentation/Changes 2007-05-06 15:04:34.226399593 +0400
22 +++ linux-2.6.20/Documentation/Changes 2007-05-06 14:50:43.658963226 +0400
23 @@ -36,6 +36,7 @@
24 o e2fsprogs 1.29 # tune2fs
25 o jfsutils 1.1.3 # fsck.jfs -V
26 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
27 +o reiser4progs 1.0.0 # fsck.reiser4 -V
28 o xfsprogs 2.6.0 # xfs_db -V
29 o pcmciautils 004 # pccardctl -V
30 o quota-tools 3.09 # quota -V
31 @@ -144,6 +145,13 @@
32 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
33 reiserfsck. These utils work on both i386 and alpha platforms.
34
35 +Reiser4progs
36 +------------
37 +
38 +The reiser4progs package contains utilities for the reiser4 file system.
39 +Detailed instructions are provided in the README file located at:
40 +<ftp://ftp.namesys.com/pub/reiser4progs/README>.
41 +
42 Xfsprogs
43 --------
44
45 @@ -322,6 +330,10 @@
46 -------------
47 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
48
49 +Reiser4progs
50 +------------
51 +o <ftp://ftp.namesys.com/pub/reiser4progs/>
52 +
53 Xfsprogs
54 --------
55 o <ftp://oss.sgi.com/projects/xfs/download/>
56 diff -urN linux-2.6.20.orig/Documentation/filesystems/reiser4.txt linux-2.6.20/Documentation/filesystems/reiser4.txt
57 --- linux-2.6.20.orig/Documentation/filesystems/reiser4.txt 1970-01-01 03:00:00.000000000 +0300
58 +++ linux-2.6.20/Documentation/filesystems/reiser4.txt 2007-05-06 14:50:43.658963226 +0400
59 @@ -0,0 +1,75 @@
60 +Reiser4 filesystem
61 +==================
62 +Reiser4 is a file system based on dancing tree algorithms, and is
63 +described at http://www.namesys.com
64 +
65 +
66 +References
67 +==========
68 +web page http://namesys.com/v4/v4.html
69 +source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
70 +userland tools ftp://ftp.namesys.com/pub/reiser4progs/
71 +install page http://www.namesys.com/install_v4.html
72 +
73 +Compile options
74 +===============
75 +Enable reiser4 debug mode
76 + This checks everything imaginable while reiser4
77 + runs
78 +
79 +Mount options
80 +=============
81 +tmgr.atom_max_size=N
82 + Atoms containing more than N blocks will be forced to commit.
83 + N is decimal.
84 + Default is nr_free_pagecache_pages() / 2 at mount time.
85 +
86 +tmgr.atom_max_age=N
87 + Atoms older than N seconds will be forced to commit. N is decimal.
88 + Default is 600.
89 +
90 +tmgr.atom_max_flushers=N
91 + Limit of concurrent flushers for one atom. 0 means no limit.
92 + Default is 0.
93 +
94 +tree.cbk_cache.nr_slots=N
95 + Number of slots in the cbk cache.
96 +
97 +flush.relocate_threshold=N
98 + If flush finds more than N adjacent dirty leaf-level blocks it
99 + will force them to be relocated.
100 + Default is 64.
101 +
102 +flush.relocate_distance=N
103 +	If flush can find a block allocation closer than at most
104 + N from the preceder it will relocate to that position.
105 + Default is 64.
106 +
107 +flush.scan_maxnodes=N
108 + The maximum number of nodes to scan left on a level during
109 + flush.
110 + Default is 10000.
111 +
112 +optimal_io_size=N
113 + Preferred IO size. This value is used to set st_blksize of
114 + struct stat.
115 + Default is 65536.
116 +
117 +bsdgroups
118 + Turn on BSD-style gid assignment.
119 +
120 +32bittimes
121 + By default file in reiser4 have 64 bit timestamps. Files
122 + created when filesystem is mounted with 32bittimes mount
123 + option will get 32 bit timestamps.
124 +
125 +mtflush
126 + Turn off concurrent flushing.
127 +
128 +nopseudo
129 + Disable pseudo files support. See
130 + http://namesys.com/v4/pseudo.html for more about pseudo files.
131 +
132 +dont_load_bitmap
133 + Don't load all bitmap blocks at mount time, it is useful for
134 + machines with tiny RAM and large disks.
135 diff -urN linux-2.6.20.orig/fs/fs-writeback.c linux-2.6.20/fs/fs-writeback.c
136 --- linux-2.6.20.orig/fs/fs-writeback.c 2007-05-06 15:04:39.848155607 +0400
137 +++ linux-2.6.20/fs/fs-writeback.c 2007-05-06 14:50:43.662964476 +0400
138 @@ -296,8 +296,6 @@
139 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
140 * that it can be located for waiting on in __writeback_single_inode().
141 *
142 - * Called under inode_lock.
143 - *
144 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
145 * This function assumes that the blockdev superblock's inodes are backed by
146 * a variety of queues, so all inodes are searched. For other superblocks,
147 @@ -313,11 +311,13 @@
148 * on the writer throttling path, and we get decent balancing between many
149 * throttled threads: we don't want them all piling up on __wait_on_inode.
150 */
151 -static void
152 -sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
153 +void
154 +generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
155 {
156 const unsigned long start = jiffies; /* livelock avoidance */
157
158 + spin_lock(&inode_lock);
159 +
160 if (!wbc->for_kupdate || list_empty(&sb->s_io))
161 list_splice_init(&sb->s_dirty, &sb->s_io);
162
163 @@ -397,8 +397,19 @@
164 if (wbc->nr_to_write <= 0)
165 break;
166 }
167 + spin_unlock(&inode_lock);
168 return; /* Leave any unwritten inodes on s_io */
169 }
170 +EXPORT_SYMBOL(generic_sync_sb_inodes);
171 +
172 +static void
173 +sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
174 +{
175 + if (sb->s_op->sync_inodes)
176 + sb->s_op->sync_inodes(sb, wbc);
177 + else
178 + generic_sync_sb_inodes(sb, wbc);
179 +}
180
181 /*
182 * Start writeback of dirty pagecache data against all unlocked inodes.
183 @@ -439,11 +450,8 @@
184 * be unmounted by the time it is released.
185 */
186 if (down_read_trylock(&sb->s_umount)) {
187 - if (sb->s_root) {
188 - spin_lock(&inode_lock);
189 + if (sb->s_root)
190 sync_sb_inodes(sb, wbc);
191 - spin_unlock(&inode_lock);
192 - }
193 up_read(&sb->s_umount);
194 }
195 spin_lock(&sb_lock);
196 @@ -481,9 +489,7 @@
197 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
198 nr_dirty + nr_unstable;
199 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
200 - spin_lock(&inode_lock);
201 sync_sb_inodes(sb, &wbc);
202 - spin_unlock(&inode_lock);
203 }
204
205 /*
206 diff -urN linux-2.6.20.orig/fs/Kconfig linux-2.6.20/fs/Kconfig
207 --- linux-2.6.20.orig/fs/Kconfig 2007-05-06 15:04:39.668099364 +0400
208 +++ linux-2.6.20/fs/Kconfig 2007-05-06 14:50:43.662964476 +0400
209 @@ -272,6 +272,8 @@
210 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
211 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
212
213 +source "fs/reiser4/Kconfig"
214 +
215 config REISERFS_FS
216 tristate "Reiserfs support"
217 help
218 diff -urN linux-2.6.20.orig/fs/Makefile linux-2.6.20/fs/Makefile
219 --- linux-2.6.20.orig/fs/Makefile 2007-05-06 15:04:39.668099364 +0400
220 +++ linux-2.6.20/fs/Makefile 2007-05-06 14:50:43.666965726 +0400
221 @@ -62,6 +62,7 @@
222
223 # Do not add any filesystems before this line
224 obj-$(CONFIG_REISERFS_FS) += reiserfs/
225 +obj-$(CONFIG_REISER4_FS) += reiser4/
226 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
227 obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
228 obj-$(CONFIG_JBD) += jbd/
229 diff -urN linux-2.6.20.orig/fs/reiser4/as_ops.c linux-2.6.20/fs/reiser4/as_ops.c
230 --- linux-2.6.20.orig/fs/reiser4/as_ops.c 1970-01-01 03:00:00.000000000 +0300
231 +++ linux-2.6.20/fs/reiser4/as_ops.c 2007-05-06 14:50:43.666965726 +0400
232 @@ -0,0 +1,337 @@
233 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
234 +
235 +/* Interface to VFS. Reiser4 address_space_operations are defined here. */
236 +
237 +#include "forward.h"
238 +#include "debug.h"
239 +#include "dformat.h"
240 +#include "coord.h"
241 +#include "plugin/item/item.h"
242 +#include "plugin/file/file.h"
243 +#include "plugin/security/perm.h"
244 +#include "plugin/disk_format/disk_format.h"
245 +#include "plugin/plugin.h"
246 +#include "plugin/plugin_set.h"
247 +#include "plugin/object.h"
248 +#include "txnmgr.h"
249 +#include "jnode.h"
250 +#include "znode.h"
251 +#include "block_alloc.h"
252 +#include "tree.h"
253 +#include "vfs_ops.h"
254 +#include "inode.h"
255 +#include "page_cache.h"
256 +#include "ktxnmgrd.h"
257 +#include "super.h"
258 +#include "reiser4.h"
259 +#include "entd.h"
260 +
261 +#include <linux/profile.h>
262 +#include <linux/types.h>
263 +#include <linux/mount.h>
264 +#include <linux/vfs.h>
265 +#include <linux/mm.h>
266 +#include <linux/buffer_head.h>
267 +#include <linux/dcache.h>
268 +#include <linux/list.h>
269 +#include <linux/pagemap.h>
270 +#include <linux/slab.h>
271 +#include <linux/seq_file.h>
272 +#include <linux/init.h>
273 +#include <linux/module.h>
274 +#include <linux/writeback.h>
275 +#include <linux/backing-dev.h>
276 +#include <linux/quotaops.h>
277 +#include <linux/security.h>
278 +
279 +/* address space operations */
280 +
281 +/**
282 + * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
283 + * @page: page to be dirtied
284 + *
285 + * Operation of struct address_space_operations. This implementation is used by
286 + * unix and cryptcompress file plugins.
287 + *
288 + * This is called when reiser4 page gets dirtied outside of reiser4, for
289 + * example, when dirty bit is moved from pte to physical page.
290 + *
291 + * Tags page in the mapping's page tree with special tag so that it is possible
292 + * to do all the reiser4 specific work wrt dirty pages (jnode creation,
293 + * capturing by an atom) later because it can not be done in the contexts where
294 + * set_page_dirty is called.
295 + */
296 +int reiser4_set_page_dirty(struct page *page)
297 +{
298 + /* this page can be unformatted only */
299 + assert("vs-1734", (page->mapping &&
300 + page->mapping->host &&
301 + reiser4_get_super_fake(page->mapping->host->i_sb) !=
302 + page->mapping->host
303 + && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
304 + page->mapping->host
305 + && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
306 + page->mapping->host));
307 +
308 + if (!TestSetPageDirty(page)) {
309 + struct address_space *mapping = page->mapping;
310 +
311 + if (mapping) {
312 + write_lock_irq(&mapping->tree_lock);
313 +
314 + /* check for race with truncate */
315 + if (page->mapping) {
316 + assert("vs-1652", page->mapping == mapping);
317 + if (mapping_cap_account_dirty(mapping))
318 + inc_zone_page_state(page,
319 + NR_FILE_DIRTY);
320 + radix_tree_tag_set(&mapping->page_tree,
321 + page->index,
322 + PAGECACHE_TAG_REISER4_MOVED);
323 + }
324 + write_unlock_irq(&mapping->tree_lock);
325 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
326 + }
327 + }
328 + return 0;
329 +}
330 +
331 +/* ->invalidatepage method for reiser4 */
332 +
333 +/*
334 + * this is called for each truncated page from
335 + * truncate_inode_pages()->truncate_{complete,partial}_page().
336 + *
337 + * At the moment of call, page is under lock, and outstanding io (if any) has
338 + * completed.
339 + */
340 +
341 +/**
342 + * reiser4_invalidatepage
343 + * @page: page to invalidate
344 + * @offset: starting offset for partial invalidation
345 + *
346 + */
347 +void reiser4_invalidatepage(struct page *page, unsigned long offset)
348 +{
349 + int ret = 0;
350 + reiser4_context *ctx;
351 + struct inode *inode;
352 + jnode *node;
353 +
354 + /*
355 + * This is called to truncate file's page.
356 + *
357 + * Originally, reiser4 implemented truncate in a standard way
358 + * (vmtruncate() calls ->invalidatepage() on all truncated pages
359 + * first, then file system ->truncate() call-back is invoked).
360 + *
361 + * This lead to the problem when ->invalidatepage() was called on a
362 + * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
363 + * process. That is, truncate was bypassing transactions. To avoid
364 + * this, try_capture_page_to_invalidate() call was added here.
365 + *
366 + * After many troubles with vmtruncate() based truncate (including
367 + * races with flush, tail conversion, etc.) it was re-written in the
368 + * top-to-bottom style: items are killed in reiser4_cut_tree_object()
369 + * and pages belonging to extent are invalidated in kill_hook_extent().
370 + * So probably now additional call to capture is not needed here.
371 + */
372 +
373 + assert("nikita-3137", PageLocked(page));
374 + assert("nikita-3138", !PageWriteback(page));
375 + inode = page->mapping->host;
376 +
377 + /*
378 + * ->invalidatepage() should only be called for the unformatted
379 + * jnodes. Destruction of all other types of jnodes is performed
380 + * separately. But, during some corner cases (like handling errors
381 +	 * during mount) it is simpler to let ->invalidatepage be called on
382 + * them. Check for this, and do nothing.
383 + */
384 + if (reiser4_get_super_fake(inode->i_sb) == inode)
385 + return;
386 + if (reiser4_get_cc_fake(inode->i_sb) == inode)
387 + return;
388 + if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
389 + return;
390 + assert("vs-1426", PagePrivate(page));
391 + assert("vs-1427",
392 + page->mapping == jnode_get_mapping(jnode_by_page(page)));
393 + assert("", jprivate(page) != NULL);
394 + assert("", ergo(inode_file_plugin(inode) !=
395 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
396 + offset == 0));
397 +
398 + ctx = reiser4_init_context(inode->i_sb);
399 + if (IS_ERR(ctx))
400 + return;
401 +
402 + node = jprivate(page);
403 + spin_lock_jnode(node);
404 + if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
405 + (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
406 +		/* there is no need to capture */
407 + jref(node);
408 + JF_SET(node, JNODE_HEARD_BANSHEE);
409 + page_clear_jnode(page, node);
410 + reiser4_uncapture_jnode(node);
411 + unhash_unformatted_jnode(node);
412 + jput(node);
413 + reiser4_exit_context(ctx);
414 + return;
415 + }
416 + spin_unlock_jnode(node);
417 +
418 + /* capture page being truncated. */
419 + ret = try_capture_page_to_invalidate(page);
420 + if (ret != 0)
421 + warning("nikita-3141", "Cannot capture: %i", ret);
422 +
423 + if (offset == 0) {
424 + /* remove jnode from transaction and detach it from page. */
425 + jref(node);
426 + JF_SET(node, JNODE_HEARD_BANSHEE);
427 + /* page cannot be detached from jnode concurrently, because it
428 + * is locked */
429 + reiser4_uncapture_page(page);
430 +
431 + /* this detaches page from jnode, so that jdelete will not try
432 + * to lock page which is already locked */
433 + spin_lock_jnode(node);
434 + page_clear_jnode(page, node);
435 + spin_unlock_jnode(node);
436 + unhash_unformatted_jnode(node);
437 +
438 + jput(node);
439 + }
440 +
441 + reiser4_exit_context(ctx);
442 +}
443 +
444 +/* help function called from reiser4_releasepage(). It returns true if jnode
445 + * can be detached from its page and page released. */
446 +int jnode_is_releasable(jnode * node /* node to check */ )
447 +{
448 + assert("nikita-2781", node != NULL);
449 + assert_spin_locked(&(node->guard));
450 + assert_spin_locked(&(node->load));
451 +
452 +	/* if some thread is currently using the jnode page, the latter
453 +	 * cannot be detached */
454 + if (atomic_read(&node->d_count) != 0) {
455 + return 0;
456 + }
457 +
458 + assert("vs-1214", !jnode_is_loaded(node));
459 +
460 + /*
461 + * can only release page if real block number is assigned to it. Simple
462 + * check for ->atom wouldn't do, because it is possible for node to be
463 +	 * clean, not in atom yet, and still having fake block number. For
464 + * example, node just created in jinit_new().
465 + */
466 + if (reiser4_blocknr_is_fake(jnode_get_block(node)))
467 + return 0;
468 +
469 + /*
470 + * pages prepared for write can not be released anyway, so avoid
471 + * detaching jnode from the page
472 + */
473 + if (JF_ISSET(node, JNODE_WRITE_PREPARED))
474 + return 0;
475 +
476 + /*
477 + * dirty jnode cannot be released. It can however be submitted to disk
478 + * as part of early flushing, but only after getting flush-prepped.
479 + */
480 + if (JF_ISSET(node, JNODE_DIRTY))
481 + return 0;
482 +
483 + /* overwrite set is only written by log writer. */
484 + if (JF_ISSET(node, JNODE_OVRWR))
485 + return 0;
486 +
487 + /* jnode is already under writeback */
488 + if (JF_ISSET(node, JNODE_WRITEBACK))
489 + return 0;
490 +
491 + /* don't flush bitmaps or journal records */
492 + if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
493 + return 0;
494 +
495 + return 1;
496 +}
497 +
498 +/*
499 + * ->releasepage method for reiser4
500 + *
501 + * This is called by VM scanner when it comes across clean page. What we have
502 + * to do here is to check whether page can really be released (freed that is)
503 + * and if so, detach jnode from it and remove page from the page cache.
504 + *
505 + * Check for releasability is done by releasable() function.
506 + */
507 +int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
508 +{
509 + jnode *node;
510 +
511 + assert("nikita-2257", PagePrivate(page));
512 + assert("nikita-2259", PageLocked(page));
513 + assert("nikita-2892", !PageWriteback(page));
514 + assert("nikita-3019", reiser4_schedulable());
515 +
516 + /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
517 +	   is not clear what to do in this case. A lot of deadlocks seem to be
518 + possible. */
519 +
520 + node = jnode_by_page(page);
521 + assert("nikita-2258", node != NULL);
522 + assert("reiser4-4", page->mapping != NULL);
523 + assert("reiser4-5", page->mapping->host != NULL);
524 +
525 + if (PageDirty(page))
526 + return 0;
527 +
528 + /* extra page reference is used by reiser4 to protect
529 + * jnode<->page link from this ->releasepage(). */
530 + if (page_count(page) > 3)
531 + return 0;
532 +
533 + /* releasable() needs jnode lock, because it looks at the jnode fields
534 + * and we need jload_lock here to avoid races with jload(). */
535 + spin_lock_jnode(node);
536 + spin_lock(&(node->load));
537 + if (jnode_is_releasable(node)) {
538 + struct address_space *mapping;
539 +
540 + mapping = page->mapping;
541 + jref(node);
542 + /* there is no need to synchronize against
543 + * jnode_extent_write() here, because pages seen by
544 + * jnode_extent_write() are !releasable(). */
545 + page_clear_jnode(page, node);
546 + spin_unlock(&(node->load));
547 + spin_unlock_jnode(node);
548 +
549 + /* we are under memory pressure so release jnode also. */
550 + jput(node);
551 +
552 + return 1;
553 + } else {
554 + spin_unlock(&(node->load));
555 + spin_unlock_jnode(node);
556 + assert("nikita-3020", reiser4_schedulable());
557 + return 0;
558 + }
559 +}
560 +
561 +/* Make Linus happy.
562 + Local variables:
563 + c-indentation-style: "K&R"
564 + mode-name: "LC"
565 + c-basic-offset: 8
566 + tab-width: 8
567 + fill-column: 120
568 + End:
569 +*/
570 diff -urN linux-2.6.20.orig/fs/reiser4/block_alloc.c linux-2.6.20/fs/reiser4/block_alloc.c
571 --- linux-2.6.20.orig/fs/reiser4/block_alloc.c 1970-01-01 03:00:00.000000000 +0300
572 +++ linux-2.6.20/fs/reiser4/block_alloc.c 2007-05-06 14:50:43.682970725 +0400
573 @@ -0,0 +1,1137 @@
574 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
575 +
576 +#include "debug.h"
577 +#include "dformat.h"
578 +#include "plugin/plugin.h"
579 +#include "txnmgr.h"
580 +#include "znode.h"
581 +#include "block_alloc.h"
582 +#include "tree.h"
583 +#include "super.h"
584 +
585 +#include <linux/types.h> /* for __u?? */
586 +#include <linux/fs.h> /* for struct super_block */
587 +#include <linux/spinlock.h>
588 +
589 +/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
590 +
591 +/* We need to be able to reserve enough disk space to ensure that an atomic
592 + operation will have enough disk space to flush (see flush.c and
593 + http://namesys.com/v4/v4.html) and commit it once it is started.
594 +
595 + In our design a call for reserving disk space may fail but not an actual
596 + block allocation.
597 +
598 + All free blocks, already allocated blocks, and all kinds of reserved blocks
599 + are counted in different per-fs block counters.
600 +
601 + A reiser4 super block's set of block counters currently is:
602 +
603 + free -- free blocks,
604 + used -- already allocated blocks,
605 +
606 + grabbed -- initially reserved for performing an fs operation, those blocks
607 + are taken from free blocks, then grabbed disk space leaks from grabbed
608 + blocks counter to other counters like "fake allocated", "flush
609 + reserved", "used", the rest of not used grabbed space is returned to
610 + free space at the end of fs operation;
611 +
612 + fake allocated -- counts all nodes without real disk block numbers assigned,
613 + we have separate accounting for formatted and unformatted
614 + nodes (for easier debugging);
615 +
616 + flush reserved -- disk space needed for flushing and committing an atom.
617 + Each dirty already allocated block could be written as a
618 + part of atom's overwrite set or as a part of atom's
619 +			relocate set. In both cases one additional block is needed,
620 + it is used as a wandered block if we do overwrite or as a
621 + new location for a relocated block.
622 +
623 + In addition, blocks in some states are counted on per-thread and per-atom
624 + basis. A reiser4 context has a counter of blocks grabbed by this transaction
625 + and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
626 + of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
627 + blocks, which are reserved for flush processing and atom commit. */
628 +
629 +/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
630 + number of blocks to grab for most expensive case of balancing when the leaf
631 + node we insert new item to gets split and new leaf node is allocated.
632 +
633 + So, we need to grab blocks for
634 +
635 + 1) one block for possible dirtying the node we insert an item to. That block
636 + would be used for node relocation at flush time or for allocating of a
637 + wandered one, it depends what will be a result (what set, relocate or
638 + overwrite the node gets assigned to) of the node processing by the flush
639 + algorithm.
640 +
641 + 2) one block for either allocating a new node, or dirtying of right or left
642 + clean neighbor, only one case may happen.
643 +
644 + VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
645 + node, and creation of new node. have I forgotten something? email me.
646 +
647 + These grabbed blocks are counted in both reiser4 context "grabbed blocks"
648 + counter and in the fs-wide one (both ctx->grabbed_blocks and
649 + sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
650 + decremented by 2.
651 +
652 + Suppose both two blocks were spent for dirtying of an already allocated clean
653 + node (one block went from "grabbed" to "flush reserved") and for new block
654 + allocating (one block went from "grabbed" to "fake allocated formatted").
655 +
656 + Inserting of a child pointer to the parent node caused parent node to be
657 + split, the balancing code takes care about this grabbing necessary space
658 + immediately by calling reiser4_grab with BA_RESERVED flag set which means
659 + "can use the 5% reserved disk space".
660 +
661 + At this moment insertion completes and grabbed blocks (if they were not used)
662 + should be returned to the free space counter.
663 +
664 + However the atom life-cycle is not completed. The atom had one "flush
665 + reserved" block added by our insertion and the new fake allocated node is
666 + counted as a "fake allocated formatted" one. The atom has to be fully
667 + processed by flush before commit. Suppose that the flush moved the first,
668 + already allocated node to the atom's overwrite list, the new fake allocated
669 + node, obviously, went into the atom relocate set. The reiser4 flush
670 + allocates the new node using one unit from "fake allocated formatted"
671 + counter, the log writer uses one from "flush reserved" for wandered block
672 + allocation.
673 +
674 + And, it is not the end. When the wandered block is deallocated after the
675 + atom gets fully played (see wander.c for term description), the disk space
676 + occupied for it is returned to free blocks. */
677 +
678 +/* BLOCK NUMBERS */
679 +
680 +/* Any reiser4 node has a block number assigned to it. We use these numbers for
681 + indexing in hash tables, so if a block has not yet been assigned a location
682 + on disk we need to give it a temporary fake block number.
683 +
684 + Current implementation of reiser4 uses 64-bit integers for block numbers. We
685 + use highest bit in 64-bit block number to distinguish fake and real block
686 + numbers. So, only 63 bits may be used to addressing of real device
687 + blocks. That "fake" block numbers space is divided into subspaces of fake
688 + block numbers for data blocks and for shadow (working) bitmap blocks.
689 +
690 + Fake block numbers for data blocks are generated by a cyclic counter, which
691 + gets incremented after each real block allocation. We assume that it is
692 + impossible to overload this counter during one transaction life. */
693 +
694 +/* Initialize a blocknr hint. */
695 +void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
696 +{
697 + memset(hint, 0, sizeof(reiser4_blocknr_hint));
698 +}
699 +
700 +/* Release any resources of a blocknr hint. */
701 +void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
702 +{
703 + /* No resources should be freed in current blocknr_hint implementation. */
704 +}
705 +
706 +/* see above for explanation of fake block number. */
707 +/* Audited by: green(2002.06.11) */
708 +int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
709 +{
710 + /* The reason for not simply returning result of '&' operation is that
711 + while return value is (possibly 32bit) int, the reiser4_block_nr is
712 + at least 64 bits long, and high bit (which is the only possible
713 + non zero bit after the masking) would be stripped off */
714 + return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
715 +}
716 +
717 +/* Static functions for <reiser4 super block>/<reiser4 context> block counters
718 + arithmetic. Mostly, they are isolated to not to code same assertions in
719 + several places. */
720 +static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
721 +{
722 + BUG_ON(ctx->grabbed_blocks < count);
723 + assert("zam-527", ctx->grabbed_blocks >= count);
724 + ctx->grabbed_blocks -= count;
725 +}
726 +
727 +static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
728 +{
729 + ctx->grabbed_blocks += count;
730 +}
731 +
732 +static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
733 +{
734 + assert("zam-525", sbinfo->blocks_grabbed >= count);
735 + sbinfo->blocks_grabbed -= count;
736 +}
737 +
738 +/* Decrease the counter of block reserved for flush in super block. */
739 +static void
740 +sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
741 +{
742 + assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
743 + sbinfo->blocks_flush_reserved -= count;
744 +}
745 +
746 +static void
747 +sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
748 + reiser4_ba_flags_t flags)
749 +{
750 + if (flags & BA_FORMATTED) {
751 + assert("zam-806", sbinfo->blocks_fake_allocated >= count);
752 + sbinfo->blocks_fake_allocated -= count;
753 + } else {
754 + assert("zam-528",
755 + sbinfo->blocks_fake_allocated_unformatted >= count);
756 + sbinfo->blocks_fake_allocated_unformatted -= count;
757 + }
758 +}
759 +
760 +static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
761 +{
762 + assert("zam-530",
763 + sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
764 + sbinfo->blocks_used -= count;
765 +}
766 +
767 +static void
768 +sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
769 +{
770 + assert("edward-501", sbinfo->blocks_clustered >= count);
771 + sbinfo->blocks_clustered -= count;
772 +}
773 +
774 +/* Increase the counter of block reserved for flush in atom. */
775 +static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
776 +{
777 + assert("zam-772", atom != NULL);
778 + assert_spin_locked(&(atom->alock));
779 + atom->flush_reserved += count;
780 +}
781 +
782 +/* Decrease the counter of block reserved for flush in atom. */
783 +static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
784 +{
785 + assert("zam-774", atom != NULL);
786 + assert_spin_locked(&(atom->alock));
787 + assert("nikita-2790", atom->flush_reserved >= count);
788 + atom->flush_reserved -= count;
789 +}
790 +
791 +/* super block has 6 counters: free, used, grabbed, fake allocated
792 + (formatted and unformatted) and flush reserved. Their sum must be
793 + number of blocks on a device. This function checks this */
794 +int reiser4_check_block_counters(const struct super_block *super)
795 +{
796 + __u64 sum;
797 +
798 + sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
799 + reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
800 + reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
801 + reiser4_clustered_blocks(super);
802 + if (reiser4_block_count(super) != sum) {
803 + printk("super block counters: "
804 + "used %llu, free %llu, "
805 + "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
806 + "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
807 + (unsigned long long)reiser4_data_blocks(super),
808 + (unsigned long long)reiser4_free_blocks(super),
809 + (unsigned long long)reiser4_grabbed_blocks(super),
810 + (unsigned long long)reiser4_fake_allocated(super),
811 + (unsigned long long)
812 + reiser4_fake_allocated_unformatted(super),
813 + (unsigned long long)reiser4_flush_reserved(super),
814 + (unsigned long long)reiser4_clustered_blocks(super),
815 + (unsigned long long)sum,
816 + (unsigned long long)reiser4_block_count(super));
817 + return 0;
818 + }
819 + return 1;
820 +}
821 +
822 +/* Adjust "working" free blocks counter for number of blocks we are going to
823 + allocate. Record number of grabbed blocks in fs-wide and per-thread
824 + counters. This function should be called before bitmap scanning or
825 + allocating fake block numbers
826 +
827 + @super -- pointer to reiser4 super block;
828 + @count -- number of blocks we reserve;
829 +
830 + @return -- 0 if success, -ENOSPC, if all
831 + free blocks are preserved or already allocated.
832 +*/
833 +
834 +static int
835 +reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
836 +{
837 + __u64 free_blocks;
838 + int ret = 0, use_reserved = flags & BA_RESERVED;
839 + reiser4_super_info_data *sbinfo;
840 +
841 + assert("vs-1276", ctx == get_current_context());
842 +
843 + /* Do not grab anything on ro-mounted fs. */
844 + if (rofs_super(ctx->super)) {
845 + ctx->grab_enabled = 0;
846 + return 0;
847 + }
848 +
849 + sbinfo = get_super_private(ctx->super);
850 +
851 + spin_lock_reiser4_super(sbinfo);
852 +
853 + free_blocks = sbinfo->blocks_free;
854 +
855 + if ((use_reserved && free_blocks < count) ||
856 + (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
857 + ret = RETERR(-ENOSPC);
858 + goto unlock_and_ret;
859 + }
860 +
861 + add_to_ctx_grabbed(ctx, count);
862 +
863 + sbinfo->blocks_grabbed += count;
864 + sbinfo->blocks_free -= count;
865 +
866 +#if REISER4_DEBUG
867 + if (ctx->grabbed_initially == 0)
868 + ctx->grabbed_initially = count;
869 +#endif
870 +
871 + assert("nikita-2986", reiser4_check_block_counters(ctx->super));
872 +
873 + /* disable grab space in current context */
874 + ctx->grab_enabled = 0;
875 +
876 + unlock_and_ret:
877 + spin_unlock_reiser4_super(sbinfo);
878 +
879 + return ret;
880 +}
881 +
882 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
883 +{
884 + int ret;
885 + reiser4_context *ctx;
886 +
887 + assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
888 + lock_stack_isclean(get_current_lock_stack
889 + ())));
890 + ctx = get_current_context();
891 + if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
892 + return 0;
893 + }
894 +
895 + ret = reiser4_grab(ctx, count, flags);
896 + if (ret == -ENOSPC) {
897 +
898 +		/* Try to commit all transactions if the BA_CAN_COMMIT flag is present */
899 + if (flags & BA_CAN_COMMIT) {
900 + txnmgr_force_commit_all(ctx->super, 0);
901 + ctx->grab_enabled = 1;
902 + ret = reiser4_grab(ctx, count, flags);
903 + }
904 + }
905 + /*
906 + * allocation from reserved pool cannot fail. This is severe error.
907 + */
908 + assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
909 + return ret;
910 +}
911 +
912 +/*
913 + * SPACE RESERVED FOR UNLINK/TRUNCATE
914 + *
915 + * Unlink and truncate require space in transaction (to update stat data, at
916 + * least). But we don't want rm(1) to fail with "No space on device" error.
917 + *
918 + * Solution is to reserve 5% of disk space for truncates and
919 + * unlinks. Specifically, normal space grabbing requests don't grab space from
920 + * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
921 + * drain it. Per super block delete mutex is used to allow only one
922 + * thread at a time to grab from reserved area.
923 + *
924 + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
925 + * flag.
926 + *
927 + */
928 +
929 +int reiser4_grab_reserved(struct super_block *super,
930 + __u64 count, reiser4_ba_flags_t flags)
931 +{
932 + reiser4_super_info_data *sbinfo = get_super_private(super);
933 +
934 + assert("nikita-3175", flags & BA_CAN_COMMIT);
935 +
936 +	/* Check whether the delete mutex is already taken by us; we assume
937 +	 * that reading of a machine word is atomic. */
938 + if (sbinfo->delete_mutex_owner == current) {
939 + if (reiser4_grab_space
940 + (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
941 + warning("zam-1003",
942 + "nested call of grab_reserved fails count=(%llu)",
943 + (unsigned long long)count);
944 + reiser4_release_reserved(super);
945 + return RETERR(-ENOSPC);
946 + }
947 + return 0;
948 + }
949 +
950 + if (reiser4_grab_space(count, flags)) {
951 + mutex_lock(&sbinfo->delete_mutex);
952 + assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
953 + sbinfo->delete_mutex_owner = current;
954 +
955 + if (reiser4_grab_space(count, flags | BA_RESERVED)) {
956 + warning("zam-833",
957 + "reserved space is not enough (%llu)",
958 + (unsigned long long)count);
959 + reiser4_release_reserved(super);
960 + return RETERR(-ENOSPC);
961 + }
962 + }
963 + return 0;
964 +}
965 +
966 +void reiser4_release_reserved(struct super_block *super)
967 +{
968 + reiser4_super_info_data *info;
969 +
970 + info = get_super_private(super);
971 + if (info->delete_mutex_owner == current) {
972 + info->delete_mutex_owner = NULL;
973 + mutex_unlock(&info->delete_mutex);
974 + }
975 +}
976 +
977 +static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
978 +{
979 + reiser4_context *ctx;
980 + reiser4_super_info_data *sbinfo;
981 +
982 + ctx = get_current_context();
983 + sub_from_ctx_grabbed(ctx, count);
984 +
985 + sbinfo = get_super_private(ctx->super);
986 + spin_lock_reiser4_super(sbinfo);
987 +
988 + sub_from_sb_grabbed(sbinfo, count);
989 + /* return sbinfo locked */
990 + return sbinfo;
991 +}
992 +
993 +/* is called after @count fake block numbers are allocated and pointers to
994 +   those blocks are inserted into the tree. */
995 +static void grabbed2fake_allocated_formatted(void)
996 +{
997 + reiser4_super_info_data *sbinfo;
998 +
999 + sbinfo = grabbed2fake_allocated_head(1);
1000 + sbinfo->blocks_fake_allocated++;
1001 +
1002 + assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
1003 +
1004 + spin_unlock_reiser4_super(sbinfo);
1005 +}
1006 +
1007 +/**
1008 + * grabbed2fake_allocated_unformatted
1009 + * @count:
1010 + *
1011 + */
1012 +static void grabbed2fake_allocated_unformatted(int count)
1013 +{
1014 + reiser4_super_info_data *sbinfo;
1015 +
1016 + sbinfo = grabbed2fake_allocated_head(count);
1017 + sbinfo->blocks_fake_allocated_unformatted += count;
1018 +
1019 + assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
1020 +
1021 + spin_unlock_reiser4_super(sbinfo);
1022 +}
1023 +
1024 +void grabbed2cluster_reserved(int count)
1025 +{
1026 + reiser4_context *ctx;
1027 + reiser4_super_info_data *sbinfo;
1028 +
1029 + ctx = get_current_context();
1030 + sub_from_ctx_grabbed(ctx, count);
1031 +
1032 + sbinfo = get_super_private(ctx->super);
1033 + spin_lock_reiser4_super(sbinfo);
1034 +
1035 + sub_from_sb_grabbed(sbinfo, count);
1036 + sbinfo->blocks_clustered += count;
1037 +
1038 + assert("edward-504", reiser4_check_block_counters(ctx->super));
1039 +
1040 + spin_unlock_reiser4_super(sbinfo);
1041 +}
1042 +
1043 +void cluster_reserved2grabbed(int count)
1044 +{
1045 + reiser4_context *ctx;
1046 + reiser4_super_info_data *sbinfo;
1047 +
1048 + ctx = get_current_context();
1049 +
1050 + sbinfo = get_super_private(ctx->super);
1051 + spin_lock_reiser4_super(sbinfo);
1052 +
1053 + sub_from_cluster_reserved(sbinfo, count);
1054 + sbinfo->blocks_grabbed += count;
1055 +
1056 + assert("edward-505", reiser4_check_block_counters(ctx->super));
1057 +
1058 + spin_unlock_reiser4_super(sbinfo);
1059 + add_to_ctx_grabbed(ctx, count);
1060 +}
1061 +
1062 +void cluster_reserved2free(int count)
1063 +{
1064 + reiser4_context *ctx;
1065 + reiser4_super_info_data *sbinfo;
1066 +
1067 + ctx = get_current_context();
1068 + sbinfo = get_super_private(ctx->super);
1069 +
1070 + cluster_reserved2grabbed(count);
1071 + grabbed2free(ctx, sbinfo, count);
1072 +}
1073 +
1074 +static DEFINE_SPINLOCK(fake_lock);
1075 +static reiser4_block_nr fake_gen = 0;
1076 +
1077 +/**
1078 + * assign_fake_blocknr
1079 + * @blocknr:
1080 + * @count:
1081 + *
1082 + * Obtain a fake block number for new node which will be used to refer to
1083 + * this newly allocated node until real allocation is done.
1084 + */
1085 +static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1086 +{
1087 + spin_lock(&fake_lock);
1088 + *blocknr = fake_gen;
1089 + fake_gen += count;
1090 + spin_unlock(&fake_lock);
1091 +
1092 + BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1093 + /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1094 + *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1095 + assert("zam-394", zlook(current_tree, blocknr) == NULL);
1096 +}
1097 +
1098 +int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1099 +{
1100 + assign_fake_blocknr(blocknr, 1);
1101 + grabbed2fake_allocated_formatted();
1102 + return 0;
1103 +}
1104 +
1105 +/**
1106 + * fake_blocknrs_unformatted
1107 + * @count: number of fake numbers to get
1108 + *
1109 + * Allocates @count fake block numbers which will be assigned to jnodes
1110 + */
1111 +reiser4_block_nr fake_blocknr_unformatted(int count)
1112 +{
1113 + reiser4_block_nr blocknr;
1114 +
1115 + assign_fake_blocknr(&blocknr, count);
1116 + grabbed2fake_allocated_unformatted(count);
1117 +
1118 + return blocknr;
1119 +}
1120 +
1121 +/* adjust sb block counters, if real (on-disk) block allocation immediately
1122 + follows grabbing of free disk space. */
1123 +static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1124 + __u64 count)
1125 +{
1126 + sub_from_ctx_grabbed(ctx, count);
1127 +
1128 + spin_lock_reiser4_super(sbinfo);
1129 +
1130 + sub_from_sb_grabbed(sbinfo, count);
1131 + sbinfo->blocks_used += count;
1132 +
1133 + assert("nikita-2679", reiser4_check_block_counters(ctx->super));
1134 +
1135 + spin_unlock_reiser4_super(sbinfo);
1136 +}
1137 +
1138 +/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1139 +static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1140 + reiser4_ba_flags_t flags)
1141 +{
1142 + spin_lock_reiser4_super(sbinfo);
1143 +
1144 + sub_from_sb_fake_allocated(sbinfo, count, flags);
1145 + sbinfo->blocks_used += count;
1146 +
1147 + assert("nikita-2680",
1148 + reiser4_check_block_counters(reiser4_get_current_sb()));
1149 +
1150 + spin_unlock_reiser4_super(sbinfo);
1151 +}
1152 +
1153 +static void flush_reserved2used(txn_atom * atom, __u64 count)
1154 +{
1155 + reiser4_super_info_data *sbinfo;
1156 +
1157 + assert("zam-787", atom != NULL);
1158 + assert_spin_locked(&(atom->alock));
1159 +
1160 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1161 +
1162 + sbinfo = get_current_super_private();
1163 + spin_lock_reiser4_super(sbinfo);
1164 +
1165 + sub_from_sb_flush_reserved(sbinfo, count);
1166 + sbinfo->blocks_used += count;
1167 +
1168 + assert("zam-789",
1169 + reiser4_check_block_counters(reiser4_get_current_sb()));
1170 +
1171 + spin_unlock_reiser4_super(sbinfo);
1172 +}
1173 +
1174 +/* update the per fs blocknr hint default value. */
1175 +void
1176 +update_blocknr_hint_default(const struct super_block *s,
1177 + const reiser4_block_nr * block)
1178 +{
1179 + reiser4_super_info_data *sbinfo = get_super_private(s);
1180 +
1181 + assert("nikita-3342", !reiser4_blocknr_is_fake(block));
1182 +
1183 + spin_lock_reiser4_super(sbinfo);
1184 + if (*block < sbinfo->block_count) {
1185 + sbinfo->blocknr_hint_default = *block;
1186 + } else {
1187 + warning("zam-676",
1188 + "block number %llu is too large to be used in a blocknr hint\n",
1189 + (unsigned long long)*block);
1190 + dump_stack();
1191 + DEBUGON(1);
1192 + }
1193 + spin_unlock_reiser4_super(sbinfo);
1194 +}
1195 +
1196 +/* get current value of the default blocknr hint. */
1197 +void get_blocknr_hint_default(reiser4_block_nr * result)
1198 +{
1199 + reiser4_super_info_data *sbinfo = get_current_super_private();
1200 +
1201 + spin_lock_reiser4_super(sbinfo);
1202 + *result = sbinfo->blocknr_hint_default;
1203 + assert("zam-677", *result < sbinfo->block_count);
1204 + spin_unlock_reiser4_super(sbinfo);
1205 +}
1206 +
1207 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
1208 + * method. Blocks are allocated in one contiguous disk region. The plugin
1209 + * independent part accounts blocks by subtracting allocated amount from grabbed
1210 + * or fake block counter and add the same amount to the counter of allocated
1211 + * blocks.
1212 + *
1213 + * @hint -- a reiser4 blocknr hint object which contains further block
1214 + * allocation hints and parameters (search start, a stage of block
1215 + * which will be mapped to disk, etc.),
1216 + * @blk -- an out parameter for the beginning of the allocated region,
1217 + * @len -- in/out parameter, it should contain the maximum number of allocated
1218 + * blocks, after block allocation completes, it contains the length of
1219 + * allocated disk region.
1220 + * @flags -- see reiser4_ba_flags_t description.
1221 + *
1222 + * @return -- 0 if success, error code otherwise.
1223 + */
1224 +int
1225 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1226 + reiser4_block_nr * len, reiser4_ba_flags_t flags)
1227 +{
1228 + __u64 needed = *len;
1229 + reiser4_context *ctx;
1230 + reiser4_super_info_data *sbinfo;
1231 + int ret;
1232 +
1233 + assert("zam-986", hint != NULL);
1234 +
1235 + ctx = get_current_context();
1236 + sbinfo = get_super_private(ctx->super);
1237 +
1238 + /* For write-optimized data we use default search start value, which is
1239 + * close to last write location. */
1240 + if (flags & BA_USE_DEFAULT_SEARCH_START) {
1241 + get_blocknr_hint_default(&hint->blk);
1242 + }
1243 +
1244 + /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1245 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1246 + if (hint->block_stage == BLOCK_NOT_COUNTED) {
1247 + ret = reiser4_grab_space_force(*len, flags);
1248 + if (ret != 0)
1249 + return ret;
1250 + }
1251 +
1252 + ret =
1253 + sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
1254 + hint, (int)needed, blk, len);
1255 +
1256 + if (!ret) {
1257 + assert("zam-680", *blk < reiser4_block_count(ctx->super));
1258 + assert("zam-681",
1259 + *blk + *len <= reiser4_block_count(ctx->super));
1260 +
1261 + if (flags & BA_PERMANENT) {
1262 + /* we assume that current atom exists at this moment */
1263 + txn_atom *atom = get_current_atom_locked();
1264 + atom->nr_blocks_allocated += *len;
1265 + spin_unlock_atom(atom);
1266 + }
1267 +
1268 + switch (hint->block_stage) {
1269 + case BLOCK_NOT_COUNTED:
1270 + case BLOCK_GRABBED:
1271 + grabbed2used(ctx, sbinfo, *len);
1272 + break;
1273 + case BLOCK_UNALLOCATED:
1274 + fake_allocated2used(sbinfo, *len, flags);
1275 + break;
1276 + case BLOCK_FLUSH_RESERVED:
1277 + {
1278 + txn_atom *atom = get_current_atom_locked();
1279 + flush_reserved2used(atom, *len);
1280 + spin_unlock_atom(atom);
1281 + }
1282 + break;
1283 + default:
1284 + impossible("zam-531", "wrong block stage");
1285 + }
1286 + } else {
1287 + assert("zam-821",
1288 + ergo(hint->max_dist == 0
1289 + && !hint->backward, ret != -ENOSPC));
1290 + if (hint->block_stage == BLOCK_NOT_COUNTED)
1291 + grabbed2free(ctx, sbinfo, needed);
1292 + }
1293 +
1294 + return ret;
1295 +}
1296 +
1297 +/* used -> fake_allocated -> grabbed -> free */
1298 +
1299 +/* adjust sb block counters when @count unallocated blocks get unmapped from
1300 + disk */
1301 +static void
1302 +used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1303 + int formatted)
1304 +{
1305 + spin_lock_reiser4_super(sbinfo);
1306 +
1307 + if (formatted)
1308 + sbinfo->blocks_fake_allocated += count;
1309 + else
1310 + sbinfo->blocks_fake_allocated_unformatted += count;
1311 +
1312 + sub_from_sb_used(sbinfo, count);
1313 +
1314 + assert("nikita-2681",
1315 + reiser4_check_block_counters(reiser4_get_current_sb()));
1316 +
1317 + spin_unlock_reiser4_super(sbinfo);
1318 +}
1319 +
1320 +static void
1321 +used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1322 + __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1323 +{
1324 + assert("nikita-2791", atom != NULL);
1325 + assert_spin_locked(&(atom->alock));
1326 +
1327 + add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1328 +
1329 + spin_lock_reiser4_super(sbinfo);
1330 +
1331 + sbinfo->blocks_flush_reserved += count;
1332 + /*add_to_sb_flush_reserved(sbinfo, count); */
1333 + sub_from_sb_used(sbinfo, count);
1334 +
1335 + assert("nikita-2681",
1336 + reiser4_check_block_counters(reiser4_get_current_sb()));
1337 +
1338 + spin_unlock_reiser4_super(sbinfo);
1339 +}
1340 +
1341 +/* disk space, virtually used by fake block numbers is counted as "grabbed" again. */
1342 +static void
1343 +fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1344 + __u64 count, reiser4_ba_flags_t flags)
1345 +{
1346 + add_to_ctx_grabbed(ctx, count);
1347 +
1348 + spin_lock_reiser4_super(sbinfo);
1349 +
1350 + assert("nikita-2682", reiser4_check_block_counters(ctx->super));
1351 +
1352 + sbinfo->blocks_grabbed += count;
1353 + sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1354 +
1355 + assert("nikita-2683", reiser4_check_block_counters(ctx->super));
1356 +
1357 + spin_unlock_reiser4_super(sbinfo);
1358 +}
1359 +
1360 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1361 +{
1362 + reiser4_context *ctx;
1363 + reiser4_super_info_data *sbinfo;
1364 +
1365 + ctx = get_current_context();
1366 + sbinfo = get_super_private(ctx->super);
1367 +
1368 + fake_allocated2grabbed(ctx, sbinfo, count, flags);
1369 + grabbed2free(ctx, sbinfo, count);
1370 +}
1371 +
1372 +void grabbed2free_mark(__u64 mark)
1373 +{
1374 + reiser4_context *ctx;
1375 + reiser4_super_info_data *sbinfo;
1376 +
1377 + ctx = get_current_context();
1378 + sbinfo = get_super_private(ctx->super);
1379 +
1380 + assert("nikita-3007", (__s64) mark >= 0);
1381 + assert("nikita-3006", ctx->grabbed_blocks >= mark);
1382 + grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1383 +}
1384 +
1385 +/**
1386 + * grabbed2free - adjust grabbed and free block counters
1387 + * @ctx: context to update grabbed block counter of
1388 + * @sbinfo: super block to update grabbed and free block counters of
1389 + * @count: number of blocks to adjust counters by
1390 + *
1391 + * Decreases context's and per filesystem's counters of grabbed
1392 + * blocks. Increases per filesystem's counter of free blocks.
1393 + */
1394 +void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1395 + __u64 count)
1396 +{
1397 + sub_from_ctx_grabbed(ctx, count);
1398 +
1399 + spin_lock_reiser4_super(sbinfo);
1400 +
1401 + sub_from_sb_grabbed(sbinfo, count);
1402 + sbinfo->blocks_free += count;
1403 + assert("nikita-2684", reiser4_check_block_counters(ctx->super));
1404 +
1405 + spin_unlock_reiser4_super(sbinfo);
1406 +}
1407 +
1408 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1409 +{
1410 + reiser4_context *ctx;
1411 + reiser4_super_info_data *sbinfo;
1412 +
1413 + assert("vs-1095", atom);
1414 +
1415 + ctx = get_current_context();
1416 + sbinfo = get_super_private(ctx->super);
1417 +
1418 + sub_from_ctx_grabbed(ctx, count);
1419 +
1420 + add_to_atom_flush_reserved_nolock(atom, count);
1421 +
1422 + spin_lock_reiser4_super(sbinfo);
1423 +
1424 + sbinfo->blocks_flush_reserved += count;
1425 + sub_from_sb_grabbed(sbinfo, count);
1426 +
1427 + assert("vpf-292", reiser4_check_block_counters(ctx->super));
1428 +
1429 + spin_unlock_reiser4_super(sbinfo);
1430 +}
1431 +
1432 +void grabbed2flush_reserved(__u64 count)
1433 +{
1434 + txn_atom *atom = get_current_atom_locked();
1435 +
1436 + grabbed2flush_reserved_nolock(atom, count);
1437 +
1438 + spin_unlock_atom(atom);
1439 +}
1440 +
1441 +void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1442 +{
1443 + reiser4_context *ctx;
1444 + reiser4_super_info_data *sbinfo;
1445 +
1446 + assert("nikita-2788", atom != NULL);
1447 + assert_spin_locked(&(atom->alock));
1448 +
1449 + ctx = get_current_context();
1450 + sbinfo = get_super_private(ctx->super);
1451 +
1452 + add_to_ctx_grabbed(ctx, count);
1453 +
1454 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1455 +
1456 + spin_lock_reiser4_super(sbinfo);
1457 +
1458 + sbinfo->blocks_grabbed += count;
1459 + sub_from_sb_flush_reserved(sbinfo, count);
1460 +
1461 + assert("vpf-292", reiser4_check_block_counters(ctx->super));
1462 +
1463 + spin_unlock_reiser4_super(sbinfo);
1464 +}
1465 +
1466 +/**
1467 + * all_grabbed2free - releases all blocks grabbed in context
1468 + *
1469 + * Decreases context's and super block's grabbed block counters by number of
1470 + * blocks grabbed by current context and increases super block's free block
1471 + * counter correspondingly.
1472 + */
1473 +void all_grabbed2free(void)
1474 +{
1475 + reiser4_context *ctx = get_current_context();
1476 +
1477 + grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1478 +}
1479 +
1480 +/* adjust sb block counters if real (on-disk) blocks do not become unallocated
1481 + after freeing, @count blocks become "grabbed". */
1482 +static void
1483 +used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1484 + __u64 count)
1485 +{
1486 + add_to_ctx_grabbed(ctx, count);
1487 +
1488 + spin_lock_reiser4_super(sbinfo);
1489 +
1490 + sbinfo->blocks_grabbed += count;
1491 + sub_from_sb_used(sbinfo, count);
1492 +
1493 + assert("nikita-2685", reiser4_check_block_counters(ctx->super));
1494 +
1495 + spin_unlock_reiser4_super(sbinfo);
1496 +}
1497 +
1498 +/* this used to be done through used2grabbed and grabbed2free*/
1499 +static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1500 +{
1501 + spin_lock_reiser4_super(sbinfo);
1502 +
1503 + sbinfo->blocks_free += count;
1504 + sub_from_sb_used(sbinfo, count);
1505 +
1506 + assert("nikita-2685",
1507 + reiser4_check_block_counters(reiser4_get_current_sb()));
1508 +
1509 + spin_unlock_reiser4_super(sbinfo);
1510 +}
1511 +
1512 +#if REISER4_DEBUG
1513 +
1514 +/* check "allocated" state of given block range */
1515 +static void
1516 +reiser4_check_blocks(const reiser4_block_nr * start,
1517 + const reiser4_block_nr * len, int desired)
1518 +{
1519 + sa_check_blocks(start, len, desired);
1520 +}
1521 +
1522 +/* check "allocated" state of given block */
1523 +void reiser4_check_block(const reiser4_block_nr * block, int desired)
1524 +{
1525 + const reiser4_block_nr one = 1;
1526 +
1527 + reiser4_check_blocks(block, &one, desired);
1528 +}
1529 +
1530 +#endif
1531 +
1532 +/* Blocks deallocation function may do an actual deallocation through space
1533 + plugin allocation or store deleted block numbers in atom's delete_set data
1534 + structure depend on @defer parameter. */
1535 +
1536 +/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
1537 + will be deleted from WORKING bitmap. They might be just unmapped from disk, or
1538 + freed but disk space is still grabbed by current thread, or these blocks must
1539 + not be counted in any reiser4 sb block counters, see block_stage_t comment */
1540 +
1541 +/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
1542 + distinguish blocks allocated for unformatted and formatted nodes */
1543 +
1544 +int
1545 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
1546 + const reiser4_block_nr * len,
1547 + block_stage_t target_stage, reiser4_ba_flags_t flags)
1548 +{
1549 + txn_atom *atom = NULL;
1550 + int ret;
1551 + reiser4_context *ctx;
1552 + reiser4_super_info_data *sbinfo;
1553 +
1554 + ctx = get_current_context();
1555 + sbinfo = get_super_private(ctx->super);
1556 +
1557 + if (REISER4_DEBUG) {
1558 + assert("zam-431", *len != 0);
1559 + assert("zam-432", *start != 0);
1560 + assert("zam-558", !reiser4_blocknr_is_fake(start));
1561 +
1562 + spin_lock_reiser4_super(sbinfo);
1563 + assert("zam-562", *start < sbinfo->block_count);
1564 + spin_unlock_reiser4_super(sbinfo);
1565 + }
1566 +
1567 + if (flags & BA_DEFER) {
1568 + blocknr_set_entry *bsep = NULL;
1569 +
1570 + /* storing deleted block numbers in a blocknr set
1571 + datastructure for further actual deletion */
1572 + do {
1573 + atom = get_current_atom_locked();
1574 + assert("zam-430", atom != NULL);
1575 +
1576 + ret =
1577 + blocknr_set_add_extent(atom, &atom->delete_set,
1578 + &bsep, start, len);
1579 +
1580 + if (ret == -ENOMEM)
1581 + return ret;
1582 +
1583 + /* This loop might spin at most two times */
1584 + } while (ret == -E_REPEAT);
1585 +
1586 + assert("zam-477", ret == 0);
1587 + assert("zam-433", atom != NULL);
1588 +
1589 + spin_unlock_atom(atom);
1590 +
1591 + } else {
1592 + assert("zam-425", get_current_super_private() != NULL);
1593 + sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
1594 + *start, *len);
1595 +
1596 + if (flags & BA_PERMANENT) {
1597 + /* These blocks were counted as allocated, we have to revert it
1598 + * back if allocation is discarded. */
1599 + txn_atom *atom = get_current_atom_locked();
1600 + atom->nr_blocks_allocated -= *len;
1601 + spin_unlock_atom(atom);
1602 + }
1603 +
1604 + switch (target_stage) {
1605 + case BLOCK_NOT_COUNTED:
1606 + assert("vs-960", flags & BA_FORMATTED);
1607 + /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
1608 + used2free(sbinfo, *len);
1609 + break;
1610 +
1611 + case BLOCK_GRABBED:
1612 + used2grabbed(ctx, sbinfo, *len);
1613 + break;
1614 +
1615 + case BLOCK_UNALLOCATED:
1616 + used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1617 + break;
1618 +
1619 + case BLOCK_FLUSH_RESERVED:{
1620 + txn_atom *atom;
1621 +
1622 + atom = get_current_atom_locked();
1623 + used2flush_reserved(sbinfo, atom, *len,
1624 + flags & BA_FORMATTED);
1625 + spin_unlock_atom(atom);
1626 + break;
1627 + }
1628 + default:
1629 + impossible("zam-532", "wrong block stage");
1630 + }
1631 + }
1632 +
1633 + return 0;
1634 +}
1635 +
1636 +/* wrappers for block allocator plugin methods */
1637 +int reiser4_pre_commit_hook(void)
1638 +{
1639 + assert("zam-502", get_current_super_private() != NULL);
1640 + sa_pre_commit_hook();
1641 + return 0;
1642 +}
1643 +
1644 +/* an actor which applies delete set to block allocator data */
1645 +static int
1646 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1647 + const reiser4_block_nr * b, void *data UNUSED_ARG)
1648 +{
1649 + reiser4_context *ctx;
1650 + reiser4_super_info_data *sbinfo;
1651 +
1652 + __u64 len = 1;
1653 +
1654 + ctx = get_current_context();
1655 + sbinfo = get_super_private(ctx->super);
1656 +
1657 + assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1658 + assert("zam-552", sbinfo != NULL);
1659 +
1660 + if (b != NULL)
1661 + len = *b;
1662 +
1663 + if (REISER4_DEBUG) {
1664 + spin_lock_reiser4_super(sbinfo);
1665 +
1666 + assert("zam-554", *a < reiser4_block_count(ctx->super));
1667 + assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1668 +
1669 + spin_unlock_reiser4_super(sbinfo);
1670 + }
1671 +
1672 + sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1673 + /* adjust sb block counters */
1674 + used2free(sbinfo, len);
1675 + return 0;
1676 +}
1677 +
1678 +void reiser4_post_commit_hook(void)
1679 +{
1680 + txn_atom *atom;
1681 +
1682 + atom = get_current_atom_locked();
1683 + assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
1684 + spin_unlock_atom(atom);
1685 +
1686 + /* do the block deallocation which was deferred
1687 + until commit is done */
1688 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
1689 +
1690 + assert("zam-504", get_current_super_private() != NULL);
1691 + sa_post_commit_hook();
1692 +}
1693 +
1694 +void reiser4_post_write_back_hook(void)
1695 +{
1696 + assert("zam-504", get_current_super_private() != NULL);
1697 +
1698 + sa_post_commit_hook();
1699 +}
1700 +
1701 +/*
1702 + Local variables:
1703 + c-indentation-style: "K&R"
1704 + mode-name: "LC"
1705 + c-basic-offset: 8
1706 + tab-width: 8
1707 + fill-column: 120
1708 + scroll-step: 1
1709 + End:
1710 +*/
1711 diff -urN linux-2.6.20.orig/fs/reiser4/block_alloc.h linux-2.6.20/fs/reiser4/block_alloc.h
1712 --- linux-2.6.20.orig/fs/reiser4/block_alloc.h 1970-01-01 03:00:00.000000000 +0300
1713 +++ linux-2.6.20/fs/reiser4/block_alloc.h 2007-05-06 14:50:43.682970725 +0400
1714 @@ -0,0 +1,175 @@
1715 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1716 +
1717 +#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
1718 +#define __FS_REISER4_BLOCK_ALLOC_H__
1719 +
1720 +#include "dformat.h"
1721 +#include "forward.h"
1722 +
1723 +#include <linux/types.h> /* for __u?? */
1724 +#include <linux/fs.h>
1725 +
1726 +/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
1727 +#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
1728 +/* Mask which isolates a type of object this fake block number was assigned to */
1729 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1730 +
1731 +/* result after applying the REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1732 +   against these two values to understand whether the object is unallocated or a
1733 +   bitmap shadow object (WORKING BITMAP block, see plugin/space/bitmap.c) */
1734 +#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
1735 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
1736 +
1737 +/* specification how block allocation was counted in sb block counters */
1738 +typedef enum {
1739 + BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
1740 + BLOCK_GRABBED = 1, /* free space grabbed for further allocation
1741 + of this block */
1742 + BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
1743 + BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
1744 + ( unallocated formatted or unformatted
1745 + node) */
1746 + BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
1747 + number assigned */
1748 +} block_stage_t;
1749 +
1750 +/* a hint for block allocator */
1751 +struct reiser4_blocknr_hint {
1752 + /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
1753 + is to prevent jnode_flush() calls from interleaving allocations on the same
1754 + bitmap, once a hint is established. */
1755 +
1756 + /* search start hint */
1757 + reiser4_block_nr blk;
1758 + /* if not zero, it is a region size we search for free blocks in */
1759 + reiser4_block_nr max_dist;
1760 + /* level for allocation, may be useful have branch-level and higher
1761 + write-optimized. */
1762 + tree_level level;
1763 + /* block allocator assumes that blocks, which will be mapped to disk,
1764 + are in this specified block_stage */
1765 + block_stage_t block_stage;
1766 + /* If direction = 1 allocate blocks in backward direction from the end
1767 + * of disk to the beginning of disk. */
1768 + unsigned int backward:1;
1769 +
1770 +};
1771 +
1772 +/* These flags control block allocation/deallocation behavior */
1773 +enum reiser4_ba_flags {
1774 +	/* do allocations from reserved (5%) area */
1775 + BA_RESERVED = (1 << 0),
1776 +
1777 + /* block allocator can do commit trying to recover free space */
1778 + BA_CAN_COMMIT = (1 << 1),
1779 +
1780 + /* if operation will be applied to formatted block */
1781 + BA_FORMATTED = (1 << 2),
1782 +
1783 + /* defer actual block freeing until transaction commit */
1784 + BA_DEFER = (1 << 3),
1785 +
1786 +	/* allocate blocks for permanent fs objects (formatted or unformatted), not
1787 +	   wandered or log blocks */
1788 + BA_PERMANENT = (1 << 4),
1789 +
1790 +	/* grab space even if it was disabled */
1791 + BA_FORCE = (1 << 5),
1792 +
1793 + /* use default start value for free blocks search. */
1794 + BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1795 +};
1796 +
1797 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1798 +
1799 +extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1800 +extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1801 +extern void update_blocknr_hint_default(const struct super_block *,
1802 + const reiser4_block_nr *);
1803 +extern void get_blocknr_hint_default(reiser4_block_nr *);
1804 +
1805 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1806 +
1807 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
1808 +reiser4_block_nr fake_blocknr_unformatted(int);
1809 +
1810 +/* free -> grabbed -> fake_allocated -> used */
1811 +
1812 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1813 +void all_grabbed2free(void);
1814 +void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
1815 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1816 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1817 +void grabbed2flush_reserved(__u64 count);
1818 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1819 + reiser4_block_nr * start,
1820 + reiser4_block_nr * len, reiser4_ba_flags_t flags);
1821 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
1822 + const reiser4_block_nr *,
1823 + block_stage_t, reiser4_ba_flags_t flags);
1824 +
1825 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
1826 + reiser4_block_nr * start,
1827 + reiser4_ba_flags_t flags)
1828 +{
1829 + reiser4_block_nr one = 1;
1830 + return reiser4_alloc_blocks(hint, start, &one, flags);
1831 +}
1832 +
1833 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
1834 + block_stage_t stage,
1835 + reiser4_ba_flags_t flags)
1836 +{
1837 + const reiser4_block_nr one = 1;
1838 + return reiser4_dealloc_blocks(block, &one, stage, flags);
1839 +}
1840 +
1841 +#define reiser4_grab_space_force(count, flags) \
1842 + reiser4_grab_space(count, flags | BA_FORCE)
1843 +
1844 +extern void grabbed2free_mark(__u64 mark);
1845 +extern int reiser4_grab_reserved(struct super_block *,
1846 + __u64, reiser4_ba_flags_t);
1847 +extern void reiser4_release_reserved(struct super_block *super);
1848 +
1849 +/* grabbed -> fake_allocated */
1850 +
1851 +/* fake_allocated -> used */
1852 +
1853 +/* used -> fake_allocated -> grabbed -> free */
1854 +
1855 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
1856 +
1857 +extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
1858 +
1859 +extern void grabbed2cluster_reserved(int count);
1860 +extern void cluster_reserved2grabbed(int count);
1861 +extern void cluster_reserved2free(int count);
1862 +
1863 +extern int reiser4_check_block_counters(const struct super_block *);
1864 +
1865 +#if REISER4_DEBUG
1866 +
1867 +extern void reiser4_check_block(const reiser4_block_nr *, int);
1868 +
1869 +#else
1870 +
1871 +# define reiser4_check_block(beg, val) noop
1872 +
1873 +#endif
1874 +
1875 +extern int reiser4_pre_commit_hook(void);
1876 +extern void reiser4_post_commit_hook(void);
1877 +extern void reiser4_post_write_back_hook(void);
1878 +
1879 +#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
1880 +
1881 +/* Make Linus happy.
1882 + Local variables:
1883 + c-indentation-style: "K&R"
1884 + mode-name: "LC"
1885 + c-basic-offset: 8
1886 + tab-width: 8
1887 + fill-column: 120
1888 + End:
1889 +*/
1890 diff -urN linux-2.6.20.orig/fs/reiser4/blocknrset.c linux-2.6.20/fs/reiser4/blocknrset.c
1891 --- linux-2.6.20.orig/fs/reiser4/blocknrset.c 1970-01-01 03:00:00.000000000 +0300
1892 +++ linux-2.6.20/fs/reiser4/blocknrset.c 2007-05-06 14:50:43.686971975 +0400
1893 @@ -0,0 +1,368 @@
1894 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1895 +
1896 +/* This file contains code for various block number sets used by the atom to
1897 + track the deleted set and wandered block mappings. */
1898 +
1899 +#include "debug.h"
1900 +#include "dformat.h"
1901 +#include "txnmgr.h"
1902 +#include "context.h"
1903 +
1904 +#include <linux/slab.h>
1905 +
1906 +/* The proposed data structure for storing unordered block number sets is a
1907 + list of elements, each of which contains an array of block number or/and
1908 + array of block number pairs. That element called blocknr_set_entry is used
1909 + to store block numbers from the beginning and for extents from the end of
1910 + the data field (char data[...]). The ->nr_blocks and ->nr_pairs fields
1911 + count numbers of blocks and extents.
1912 +
1913 + +------------------- blocknr_set_entry->data ------------------+
1914 + |block1|block2| ... <free space> ... |pair3|pair2|pair1|
1915 + +------------------------------------------------------------+
1916 +
1917 + When current blocknr_set_entry is full, allocate a new one. */
1918 +
1919 +/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
1920 + * set (single blocks and block extents), in that case blocknr pair represent an
1921 + * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
1922 + * there represent a (real block) -> (wandered block) mapping. */
1923 +
1924 +/* Protection: blocknr sets belong to reiser4 atom, and
1925 + * their modifications are performed with the atom lock held */
1926 +
1927 +typedef struct blocknr_pair blocknr_pair;
1928 +
1929 +/* The total size of a blocknr_set_entry. */
1930 +#define BLOCKNR_SET_ENTRY_SIZE 128
1931 +
1932 +/* The number of blocks that can fit the blocknr data area. */
1933 +#define BLOCKNR_SET_ENTRIES_NUMBER \
1934 + ((BLOCKNR_SET_ENTRY_SIZE - \
1935 + 2 * sizeof (unsigned) - \
1936 + sizeof(struct list_head)) / \
1937 + sizeof(reiser4_block_nr))
1938 +
1939 +/* An entry of the blocknr_set */
1940 +struct blocknr_set_entry {
1941 + unsigned nr_singles;
1942 + unsigned nr_pairs;
1943 + struct list_head link;
1944 + reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
1945 +};
1946 +
1947 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
1948 +struct blocknr_pair {
1949 + reiser4_block_nr a;
1950 + reiser4_block_nr b;
1951 +};
1952 +
1953 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
1954 +/* Audited by: green(2002.06.11) */
1955 +static unsigned bse_avail(blocknr_set_entry * bse)
1956 +{
1957 + unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
1958 +
1959 + assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
1960 + cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
1961 +
1962 + return BLOCKNR_SET_ENTRIES_NUMBER - used;
1963 +}
1964 +
1965 +/* Initialize a blocknr_set_entry. */
1966 +static void bse_init(blocknr_set_entry *bse)
1967 +{
1968 + bse->nr_singles = 0;
1969 + bse->nr_pairs = 0;
1970 + INIT_LIST_HEAD(&bse->link);
1971 +}
1972 +
1973 +/* Allocate and initialize a blocknr_set_entry. */
1974 +/* Audited by: green(2002.06.11) */
1975 +static blocknr_set_entry *bse_alloc(void)
1976 +{
1977 + blocknr_set_entry *e;
1978 +
1979 + if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
1980 + reiser4_ctx_gfp_mask_get())) == NULL)
1981 + return NULL;
1982 +
1983 + bse_init(e);
1984 +
1985 + return e;
1986 +}
1987 +
1988 +/* Free a blocknr_set_entry. */
1989 +/* Audited by: green(2002.06.11) */
1990 +static void bse_free(blocknr_set_entry * bse)
1991 +{
1992 + kfree(bse);
1993 +}
1994 +
1995 +/* Add a block number to a blocknr_set_entry */
1996 +/* Audited by: green(2002.06.11) */
1997 +static void
1998 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
1999 +{
2000 + assert("jmacd-5099", bse_avail(bse) >= 1);
2001 +
2002 + bse->entries[bse->nr_singles++] = *block;
2003 +}
2004 +
2005 +/* Get a pair of block numbers */
2006 +/* Audited by: green(2002.06.11) */
2007 +static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno)
2008 +{
2009 + assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2010 +
2011 + return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER -
2012 + 2 * (pno + 1));
2013 +}
2014 +
2015 +/* Add a pair of block numbers to a blocknr_set_entry */
2016 +/* Audited by: green(2002.06.11) */
2017 +static void
2018 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2019 + const reiser4_block_nr * b)
2020 +{
2021 + blocknr_pair *pair;
2022 +
2023 + assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2024 +
2025 + pair = bse_get_pair(bse, bse->nr_pairs++);
2026 +
2027 + pair->a = *a;
2028 + pair->b = *b;
2029 +}
2030 +
2031 +/* Add either a block or pair of blocks to the block number set. The first
2032 + blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2033 + @b is non-NULL a pair is added. The block number set belongs to atom, and
2034 + the call is made with the atom lock held. There may not be enough space in
2035 + the current blocknr_set_entry. If new_bsep points to a non-NULL
2036 + blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2037 + will be set to NULL. If new_bsep contains NULL then the atom lock will be
2038 + released and a new bse will be allocated in new_bsep. E_REPEAT will be
2039 + returned with the atom unlocked for the operation to be tried again. If
2040 + the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2041 + used during the call, it will be freed automatically. */
2042 +static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2043 + blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2044 + const reiser4_block_nr *b)
2045 +{
2046 + blocknr_set_entry *bse;
2047 + unsigned entries_needed;
2048 +
2049 + assert("jmacd-5101", a != NULL);
2050 +
2051 + entries_needed = (b == NULL) ? 1 : 2;
2052 + if (list_empty(bset) ||
2053 + bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2054 + /* See if a bse was previously allocated. */
2055 + if (*new_bsep == NULL) {
2056 + spin_unlock_atom(atom);
2057 + *new_bsep = bse_alloc();
2058 + return (*new_bsep != NULL) ? -E_REPEAT :
2059 + RETERR(-ENOMEM);
2060 + }
2061 +
2062 + /* Put it on the head of the list. */
2063 + list_add(&((*new_bsep)->link), bset);
2064 +
2065 + *new_bsep = NULL;
2066 + }
2067 +
2068 + /* Add the single or pair. */
2069 + bse = list_entry(bset->next, blocknr_set_entry, link);
2070 + if (b == NULL) {
2071 + bse_put_single(bse, a);
2072 + } else {
2073 + bse_put_pair(bse, a, b);
2074 + }
2075 +
2076 + /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2077 + if (*new_bsep != NULL) {
2078 + bse_free(*new_bsep);
2079 + *new_bsep = NULL;
2080 + }
2081 +
2082 + return 0;
2083 +}
2084 +
2085 +/* Add an extent to the block set. If the length is 1, it is treated as a
2086 + single block (e.g., reiser4_set_add_block). */
2087 +/* Audited by: green(2002.06.11) */
2088 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2089 + kmalloc might schedule. The only exception is atom spinlock, which is
2090 + properly freed. */
2091 +int
2092 +blocknr_set_add_extent(txn_atom * atom,
2093 + struct list_head * bset,
2094 + blocknr_set_entry ** new_bsep,
2095 + const reiser4_block_nr * start,
2096 + const reiser4_block_nr * len)
2097 +{
2098 + assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2099 + return blocknr_set_add(atom, bset, new_bsep, start,
2100 + *len == 1 ? NULL : len);
2101 +}
2102 +
2103 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
2104 + * by an assertion that both arguments are not null.*/
2105 +/* Audited by: green(2002.06.11) */
2106 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2107 + kmalloc might schedule. The only exception is atom spinlock, which is
2108 + properly freed. */
2109 +int
2110 +blocknr_set_add_pair(txn_atom * atom,
2111 + struct list_head * bset,
2112 + blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2113 + const reiser4_block_nr * b)
2114 +{
2115 + assert("jmacd-5103", a != NULL && b != NULL);
2116 + return blocknr_set_add(atom, bset, new_bsep, a, b);
2117 +}
2118 +
2119 +/* Initialize a blocknr_set. */
2120 +void blocknr_set_init(struct list_head *bset)
2121 +{
2122 + INIT_LIST_HEAD(bset);
2123 +}
2124 +
2125 +/* Release the entries of a blocknr_set. */
2126 +void blocknr_set_destroy(struct list_head *bset)
2127 +{
2128 + blocknr_set_entry *bse;
2129 +
2130 + while (!list_empty(bset)) {
2131 + bse = list_entry(bset->next, blocknr_set_entry, link);
2132 + list_del_init(&bse->link);
2133 + bse_free(bse);
2134 + }
2135 +}
2136 +
2137 +/* Merge blocknr_set entries out of @from into @into. */
2138 +/* Audited by: green(2002.06.11) */
2139 +/* Auditor comments: This merge does not know if merged sets contain
2140 + blocks pairs (As for wandered sets) or extents, so it cannot really merge
2141 + overlapping ranges if there is some. So I believe it may lead to
2142 + some blocks being presented several times in one blocknr_set. To help
2143 + debugging such problems it might help to check for duplicate entries on
2144 + actual processing of this set. Testing this kind of stuff right here is
2145 + also complicated by the fact that these sets are not sorted and going
2146 + through whole set on each element addition is going to be CPU-heavy task */
2147 +void blocknr_set_merge(struct list_head * from, struct list_head * into)
2148 +{
2149 + blocknr_set_entry *bse_into = NULL;
2150 +
2151 + /* If @from is empty, no work to perform. */
2152 + if (list_empty(from))
2153 + return;
2154 + /* If @into is not empty, try merging partial-entries. */
2155 + if (!list_empty(into)) {
2156 +
2157 + /* Neither set is empty, pop the front two members and try to combine them. */
2158 + blocknr_set_entry *bse_from;
2159 + unsigned into_avail;
2160 +
2161 + bse_into = list_entry(into->next, blocknr_set_entry, link);
2162 + list_del_init(&bse_into->link);
2163 + bse_from = list_entry(from->next, blocknr_set_entry, link);
2164 + list_del_init(&bse_from->link);
2165 +
2166 + /* Combine singles. */
2167 + for (into_avail = bse_avail(bse_into);
2168 + into_avail != 0 && bse_from->nr_singles != 0;
2169 + into_avail -= 1) {
2170 + bse_put_single(bse_into,
2171 + &bse_from->entries[--bse_from->
2172 + nr_singles]);
2173 + }
2174 +
2175 + /* Combine pairs. */
2176 + for (; into_avail > 1 && bse_from->nr_pairs != 0;
2177 + into_avail -= 2) {
2178 + blocknr_pair *pair =
2179 + bse_get_pair(bse_from, --bse_from->nr_pairs);
2180 + bse_put_pair(bse_into, &pair->a, &pair->b);
2181 + }
2182 +
2183 + /* If bse_from is empty, delete it now. */
2184 + if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2185 + bse_free(bse_from);
2186 + } else {
2187 + /* Otherwise, bse_into is full or nearly full (e.g.,
2188 + it could have one slot avail and bse_from has one
2189 + pair left). Push it back onto the list. bse_from
2190 + becomes bse_into, which will be the new partial. */
2191 + list_add(&bse_into->link, into);
2192 + bse_into = bse_from;
2193 + }
2194 + }
2195 +
2196 + /* Splice lists together. */
2197 + list_splice_init(from, into->prev);
2198 +
2199 + /* Add the partial entry back to the head of the list. */
2200 + if (bse_into != NULL)
2201 + list_add(&bse_into->link, into);
2202 +}
2203 +
2204 +/* Iterate over all blocknr set elements. */
2205 +int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2206 + blocknr_set_actor_f actor, void *data, int delete)
2207 +{
2208 +
2209 + blocknr_set_entry *entry;
2210 +
2211 + assert("zam-429", atom != NULL);
2212 + assert("zam-430", atom_is_protected(atom));
2213 + assert("zam-431", bset != 0);
2214 + assert("zam-432", actor != NULL);
2215 +
2216 + entry = list_entry(bset->next, blocknr_set_entry, link);
2217 + while (bset != &entry->link) {
2218 + blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2219 + unsigned int i;
2220 + int ret;
2221 +
2222 + for (i = 0; i < entry->nr_singles; i++) {
2223 + ret = actor(atom, &entry->entries[i], NULL, data);
2224 +
2225 + /* We can't break a loop if delete flag is set. */
2226 + if (ret != 0 && !delete)
2227 + return ret;
2228 + }
2229 +
2230 + for (i = 0; i < entry->nr_pairs; i++) {
2231 + struct blocknr_pair *ab;
2232 +
2233 + ab = bse_get_pair(entry, i);
2234 +
2235 + ret = actor(atom, &ab->a, &ab->b, data);
2236 +
2237 + if (ret != 0 && !delete)
2238 + return ret;
2239 + }
2240 +
2241 + if (delete) {
2242 + list_del(&entry->link);
2243 + bse_free(entry);
2244 + }
2245 +
2246 + entry = tmp;
2247 + }
2248 +
2249 + return 0;
2250 +}
2251 +
2252 +/*
2253 + * Local variables:
2254 + * c-indentation-style: "K&R"
2255 + * mode-name: "LC"
2256 + * c-basic-offset: 8
2257 + * tab-width: 8
2258 + * fill-column: 79
2259 + * scroll-step: 1
2260 + * End:
2261 + */
2262 diff -urN linux-2.6.20.orig/fs/reiser4/carry.c linux-2.6.20/fs/reiser4/carry.c
2263 --- linux-2.6.20.orig/fs/reiser4/carry.c 1970-01-01 03:00:00.000000000 +0300
2264 +++ linux-2.6.20/fs/reiser4/carry.c 2007-05-06 14:50:43.686971975 +0400
2265 @@ -0,0 +1,1391 @@
2266 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2267 +/* Functions to "carry" tree modification(s) upward. */
2268 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2269 + set of changes that need to be propagated to the next level. We manage
2270 + node locking such that any searches that collide with carrying are
2271 + restarted, from the root if necessary.
2272 +
2273 + Insertion of a new item may result in items being moved among nodes and
2274 + this requires the delimiting key to be updated at the least common parent
2275 + of the nodes modified to preserve search tree invariants. Also, insertion
2276 + may require allocation of a new node. A pointer to the new node has to be
2277 + inserted into some node on the parent level, etc.
2278 +
2279 + Tree carrying is meant to be analogous to arithmetic carrying.
2280 +
2281 + A carry operation is always associated with some node (&carry_node).
2282 +
2283 + Carry process starts with some initial set of operations to be performed
2284 + and an initial set of already locked nodes. Operations are performed one
2285 + by one. Performing each single operation has following possible effects:
2286 +
2287 + - content of carry node associated with operation is modified
2288 + - new carry nodes are locked and involved into carry process on this level
2289 + - new carry operations are posted to the next level
2290 +
2291 + After all carry operations on this level are done, process is repeated for
2292 + the accumulated sequence on carry operations for the next level. This
2293 + starts by trying to lock (in left to right order) all carry nodes
2294 + associated with carry operations on the parent level. After this, we decide
2295 + whether more nodes are required on the left of already locked set. If so,
2296 + all locks taken on the parent level are released, new carry nodes are
2297 + added, and locking process repeats.
2298 +
2299 + It may happen that balancing process fails owing to unrecoverable error on
2300 + some of upper levels of a tree (possible causes are io error, failure to
2301 + allocate new node, etc.). In this case we should unmount the filesystem,
2302 + rebooting if it is the root, and possibly advise the use of fsck.
2303 +
2304 + USAGE:
2305 +
2306 + int some_tree_operation( znode *node, ... )
2307 + {
2308 + // Allocate on a stack pool of carry objects: operations and nodes.
2309 + // Most carry processes will only take objects from here, without
2310 + // dynamic allocation.
2311 +
2312 +I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2313 +
2314 + carry_pool pool;
2315 + carry_level lowest_level;
2316 + carry_op *op;
2317 +
2318 + init_carry_pool( &pool );
2319 + init_carry_level( &lowest_level, &pool );
2320 +
2321 + // operation may be one of:
2322 + // COP_INSERT --- insert new item into node
2323 + // COP_CUT --- remove part of or whole node
2324 + // COP_PASTE --- increase size of item
2325 + // COP_DELETE --- delete pointer from parent node
2326 + // COP_UPDATE --- update delimiting key in least
2327 + // common ancestor of two
2328 +
2329 + op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2330 + if( IS_ERR( op ) || ( op == NULL ) ) {
2331 + handle error
2332 + } else {
2333 + // fill in remaining fields in @op, according to carry.h:carry_op
2334 + result = carry( &lowest_level, NULL );
2335 + }
2336 + done_carry_pool( &pool );
2337 + }
2338 +
2339 + When you are implementing node plugin method that participates in carry
2340 + (shifting, insertion, deletion, etc.), do the following:
2341 +
2342 + int foo_node_method( znode *node, ..., carry_level *todo )
2343 + {
2344 + carry_op *op;
2345 +
2346 + ....
2347 +
2348 + // note, that last argument to reiser4_post_carry() is non-null
2349 + // here, because @op is to be applied to the parent of @node, rather
2350 + // than to the @node itself as in the previous case.
2351 +
2352 + op = node_post_carry( todo, operation, node, 1 );
2353 + // fill in remaining fields in @op, according to carry.h:carry_op
2354 +
2355 + ....
2356 +
2357 + }
2358 +
2359 + BATCHING:
2360 +
2361 + One of the main advantages of level-by-level balancing implemented here is
2362 + ability to batch updates on a parent level and to perform them more
2363 + efficiently as a result.
2364 +
2365 + Description To Be Done (TBD).
2366 +
2367 + DIFFICULTIES AND SUBTLE POINTS:
2368 +
2369 + 1. complex plumbing is required, because:
2370 +
2371 + a. effective allocation through pools is needed
2372 +
2373 + b. target of operation is not exactly known when operation is
2374 + posted. This is worked around through bitfields in &carry_node and
2375 + logic in lock_carry_node()
2376 +
2377 + c. of interaction with locking code: node should be added into sibling
2378 + list when pointer to it is inserted into its parent, which is some time
2379 + after node was created. Between these moments, node is somewhat in
2380 + suspended state and is only registered in the carry lists
2381 +
2382 + 2. whole balancing logic is implemented here, in particular, insertion
2383 + logic is coded in make_space().
2384 +
2385 + 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2386 + (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2387 + (insert_paste()) have to be handled.
2388 +
2389 + 4. there is non-trivial interdependency between allocation of new nodes
2390 + and almost everything else. This is mainly due to the (1.c) above. I shall
2391 + write about this later.
2392 +
2393 +*/
2394 +
2395 +#include "forward.h"
2396 +#include "debug.h"
2397 +#include "key.h"
2398 +#include "coord.h"
2399 +#include "plugin/item/item.h"
2400 +#include "plugin/item/extent.h"
2401 +#include "plugin/node/node.h"
2402 +#include "jnode.h"
2403 +#include "znode.h"
2404 +#include "tree_mod.h"
2405 +#include "tree_walk.h"
2406 +#include "block_alloc.h"
2407 +#include "pool.h"
2408 +#include "tree.h"
2409 +#include "carry.h"
2410 +#include "carry_ops.h"
2411 +#include "super.h"
2412 +#include "reiser4.h"
2413 +
2414 +#include <linux/types.h>
2415 +
2416 +/* level locking/unlocking */
2417 +static int lock_carry_level(carry_level * level);
2418 +static void unlock_carry_level(carry_level * level, int failure);
2419 +static void done_carry_level(carry_level * level);
2420 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2421 +
2422 +int lock_carry_node(carry_level * level, carry_node * node);
2423 +int lock_carry_node_tail(carry_node * node);
2424 +
2425 +/* carry processing proper */
2426 +static int carry_on_level(carry_level * doing, carry_level * todo);
2427 +
2428 +static carry_op *add_op(carry_level * level, pool_ordering order,
2429 + carry_op * reference);
2430 +
2431 +/* handlers for carry operations. */
2432 +
2433 +static void fatal_carry_error(carry_level * doing, int ecode);
2434 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2435 +
2436 +static void print_level(const char *prefix, carry_level * level);
2437 +
2438 +#if REISER4_DEBUG
2439 +typedef enum {
2440 + CARRY_TODO,
2441 + CARRY_DOING
2442 +} carry_queue_state;
2443 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
2444 +#endif
2445 +
2446 +/* main entry point for tree balancing.
2447 +
2448 + Tree carry performs operations from @doing and while doing so accumulates
2449 + information about operations to be performed on the next level ("carried"
2450 + to the parent level). Carried operations are performed, causing possibly
2451 + more operations to be carried upward etc. carry() takes care about
2452 + locking and pinning znodes while operating on them.
2453 +
2454 + For usage, see comment at the top of fs/reiser4/carry.c
2455 +
2456 +*/
2457 +int reiser4_carry(carry_level * doing /* set of carry operations to be
2458 + * performed */ ,
2459 + carry_level * done /* set of nodes, already performed
2460 + * at the previous level.
2461 + * NULL in most cases */)
2462 +{
2463 + int result = 0;
2464 + /* queue of new requests */
2465 + carry_level *todo;
2466 + ON_DEBUG(STORE_COUNTERS);
2467 +
2468 + assert("nikita-888", doing != NULL);
2469 + BUG_ON(done != NULL);
2470 +
2471 + todo = doing + 1;
2472 + init_carry_level(todo, doing->pool);
2473 +
2474 + /* queue of requests performed on the previous level */
2475 + done = todo + 1;
2476 + init_carry_level(done, doing->pool);
2477 +
2478 + /* iterate until there is nothing more to do */
2479 + while (result == 0 && doing->ops_num > 0) {
2480 + carry_level *tmp;
2481 +
2482 + /* at this point @done is locked. */
2483 + /* repeat lock/do/unlock while
2484 +
2485 + (1) lock_carry_level() fails due to deadlock avoidance, or
2486 +
2487 + (2) carry_on_level() decides that more nodes have to
2488 + be involved.
2489 +
2490 + (3) some unexpected error occurred while balancing on the
2491 + upper levels. In this case all changes are rolled back.
2492 +
2493 + */
2494 + while (1) {
2495 + result = lock_carry_level(doing);
2496 + if (result == 0) {
2497 + /* perform operations from @doing and
2498 + accumulate new requests in @todo */
2499 + result = carry_on_level(doing, todo);
2500 + if (result == 0)
2501 + break;
2502 + else if (result != -E_REPEAT ||
2503 + !doing->restartable) {
2504 + warning("nikita-1043",
2505 + "Fatal error during carry: %i",
2506 + result);
2507 + print_level("done", done);
2508 + print_level("doing", doing);
2509 + print_level("todo", todo);
2510 + /* do some rough stuff like aborting
2511 + all pending transcrashes and thus
2512 + pushing tree back to the consistent
2513 + state. Alternatively, just panic.
2514 + */
2515 + fatal_carry_error(doing, result);
2516 + return result;
2517 + }
2518 + } else if (result != -E_REPEAT) {
2519 + fatal_carry_error(doing, result);
2520 + return result;
2521 + }
2522 + unlock_carry_level(doing, 1);
2523 + }
2524 + /* at this point @done can be safely unlocked */
2525 + done_carry_level(done);
2526 +
2527 + /* cyclically shift queues */
2528 + tmp = done;
2529 + done = doing;
2530 + doing = todo;
2531 + todo = tmp;
2532 + init_carry_level(todo, doing->pool);
2533 +
2534 + /* give other threads chance to run */
2535 + reiser4_preempt_point();
2536 + }
2537 + done_carry_level(done);
2538 +
2539 + /* all counters, but x_refs should remain the same. x_refs can change
2540 + owing to transaction manager */
2541 + ON_DEBUG(CHECK_COUNTERS);
2542 + return result;
2543 +}
2544 +
2545 +/* perform carry operations on given level.
2546 +
2547 + Optimizations proposed by pooh:
2548 +
2549 + (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2550 + required;
2551 +
2552 + (2) unlock node if there are no more operations to be performed upon it and
2553 + node didn't add any operation to @todo. This can be implemented by
2554 + attaching to each node two counters: counter of operations working on this
2555 + node and counter and operations carried upward from this node.
2556 +
2557 +*/
2558 +static int carry_on_level(carry_level * doing /* queue of carry operations to
2559 + * do on this level */ ,
2560 + carry_level * todo /* queue where new carry
2561 + * operations to be performed on
2562 + * the * parent level are
2563 + * accumulated during @doing
2564 + * processing. */ )
2565 +{
2566 + int result;
2567 + int (*f) (carry_op *, carry_level *, carry_level *);
2568 + carry_op *op;
2569 + carry_op *tmp_op;
2570 +
2571 + assert("nikita-1034", doing != NULL);
2572 + assert("nikita-1035", todo != NULL);
2573 +
2574 + /* @doing->nodes are locked. */
2575 +
2576 + /* This function can be split into two phases: analysis and modification.
2577 +
2578 + Analysis calculates precisely what items should be moved between
2579 + nodes. This information is gathered in some structures attached to
2580 + each carry_node in a @doing queue. Analysis also determines whether
2581 + new nodes are to be allocated etc.
2582 +
2583 + After analysis is completed, actual modification is performed. Here
2584 + we can take advantage of "batch modification": if there are several
2585 + operations acting on the same node, modifications can be performed
2586 + more efficiently when batched together.
2587 +
2588 + Above is an optimization left for the future.
2589 + */
2590 + /* Important, but delayed optimization: it's possible to batch
2591 + operations together and perform them more efficiently as a
2592 + result. For example, deletion of several neighboring items from a
2593 + node can be converted to a single ->cut() operation.
2594 +
2595 + Before processing queue, it should be scanned and "mergeable"
2596 + operations merged.
2597 + */
2598 + result = 0;
2599 + for_all_ops(doing, op, tmp_op) {
2600 + carry_opcode opcode;
2601 +
2602 + assert("nikita-1041", op != NULL);
2603 + opcode = op->op;
2604 + assert("nikita-1042", op->op < COP_LAST_OP);
2605 + f = op_dispatch_table[op->op].handler;
2606 + result = f(op, doing, todo);
2607 + /* locking can fail with -E_REPEAT. Any different error is fatal
2608 + and will be handled by fatal_carry_error() sledgehammer.
2609 + */
2610 + if (result != 0)
2611 + break;
2612 + }
2613 + if (result == 0) {
2614 + carry_plugin_info info;
2615 + carry_node *scan;
2616 + carry_node *tmp_scan;
2617 +
2618 + info.doing = doing;
2619 + info.todo = todo;
2620 +
2621 + assert("nikita-3002",
2622 + carry_level_invariant(doing, CARRY_DOING));
2623 + for_all_nodes(doing, scan, tmp_scan) {
2624 + znode *node;
2625 +
2626 + node = reiser4_carry_real(scan);
2627 + assert("nikita-2547", node != NULL);
2628 + if (node_is_empty(node)) {
2629 + result =
2630 + node_plugin_by_node(node)->
2631 + prepare_removal(node, &info);
2632 + if (result != 0)
2633 + break;
2634 + }
2635 + }
2636 + }
2637 + return result;
2638 +}
2639 +
2640 +/* post carry operation
2641 +
2642 + This is main function used by external carry clients: node layout plugins
2643 + and tree operations to create new carry operation to be performed on some
2644 + level.
2645 +
2646 + New operation will be included in the @level queue. To actually perform it,
2647 + call carry( level, ... ). This function takes write lock on @node. Carry
2648 + manages all its locks by itself, don't worry about this.
2649 +
2650 + This function adds operation and node at the end of the queue. It is up to
2651 + caller to guarantee proper ordering of node queue.
2652 +
2653 +*/
2654 +carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2655 + * is to be posted at */ ,
2656 + carry_opcode op /* opcode of operation */ ,
2657 + znode * node /* node on which this operation
2658 + * will operate */ ,
2659 + int apply_to_parent_p /* whether operation will
2660 + * operate directly on @node
2661 + * or on it parent. */)
2662 +{
2663 + carry_op *result;
2664 + carry_node *child;
2665 +
2666 + assert("nikita-1046", level != NULL);
2667 + assert("nikita-1788", znode_is_write_locked(node));
2668 +
2669 + result = add_op(level, POOLO_LAST, NULL);
2670 + if (IS_ERR(result))
2671 + return result;
2672 + child = reiser4_add_carry(level, POOLO_LAST, NULL);
2673 + if (IS_ERR(child)) {
2674 + reiser4_pool_free(&level->pool->op_pool, &result->header);
2675 + return (carry_op *) child;
2676 + }
2677 + result->node = child;
2678 + result->op = op;
2679 + child->parent = apply_to_parent_p;
2680 + if (ZF_ISSET(node, JNODE_ORPHAN))
2681 + child->left_before = 1;
2682 + child->node = node;
2683 + return result;
2684 +}
2685 +
2686 +/* initialize carry queue */
2687 +void init_carry_level(carry_level * level /* level to initialize */ ,
2688 + carry_pool * pool /* pool @level will allocate objects
2689 + * from */ )
2690 +{
2691 + assert("nikita-1045", level != NULL);
2692 + assert("nikita-967", pool != NULL);
2693 +
2694 + memset(level, 0, sizeof *level);
2695 + level->pool = pool;
2696 +
2697 + INIT_LIST_HEAD(&level->nodes);
2698 + INIT_LIST_HEAD(&level->ops);
2699 +}
2700 +
2701 +/* allocate carry pool and initialize pools within queue */
2702 +carry_pool *init_carry_pool(int size)
2703 +{
2704 + carry_pool *pool;
2705 +
2706 + assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2707 + pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2708 + if (pool == NULL)
2709 + return ERR_PTR(RETERR(-ENOMEM));
2710 +
2711 + reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2712 + (char *)pool->op);
2713 + reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2714 + NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2715 + return pool;
2716 +}
2717 +
2718 +/* finish with queue pools */
2719 +void done_carry_pool(carry_pool * pool /* pool to destroy */ )
2720 +{
2721 + reiser4_done_pool(&pool->op_pool);
2722 + reiser4_done_pool(&pool->node_pool);
2723 + kfree(pool);
2724 +}
2725 +
2726 +/* add new carry node to the @level.
2727 +
2728 + Returns pointer to the new carry node allocated from pool. It's up to
2729 + callers to maintain proper order in the @level. Assumption is that if carry
2730 +   nodes on one level are already sorted and modifications are performed from
2731 + left to right, carry nodes added on the parent level will be ordered
2732 + automatically. To control ordering use @order and @reference parameters.
2733 +
2734 +*/
2735 +carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2736 + * node to */ ,
2737 + pool_ordering order /* where to insert:
2738 + * at the beginning of
2739 + * @level,
2740 + * before @reference,
2741 + * after @reference,
2742 + * at the end of @level
2743 + */ ,
2744 + carry_node * reference/* reference node for
2745 + * insertion */)
2746 +{
2747 + ON_DEBUG(carry_node * orig_ref = reference);
2748 +
2749 + if (order == POOLO_BEFORE) {
2750 + reference = find_left_carry(reference, level);
2751 + if (reference == NULL)
2752 + reference = list_entry(level->nodes.next, carry_node,
2753 + header.level_linkage);
2754 + else
2755 + reference = list_entry(reference->header.level_linkage.next,
2756 + carry_node, header.level_linkage);
2757 + } else if (order == POOLO_AFTER) {
2758 + reference = find_right_carry(reference, level);
2759 + if (reference == NULL)
2760 + reference = list_entry(level->nodes.prev, carry_node,
2761 + header.level_linkage);
2762 + else
2763 + reference = list_entry(reference->header.level_linkage.prev,
2764 + carry_node, header.level_linkage);
2765 + }
2766 + assert("nikita-2209",
2767 + ergo(orig_ref != NULL,
2768 + reiser4_carry_real(reference) ==
2769 + reiser4_carry_real(orig_ref)));
2770 + return reiser4_add_carry(level, order, reference);
2771 +}
2772 +
2773 +carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node
2774 + * to */ ,
2775 + pool_ordering order /* where to insert: at the
2776 + * beginning of @level, before
2777 + * @reference, after @reference,
2778 + * at the end of @level */ ,
2779 + carry_node * reference /* reference node for
2780 + * insertion */ )
2781 +{
2782 + carry_node *result;
2783 +
2784 + result =
2785 + (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2786 + &level->nodes,
2787 + order, &reference->header);
2788 + if (!IS_ERR(result) && (result != NULL))
2789 + ++level->nodes_num;
2790 + return result;
2791 +}
2792 +
2793 +/* add new carry operation to the @level.
2794 +
2795 + Returns pointer to the new carry operations allocated from pool. It's up to
2796 + callers to maintain proper order in the @level. To control ordering use
2797 + @order and @reference parameters.
2798 +
2799 +*/
2800 +static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
2801 + pool_ordering order /* where to insert: at the beginning of
2802 + * @level, before @reference, after
2803 + * @reference, at the end of @level */ ,
2804 + carry_op *
2805 + reference /* reference node for insertion */ )
2806 +{
2807 + carry_op *result;
2808 +
2809 + result =
2810 + (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2811 + order, &reference->header);
2812 + if (!IS_ERR(result) && (result != NULL))
2813 + ++level->ops_num;
2814 + return result;
2815 +}
2816 +
2817 +/* Return node on the right of which @node was created.
2818 +
2819 + Each node is created on the right of some existing node (or it is new root,
2820 + which is special case not handled here).
2821 +
2822 + @node is new node created on some level, but not yet inserted into its
2823 + parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
2824 +
2825 +*/
2826 +static carry_node *find_begetting_brother(carry_node * node /* node to start search
2827 + * from */ ,
2828 + carry_level * kin UNUSED_ARG /* level to
2829 + * scan */ )
2830 +{
2831 + carry_node *scan;
2832 +
2833 + assert("nikita-1614", node != NULL);
2834 + assert("nikita-1615", kin != NULL);
2835 + assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
2836 + assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
2837 + ZF_ISSET(reiser4_carry_real(node),
2838 + JNODE_ORPHAN)));
2839 + for (scan = node;;
2840 + scan = list_entry(scan->header.level_linkage.prev, carry_node,
2841 + header.level_linkage)) {
2842 + assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
2843 + if ((scan->node != node->node) &&
2844 + !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
2845 + assert("nikita-1618", reiser4_carry_real(scan) != NULL);
2846 + break;
2847 + }
2848 + }
2849 + return scan;
2850 +}
2851 +
2852 +static cmp_t
2853 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
2854 +{
2855 + assert("nikita-2199", n1 != NULL);
2856 + assert("nikita-2200", n2 != NULL);
2857 +
2858 + if (n1 == n2)
2859 + return EQUAL_TO;
2860 + while (1) {
2861 + n1 = carry_node_next(n1);
2862 + if (carry_node_end(level, n1))
2863 + return GREATER_THAN;
2864 + if (n1 == n2)
2865 + return LESS_THAN;
2866 + }
2867 + impossible("nikita-2201", "End of level reached");
2868 +}
2869 +
2870 +carry_node *find_carry_node(carry_level * level, const znode * node)
2871 +{
2872 + carry_node *scan;
2873 + carry_node *tmp_scan;
2874 +
2875 + assert("nikita-2202", level != NULL);
2876 + assert("nikita-2203", node != NULL);
2877 +
2878 + for_all_nodes(level, scan, tmp_scan) {
2879 + if (reiser4_carry_real(scan) == node)
2880 + return scan;
2881 + }
2882 + return NULL;
2883 +}
2884 +
2885 +znode *reiser4_carry_real(const carry_node * node)
2886 +{
2887 + assert("nikita-3061", node != NULL);
2888 +
2889 + return node->lock_handle.node;
2890 +}
2891 +
2892 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
2893 + const znode * node)
2894 +{
2895 + carry_node *base;
2896 + carry_node *scan;
2897 + carry_node *tmp_scan;
2898 + carry_node *proj;
2899 +
2900 + base = find_carry_node(doing, node);
2901 + assert("nikita-2204", base != NULL);
2902 +
2903 + for_all_nodes(todo, scan, tmp_scan) {
2904 + proj = find_carry_node(doing, scan->node);
2905 + assert("nikita-2205", proj != NULL);
2906 + if (carry_node_cmp(doing, proj, base) != LESS_THAN)
2907 + break;
2908 + }
2909 + return scan;
2910 +}
2911 +
2912 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
2913 + znode * node)
2914 +{
2915 + carry_node *reference;
2916 +
2917 + assert("nikita-2994", doing != NULL);
2918 + assert("nikita-2995", todo != NULL);
2919 + assert("nikita-2996", node != NULL);
2920 +
2921 + reference = insert_carry_node(doing, todo, node);
2922 + assert("nikita-2997", reference != NULL);
2923 +
2924 + return reiser4_add_carry(todo, POOLO_BEFORE, reference);
2925 +}
2926 +
2927 +/* like reiser4_post_carry(), but designed to be called from node plugin methods.
2928 + This function is different from reiser4_post_carry() in that it finds proper
2929 + place to insert node in the queue. */
2930 +carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
2931 + * passed down to node
2932 + * plugin */ ,
2933 + carry_opcode op /* opcode of operation */ ,
2934 + znode * node /* node on which this
2935 + * operation will operate */ ,
2936 + int apply_to_parent_p /* whether operation will
2937 + * operate directly on @node
2938 + * or on it parent. */ )
2939 +{
2940 + carry_op *result;
2941 + carry_node *child;
2942 +
2943 + assert("nikita-2207", info != NULL);
2944 + assert("nikita-2208", info->todo != NULL);
2945 +
2946 + if (info->doing == NULL)
2947 + return reiser4_post_carry(info->todo, op, node,
2948 + apply_to_parent_p);
2949 +
2950 + result = add_op(info->todo, POOLO_LAST, NULL);
2951 + if (IS_ERR(result))
2952 + return result;
2953 + child = add_carry_atplace(info->doing, info->todo, node);
2954 + if (IS_ERR(child)) {
2955 + reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
2956 + return (carry_op *) child;
2957 + }
2958 + result->node = child;
2959 + result->op = op;
2960 + child->parent = apply_to_parent_p;
2961 + if (ZF_ISSET(node, JNODE_ORPHAN))
2962 + child->left_before = 1;
2963 + child->node = node;
2964 + return result;
2965 +}
2966 +
2967 +/* lock all carry nodes in @level */
2968 +static int lock_carry_level(carry_level * level /* level to lock */ )
2969 +{
2970 + int result;
2971 + carry_node *node;
2972 + carry_node *tmp_node;
2973 +
2974 + assert("nikita-881", level != NULL);
2975 + assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
2976 +
2977 + /* lock nodes from left to right */
2978 + result = 0;
2979 + for_all_nodes(level, node, tmp_node) {
2980 + result = lock_carry_node(level, node);
2981 + if (result != 0)
2982 + break;
2983 + }
2984 + return result;
2985 +}
2986 +
2987 +/* Synchronize delimiting keys between @node and its left neighbor.
2988 +
2989 + To reduce contention on dk key and simplify carry code, we synchronize
2990 + delimiting keys only when carry ultimately leaves tree level (carrying
2991 + changes upward) and unlocks nodes at this level.
2992 +
2993 + This function first finds left neighbor of @node and then updates left
2994 +   neighbor's right delimiting key to coincide with least key in @node.
2995 +
2996 +*/
2997 +
2998 +ON_DEBUG(extern atomic_t delim_key_version;
2999 + )
3000 +
3001 +static void sync_dkeys(znode * spot /* node to update */ )
3002 +{
3003 + reiser4_key pivot;
3004 + reiser4_tree *tree;
3005 +
3006 + assert("nikita-1610", spot != NULL);
3007 + assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3008 +
3009 + tree = znode_get_tree(spot);
3010 + read_lock_tree(tree);
3011 + write_lock_dk(tree);
3012 +
3013 + assert("nikita-2192", znode_is_loaded(spot));
3014 +
3015 + /* sync left delimiting key of @spot with key in its leftmost item */
3016 + if (node_is_empty(spot))
3017 + pivot = *znode_get_rd_key(spot);
3018 + else
3019 + leftmost_key_in_node(spot, &pivot);
3020 +
3021 + znode_set_ld_key(spot, &pivot);
3022 +
3023 + /* there can be sequence of empty nodes pending removal on the left of
3024 + @spot. Scan them and update their left and right delimiting keys to
3025 + match left delimiting key of @spot. Also, update right delimiting
3026 + key of first non-empty left neighbor.
3027 + */
3028 + while (1) {
3029 + if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3030 + break;
3031 +
3032 + spot = spot->left;
3033 + if (spot == NULL)
3034 + break;
3035 +
3036 + znode_set_rd_key(spot, &pivot);
3037 + /* don't sink into the domain of another balancing */
3038 + if (!znode_is_write_locked(spot))
3039 + break;
3040 + if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3041 + znode_set_ld_key(spot, &pivot);
3042 + else
3043 + break;
3044 + }
3045 +
3046 + write_unlock_dk(tree);
3047 + read_unlock_tree(tree);
3048 +}
3049 +
3050 +/* unlock all carry nodes in @level */
3051 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3052 + int failure /* true if unlocking owing to
3053 + * failure */ )
3054 +{
3055 + carry_node *node;
3056 + carry_node *tmp_node;
3057 +
3058 + assert("nikita-889", level != NULL);
3059 +
3060 + if (!failure) {
3061 + znode *spot;
3062 +
3063 + spot = NULL;
3064 + /* update delimiting keys */
3065 + for_all_nodes(level, node, tmp_node) {
3066 + if (reiser4_carry_real(node) != spot) {
3067 + spot = reiser4_carry_real(node);
3068 + sync_dkeys(spot);
3069 + }
3070 + }
3071 + }
3072 +
3073 + /* nodes can be unlocked in arbitrary order. In preemptible
3074 + environment it's better to unlock in reverse order of locking,
3075 + though.
3076 + */
3077 + for_all_nodes_back(level, node, tmp_node) {
3078 + /* all allocated nodes should be already linked to their
3079 + parents at this moment. */
3080 + assert("nikita-1631",
3081 + ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3082 + JNODE_ORPHAN)));
3083 + ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3084 + unlock_carry_node(level, node, failure);
3085 + }
3086 + level->new_root = NULL;
3087 +}
3088 +
3089 +/* finish with @level
3090 +
3091 + Unlock nodes and release all allocated resources */
3092 +static void done_carry_level(carry_level * level /* level to finish */ )
3093 +{
3094 + carry_node *node;
3095 + carry_node *tmp_node;
3096 + carry_op *op;
3097 + carry_op *tmp_op;
3098 +
3099 + assert("nikita-1076", level != NULL);
3100 +
3101 + unlock_carry_level(level, 0);
3102 + for_all_nodes(level, node, tmp_node) {
3103 + assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3104 + assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3105 + reiser4_pool_free(&level->pool->node_pool, &node->header);
3106 + }
3107 + for_all_ops(level, op, tmp_op)
3108 + reiser4_pool_free(&level->pool->op_pool, &op->header);
3109 +}
3110 +
3111 +/* helper function to complete locking of carry node
3112 +
3113 + Finish locking of carry node. There are several ways in which new carry
3114 + node can be added into carry level and locked. Normal is through
3115 + lock_carry_node(), but also from find_{left|right}_neighbor(). This
3116 + function factors out common final part of all locking scenarios. It
3117 + supposes that @node -> lock_handle is lock handle for lock just taken and
3118 + fills ->real_node from this lock handle.
3119 +
3120 +*/
3121 +int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3122 +{
3123 + assert("nikita-1052", node != NULL);
3124 + assert("nikita-1187", reiser4_carry_real(node) != NULL);
3125 + assert("nikita-1188", !node->unlock);
3126 +
3127 + node->unlock = 1;
3128 + /* Load node content into memory and install node plugin by
3129 + looking at the node header.
3130 +
3131 + Most of the time this call is cheap because the node is
3132 + already in memory.
3133 +
3134 + Corresponding zrelse() is in unlock_carry_node()
3135 + */
3136 + return zload(reiser4_carry_real(node));
3137 +}
3138 +
3139 +/* lock carry node
3140 +
3141 + "Resolve" node to real znode, lock it and mark as locked.
3142 + This requires recursive locking of znodes.
3143 +
3144 + When operation is posted to the parent level, node it will be applied to is
3145 + not yet known. For example, when shifting data between two nodes,
3146 +   delimiting keys have to be updated in parent or parents of nodes involved. But
3147 +   their parents are not yet locked and, moreover, said nodes can be reparented
3148 + by concurrent balancing.
3149 +
3150 + To work around this, carry operation is applied to special "carry node"
3151 + rather than to the znode itself. Carry node consists of some "base" or
3152 + "reference" znode and flags indicating how to get to the target of carry
3153 + operation (->real_node field of carry_node) from base.
3154 +
3155 +*/
3156 +int lock_carry_node(carry_level * level /* level @node is in */ ,
3157 + carry_node * node /* node to lock */ )
3158 +{
3159 + int result;
3160 + znode *reference_point;
3161 + lock_handle lh;
3162 + lock_handle tmp_lh;
3163 + reiser4_tree *tree;
3164 +
3165 + assert("nikita-887", level != NULL);
3166 + assert("nikita-882", node != NULL);
3167 +
3168 + result = 0;
3169 + reference_point = node->node;
3170 + init_lh(&lh);
3171 + init_lh(&tmp_lh);
3172 + if (node->left_before) {
3173 + /* handling of new nodes, allocated on the previous level:
3174 +
3175 +		   some carry ops were probably posted from the new node, but
3176 + this node neither has parent pointer set, nor is
3177 + connected. This will be done in ->create_hook() for
3178 + internal item.
3179 +
3180 +		   Nonetheless, parent of new node has to be locked. To do
3181 + this, first go to the "left" in the carry order. This
3182 + depends on the decision to always allocate new node on the
3183 + right of existing one.
3184 +
3185 + Loop handles case when multiple nodes, all orphans, were
3186 + inserted.
3187 +
3188 + Strictly speaking, taking tree lock is not necessary here,
3189 + because all nodes scanned by loop in
3190 + find_begetting_brother() are write-locked by this thread,
3191 + and thus, their sibling linkage cannot change.
3192 +
3193 + */
3194 + tree = znode_get_tree(reference_point);
3195 + read_lock_tree(tree);
3196 + reference_point = find_begetting_brother(node, level)->node;
3197 + read_unlock_tree(tree);
3198 + assert("nikita-1186", reference_point != NULL);
3199 + }
3200 + if (node->parent && (result == 0)) {
3201 + result =
3202 + reiser4_get_parent(&tmp_lh, reference_point,
3203 + ZNODE_WRITE_LOCK);
3204 + if (result != 0) {
3205 + ; /* nothing */
3206 + } else if (znode_get_level(tmp_lh.node) == 0) {
3207 + assert("nikita-1347", znode_above_root(tmp_lh.node));
3208 + result = add_new_root(level, node, tmp_lh.node);
3209 + if (result == 0) {
3210 + reference_point = level->new_root;
3211 + move_lh(&lh, &node->lock_handle);
3212 + }
3213 + } else if ((level->new_root != NULL)
3214 + && (level->new_root !=
3215 + znode_parent_nolock(reference_point))) {
3216 +			/* parent of node exists, but this level already
3217 + created different new root, so */
3218 + warning("nikita-1109",
3219 + /* it should be "radicis", but tradition is
3220 + tradition. do banshees read latin? */
3221 + "hodie natus est radici frater");
3222 + result = -EIO;
3223 + } else {
3224 + move_lh(&lh, &tmp_lh);
3225 + reference_point = lh.node;
3226 + }
3227 + }
3228 + if (node->left && (result == 0)) {
3229 + assert("nikita-1183", node->parent);
3230 + assert("nikita-883", reference_point != NULL);
3231 + result =
3232 + reiser4_get_left_neighbor(&tmp_lh, reference_point,
3233 + ZNODE_WRITE_LOCK,
3234 + GN_CAN_USE_UPPER_LEVELS);
3235 + if (result == 0) {
3236 + done_lh(&lh);
3237 + move_lh(&lh, &tmp_lh);
3238 + reference_point = lh.node;
3239 + }
3240 + }
3241 + if (!node->parent && !node->left && !node->left_before) {
3242 + result =
3243 + longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3244 + ZNODE_LOCK_HIPRI);
3245 + }
3246 + if (result == 0) {
3247 + move_lh(&node->lock_handle, &lh);
3248 + result = lock_carry_node_tail(node);
3249 + }
3250 + done_lh(&tmp_lh);
3251 + done_lh(&lh);
3252 + return result;
3253 +}
3254 +
3255 +/* release a lock on &carry_node.
3256 +
3257 +   Release if necessary lock on @node. This operation is the pair of
3258 + lock_carry_node() and is idempotent: you can call it more than once on the
3259 + same node.
3260 +
3261 +*/
3262 +static void
3263 +unlock_carry_node(carry_level * level,
3264 + carry_node * node /* node to be released */ ,
3265 + int failure /* 0 if node is unlocked due
3266 + * to some error */ )
3267 +{
3268 + znode *real_node;
3269 +
3270 + assert("nikita-884", node != NULL);
3271 +
3272 + real_node = reiser4_carry_real(node);
3273 + /* pair to zload() in lock_carry_node_tail() */
3274 + zrelse(real_node);
3275 + if (node->unlock && (real_node != NULL)) {
3276 + assert("nikita-899", real_node == node->lock_handle.node);
3277 + longterm_unlock_znode(&node->lock_handle);
3278 + }
3279 + if (failure) {
3280 + if (node->deallocate && (real_node != NULL)) {
3281 + /* free node in bitmap
3282 +
3283 + Prepare node for removal. Last zput() will finish
3284 + with it.
3285 + */
3286 + ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3287 + }
3288 + if (node->free) {
3289 + assert("nikita-2177",
3290 + list_empty_careful(&node->lock_handle.locks_link));
3291 + assert("nikita-2112",
3292 + list_empty_careful(&node->lock_handle.owners_link));
3293 + reiser4_pool_free(&level->pool->node_pool,
3294 + &node->header);
3295 + }
3296 + }
3297 +}
3298 +
3299 +/* fatal_carry_error() - all-catching error handling function
3300 +
3301 +   It is possible that carry faces unrecoverable error, like inability to
3302 + insert pointer at the internal level. Our simple solution is just panic in
3303 + this situation. More sophisticated things like attempt to remount
3304 +   file-system as read-only can be implemented without much difficulty.
3305 +
3306 + It is believed, that:
3307 +
3308 + 1. in stead of panicking, all current transactions can be aborted rolling
3309 + system back to the consistent state.
3310 +
3311 +Umm, if you simply panic without doing anything more at all, then all current
3312 +transactions are aborted and the system is rolled back to a consistent state,
3313 +by virtue of the design of the transactional mechanism. Well, wait, let's be
3314 +precise. If an internal node is corrupted on disk due to hardware failure,
3315 +then there may be no consistent state that can be rolled back to, so instead
3316 +we should say that it will rollback the transactions, which barring other
3317 +factors means rolling back to a consistent state.
3318 +
3319 +# Nikita: there is a subtle difference between panic and aborting
3320 +# transactions: machine doesn't reboot. Processes aren't killed. Processes
3321 +# don't using reiser4 (not that we care about such processes), or using other
3322 +# reiser4 mounts (about them we do care) will simply continue to run. With
3323 +# some luck, even application using aborted file system can survive: it will
3324 +# get some error, like EBADF, from each file descriptor on failed file system,
3325 +# but applications that do care about tolerance will cope with this (squid
3326 +# will).
3327 +
3328 +It would be a nice feature though to support rollback without rebooting
3329 +followed by remount, but this can wait for later versions.
3330 +
3331 + 2. once isolated transactions will be implemented it will be possible to
3332 + roll back offending transaction.
3333 +
3334 +2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about
3335 +it more before deciding if it should be done. -Hans
3336 +
3337 +*/
3338 +static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3339 + * where
3340 + * unrecoverable
3341 + * error
3342 + * occurred */ ,
3343 + int ecode /* error code */ )
3344 +{
3345 + assert("nikita-1230", doing != NULL);
3346 + assert("nikita-1231", ecode < 0);
3347 +
3348 + reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3349 +}
3350 +
3351 +/* add new root to the tree
3352 +
3353 + This function itself only manages changes in carry structures and delegates
3354 + all hard work (allocation of znode for new root, changes of parent and
3355 + sibling pointers to the reiser4_add_tree_root().
3356 +
3357 + Locking: old tree root is locked by carry at this point. Fake znode is also
3358 + locked.
3359 +
3360 +*/
3361 +static int add_new_root(carry_level * level /* carry level in context of which
3362 + * operation is performed */ ,
3363 + carry_node * node /* carry node for existing root */ ,
3364 + znode * fake /* "fake" znode already locked by
3365 + * us */ )
3366 +{
3367 + int result;
3368 +
3369 + assert("nikita-1104", level != NULL);
3370 + assert("nikita-1105", node != NULL);
3371 +
3372 + assert("nikita-1403", znode_is_write_locked(node->node));
3373 + assert("nikita-1404", znode_is_write_locked(fake));
3374 +
3375 + /* trying to create new root. */
3376 + /* @node is root and it's already locked by us. This
3377 + means that nobody else can be trying to add/remove
3378 + tree root right now.
3379 + */
3380 + if (level->new_root == NULL)
3381 + level->new_root = reiser4_add_tree_root(node->node, fake);
3382 + if (!IS_ERR(level->new_root)) {
3383 + assert("nikita-1210", znode_is_root(level->new_root));
3384 + node->deallocate = 1;
3385 + result =
3386 + longterm_lock_znode(&node->lock_handle, level->new_root,
3387 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3388 + if (result == 0)
3389 + zput(level->new_root);
3390 + } else {
3391 + result = PTR_ERR(level->new_root);
3392 + level->new_root = NULL;
3393 + }
3394 + return result;
3395 +}
3396 +
3397 +/* allocate new znode and add the operation that inserts the
3398 + pointer to it into the parent node into the todo level
3399 +
3400 + Allocate new znode, add it into carry queue and post into @todo queue
3401 + request to add pointer to new node into its parent.
3402 +
3403 +   This is carry related routine that calls reiser4_new_node() to allocate new
3404 + node.
3405 +*/
3406 +carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3407 + * node */ ,
3408 + carry_node * ref /* carry node after which new
3409 + * carry node is to be inserted
3410 + * into queue. This affects
3411 + * locking. */ ,
3412 + carry_level * doing /* carry queue where new node is
3413 + * to be added */ ,
3414 + carry_level * todo /* carry queue where COP_INSERT
3415 + * operation to add pointer to
3416 + * new node will ne added */ )
3417 +{
3418 + carry_node *fresh;
3419 + znode *new_znode;
3420 + carry_op *add_pointer;
3421 + carry_plugin_info info;
3422 +
3423 + assert("nikita-1048", brother != NULL);
3424 + assert("nikita-1049", todo != NULL);
3425 +
3426 + /* There is a lot of possible variations here: to what parent
3427 + new node will be attached and where. For simplicity, always
3428 + do the following:
3429 +
3430 + (1) new node and @brother will have the same parent.
3431 +
3432 + (2) new node is added on the right of @brother
3433 +
3434 + */
3435 +
3436 + fresh = reiser4_add_carry_skip(doing,
3437 + ref ? POOLO_AFTER : POOLO_LAST, ref);
3438 + if (IS_ERR(fresh))
3439 + return fresh;
3440 +
3441 + fresh->deallocate = 1;
3442 + fresh->free = 1;
3443 +
3444 + new_znode = reiser4_new_node(brother, znode_get_level(brother));
3445 + if (IS_ERR(new_znode))
3446 + /* @fresh will be deallocated automatically by error
3447 + handling code in the caller. */
3448 + return (carry_node *) new_znode;
3449 +
3450 + /* new_znode returned znode with x_count 1. Caller has to decrease
3451 + it. make_space() does. */
3452 +
3453 + ZF_SET(new_znode, JNODE_ORPHAN);
3454 + fresh->node = new_znode;
3455 +
3456 + while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3457 + ref = carry_node_prev(ref);
3458 + assert("nikita-1606", !carry_node_end(doing, ref));
3459 + }
3460 +
3461 + info.todo = todo;
3462 + info.doing = doing;
3463 + add_pointer = node_post_carry(&info, COP_INSERT,
3464 + reiser4_carry_real(ref), 1);
3465 + if (IS_ERR(add_pointer)) {
3466 + /* no need to deallocate @new_znode here: it will be
3467 + deallocated during carry error handling. */
3468 + return (carry_node *) add_pointer;
3469 + }
3470 +
3471 + add_pointer->u.insert.type = COPT_CHILD;
3472 + add_pointer->u.insert.child = fresh;
3473 + add_pointer->u.insert.brother = brother;
3474 + /* initially new node spawns empty key range */
3475 + write_lock_dk(znode_get_tree(brother));
3476 + znode_set_ld_key(new_znode,
3477 + znode_set_rd_key(new_znode,
3478 + znode_get_rd_key(brother)));
3479 + write_unlock_dk(znode_get_tree(brother));
3480 + return fresh;
3481 +}
3482 +
3483 +/* DEBUGGING FUNCTIONS.
3484 +
3485 + Probably we also should leave them on even when
3486 + debugging is turned off to print dumps at errors.
3487 +*/
3488 +#if REISER4_DEBUG
3489 +static int carry_level_invariant(carry_level * level, carry_queue_state state)
3490 +{
3491 + carry_node *node;
3492 + carry_node *tmp_node;
3493 +
3494 + if (level == NULL)
3495 + return 0;
3496 +
3497 + if (level->track_type != 0 &&
3498 + level->track_type != CARRY_TRACK_NODE &&
3499 + level->track_type != CARRY_TRACK_CHANGE)
3500 + return 0;
3501 +
3502 + /* check that nodes are in ascending order */
3503 + for_all_nodes(level, node, tmp_node) {
3504 + znode *left;
3505 + znode *right;
3506 +
3507 + reiser4_key lkey;
3508 + reiser4_key rkey;
3509 +
3510 + if (node != carry_node_front(level)) {
3511 + if (state == CARRY_TODO) {
3512 + right = node->node;
3513 + left = carry_node_prev(node)->node;
3514 + } else {
3515 + right = reiser4_carry_real(node);
3516 + left = reiser4_carry_real(carry_node_prev(node));
3517 + }
3518 + if (right == NULL || left == NULL)
3519 + continue;
3520 + if (node_is_empty(right) || node_is_empty(left))
3521 + continue;
3522 + if (!keyle(leftmost_key_in_node(left, &lkey),
3523 + leftmost_key_in_node(right, &rkey))) {
3524 + warning("", "wrong key order");
3525 + return 0;
3526 + }
3527 + }
3528 + }
3529 + return 1;
3530 +}
3531 +#endif
3532 +
3533 +/* get symbolic name for boolean */
3534 +static const char *tf(int boolean /* truth value */ )
3535 +{
3536 + return boolean ? "t" : "f";
3537 +}
3538 +
3539 +/* symbolic name for carry operation */
3540 +static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3541 +{
3542 + switch (op) {
3543 + case COP_INSERT:
3544 + return "COP_INSERT";
3545 + case COP_DELETE:
3546 + return "COP_DELETE";
3547 + case COP_CUT:
3548 + return "COP_CUT";
3549 + case COP_PASTE:
3550 + return "COP_PASTE";
3551 + case COP_UPDATE:
3552 + return "COP_UPDATE";
3553 + case COP_EXTENT:
3554 + return "COP_EXTENT";
3555 + case COP_INSERT_FLOW:
3556 + return "COP_INSERT_FLOW";
3557 + default:{
3558 + /* not mt safe, but who cares? */
3559 + static char buf[20];
3560 +
3561 + sprintf(buf, "unknown op: %x", op);
3562 + return buf;
3563 + }
3564 + }
3565 +}
3566 +
3567 +/* dump information about carry node */
3568 +static void print_carry(const char *prefix /* prefix to print */ ,
3569 + carry_node * node /* node to print */ )
3570 +{
3571 + if (node == NULL) {
3572 + printk("%s: null\n", prefix);
3573 + return;
3574 + }
3575 + printk
3576 + ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3577 + prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3578 + tf(node->free), tf(node->deallocate));
3579 +}
3580 +
3581 +/* dump information about carry operation */
3582 +static void print_op(const char *prefix /* prefix to print */ ,
3583 + carry_op * op /* operation to print */ )
3584 +{
3585 + if (op == NULL) {
3586 + printk("%s: null\n", prefix);
3587 + return;
3588 + }
3589 + printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3590 + print_carry("\tnode", op->node);
3591 + switch (op->op) {
3592 + case COP_INSERT:
3593 + case COP_PASTE:
3594 + print_coord("\tcoord",
3595 + op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3596 + reiser4_print_key("\tkey",
3597 + op->u.insert.d ? op->u.insert.d->key : NULL);
3598 + print_carry("\tchild", op->u.insert.child);
3599 + break;
3600 + case COP_DELETE:
3601 + print_carry("\tchild", op->u.delete.child);
3602 + break;
3603 + case COP_CUT:
3604 + if (op->u.cut_or_kill.is_cut) {
3605 + print_coord("\tfrom",
3606 + op->u.cut_or_kill.u.kill->params.from, 0);
3607 + print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3608 + 0);
3609 + } else {
3610 + print_coord("\tfrom",
3611 + op->u.cut_or_kill.u.cut->params.from, 0);
3612 + print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3613 + 0);
3614 + }
3615 + break;
3616 + case COP_UPDATE:
3617 + print_carry("\tleft", op->u.update.left);
3618 + break;
3619 + default:
3620 + /* do nothing */
3621 + break;
3622 + }
3623 +}
3624 +
3625 +/* dump information about all nodes and operations in a @level */
3626 +static void print_level(const char *prefix /* prefix to print */ ,
3627 + carry_level * level /* level to print */ )
3628 +{
3629 + carry_node *node;
3630 + carry_node *tmp_node;
3631 + carry_op *op;
3632 + carry_op *tmp_op;
3633 +
3634 + if (level == NULL) {
3635 + printk("%s: null\n", prefix);
3636 + return;
3637 + }
3638 + printk("%s: %p, restartable: %s\n",
3639 + prefix, level, tf(level->restartable));
3640 +
3641 + for_all_nodes(level, node, tmp_node)
3642 + print_carry("\tcarry node", node);
3643 + for_all_ops(level, op, tmp_op)
3644 + print_op("\tcarry op", op);
3645 +}
3646 +
3647 +/* Make Linus happy.
3648 + Local variables:
3649 + c-indentation-style: "K&R"
3650 + mode-name: "LC"
3651 + c-basic-offset: 8
3652 + tab-width: 8
3653 + fill-column: 120
3654 + scroll-step: 1
3655 + End:
3656 +*/
3657 diff -urN linux-2.6.20.orig/fs/reiser4/carry.h linux-2.6.20/fs/reiser4/carry.h
3658 --- linux-2.6.20.orig/fs/reiser4/carry.h 1970-01-01 03:00:00.000000000 +0300
3659 +++ linux-2.6.20/fs/reiser4/carry.h 2007-05-06 14:50:43.690973225 +0400
3660 @@ -0,0 +1,442 @@
3661 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3662 +
3663 +/* Functions and data types to "carry" tree modification(s) upward.
3664 + See fs/reiser4/carry.c for details. */
3665 +
3666 +#if !defined( __FS_REISER4_CARRY_H__ )
3667 +#define __FS_REISER4_CARRY_H__
3668 +
3669 +#include "forward.h"
3670 +#include "debug.h"
3671 +#include "pool.h"
3672 +#include "znode.h"
3673 +
3674 +#include <linux/types.h>
3675 +
3676 +/* &carry_node - "location" of carry node.
3677 +
3678 + "location" of node that is involved or going to be involved into
3679 + carry process. Node where operation will be carried to on the
3680 + parent level cannot be recorded explicitly. Operation will be carried
3681 + usually to the parent of some node (where changes are performed at
3682 + the current level) or, to the left neighbor of its parent. But while
3683 + modifications are performed at the current level, parent may
3684 + change. So, we have to allow some indirection (or, positevly,
3685 + flexibility) in locating carry nodes.
3686 +
3687 +*/
3688 +typedef struct carry_node {
3689 + /* pool linkage */
3690 + reiser4_pool_header header;
3691 +
3692 + /* base node from which real_node is calculated. See
3693 + fs/reiser4/carry.c:lock_carry_node(). */
3694 + znode *node;
3695 +
3696 + /* how to get ->real_node */
3697 + /* to get ->real_node obtain parent of ->node */
3698 + __u32 parent:1;
3699 + /* to get ->real_node obtain left neighbor of parent of
3700 + ->node */
3701 + __u32 left:1;
3702 + __u32 left_before:1;
3703 +
3704 + /* locking */
3705 +
3706 + /* this node was locked by carry process and should be
3707 + unlocked when carry leaves a level */
3708 + __u32 unlock:1;
3709 +
3710 + /* disk block for this node was allocated by carry process and
3711 + should be deallocated when carry leaves a level */
3712 + __u32 deallocate:1;
3713 + /* this carry node was allocated by carry process and should be
3714 + freed when carry leaves a level */
3715 + __u32 free:1;
3716 +
3717 + /* type of lock we want to take on this node */
3718 + lock_handle lock_handle;
3719 +} carry_node;
3720 +
3721 +/* &carry_opcode - elementary operations that can be carried upward
3722 +
3723 + Operations that carry() can handle. This list is supposed to be
3724 + expanded.
3725 +
3726 + Each carry operation (cop) is handled by appropriate function defined
3727 + in fs/reiser4/carry.c. For example COP_INSERT is handled by
3728 + fs/reiser4/carry.c:carry_insert() etc. These functions in turn
3729 + call plugins of nodes affected by operation to modify nodes' content
3730 + and to gather operations to be performed on the next level.
3731 +
3732 +*/
3733 +typedef enum {
3734 + /* insert new item into node. */
3735 + COP_INSERT,
3736 + /* delete pointer from parent node */
3737 + COP_DELETE,
3738 + /* remove part of or whole node. */
3739 + COP_CUT,
3740 + /* increase size of item. */
3741 + COP_PASTE,
3742 + /* insert extent (that is sequence of unformatted nodes). */
3743 + COP_EXTENT,
3744 + /* update delimiting key in least common ancestor of two
3745 + nodes. This is performed when items are moved between two
3746 + nodes.
3747 + */
3748 + COP_UPDATE,
3749 + /* insert flow */
3750 + COP_INSERT_FLOW,
3751 + COP_LAST_OP,
3752 +} carry_opcode;
3753 +
3754 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
3755 +
3756 +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3757 + item is determined. */
3758 +typedef enum {
3759 + /* target item is one containing pointer to the ->child node */
3760 + COPT_CHILD,
3761 + /* target item is given explicitly by @coord */
3762 + COPT_ITEM_DATA,
3763 + /* target item is given by key */
3764 + COPT_KEY,
3765 + /* see insert_paste_common() for more comments on this. */
3766 + COPT_PASTE_RESTARTED,
3767 +} cop_insert_pos_type;
3768 +
3769 +/* flags to cut and delete */
3770 +typedef enum {
3771 + /* don't kill node even if it became completely empty as results of
3772 + * cut. This is needed for eottl handling. See carry_extent() for
3773 + * details. */
3774 + DELETE_RETAIN_EMPTY = (1 << 0)
3775 +} cop_delete_flag;
3776 +
3777 +/*
3778 + * carry() implements "lock handle tracking" feature.
3779 + *
3780 + * Callers supply carry with node where to perform initial operation and lock
3781 + * handle on this node. Trying to optimize node utilization carry may actually
3782 + * move insertion point to different node. Callers expect that lock handle
3783 + * will be transferred to the new node also.
3784 + *
3785 + */
3786 +typedef enum {
3787 + /* transfer lock handle along with insertion point */
3788 + CARRY_TRACK_CHANGE = 1,
3789 + /* acquire new lock handle to the node where insertion point is. This
3790 + * is used when carry() client doesn't initially possess lock handle
3791 + * on the insertion point node, for example, by extent insertion
3792 + * code. See carry_extent(). */
3793 + CARRY_TRACK_NODE = 2
3794 +} carry_track_type;
3795 +
3796 +/* data supplied to COP_{INSERT|PASTE} by callers */
3797 +typedef struct carry_insert_data {
3798 + /* position where new item is to be inserted */
3799 + coord_t *coord;
3800 + /* new item description */
3801 + reiser4_item_data *data;
3802 + /* key of new item */
3803 + const reiser4_key *key;
3804 +} carry_insert_data;
3805 +
3806 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
3807 +struct cut_kill_params {
3808 + /* coord where cut starts (inclusive) */
3809 + coord_t *from;
3810 + /* coord where cut stops (inclusive, this item/unit will also be
3811 + * cut) */
3812 + coord_t *to;
3813 + /* starting key. This is necessary when item and unit pos don't
3814 + * uniquely identify what portion or tree to remove. For example, this
3815 + * indicates what portion of extent unit will be affected. */
3816 + const reiser4_key *from_key;
3817 + /* exclusive stop key */
3818 + const reiser4_key *to_key;
3819 + /* if this is not NULL, smallest actually removed key is stored
3820 + * here. */
3821 + reiser4_key *smallest_removed;
3822 + /* kill_node_content() is called for file truncate */
3823 + int truncate;
3824 +};
3825 +
3826 +struct carry_cut_data {
3827 + struct cut_kill_params params;
3828 +};
3829 +
3830 +struct carry_kill_data {
3831 + struct cut_kill_params params;
3832 + /* parameter to be passed to the ->kill_hook() method of item
3833 + * plugin */
3834 + /*void *iplug_params; *//* FIXME: unused currently */
3835 + /* if not NULL---inode whose items are being removed. This is needed
3836 + * for ->kill_hook() of extent item to update VM structures when
3837 + * removing pages. */
3838 + struct inode *inode;
3839 + /* sibling list maintenance is complicated by existence of eottl. When
3840 + * eottl whose left and right neighbors are formatted leaves is
3841 + * removed, one has to connect said leaves in the sibling list. This
3842 + * cannot be done when extent removal is just started as locking rules
3843 + * require sibling list update to happen atomically with removal of
3844 + * extent item. Therefore: 1. pointers to left and right neighbors
3845 + * have to be passed down to the ->kill_hook() of extent item, and
3846 + * 2. said neighbors have to be locked. */
3847 + lock_handle *left;
3848 + lock_handle *right;
3849 + /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
3850 + unsigned flags;
3851 + char *buf;
3852 +};
3853 +
3854 +/* &carry_tree_op - operation to "carry" upward.
3855 +
3856 + Description of an operation we want to "carry" to the upper level of
3857 + a tree: e.g, when we insert something and there is not enough space
3858 + we allocate a new node and "carry" the operation of inserting a
3859 + pointer to the new node to the upper level, on removal of empty node,
3860 + we carry up operation of removing appropriate entry from parent.
3861 +
3862 + There are two types of carry ops: when adding or deleting a node, the
3863 + node at the parent level where appropriate modification has to be
3864 + performed is known in advance. When shifting items between nodes
3865 + (split, merge), delimiting key should be changed in the least common
3866 + parent of the nodes involved that is not known in advance.
3867 +
3868 + For the operations of the first type we store in &carry_op pointer to
3869 + the &carry_node at the parent level. For the operation of the second
3870 + type we store &carry_node or parents of the left and right nodes
3871 + modified and keep track of them upward until they coincide.
3872 +
3873 +*/
3874 +typedef struct carry_op {
3875 + /* pool linkage */
3876 + reiser4_pool_header header;
3877 + carry_opcode op;
3878 + /* node on which operation is to be performed:
3879 +
3880 + for insert, paste: node where new item is to be inserted
3881 +
3882 + for delete: node where pointer is to be deleted
3883 +
3884 + for cut: node to cut from
3885 +
3886 + for update: node where delimiting key is to be modified
3887 +
3888 + for modify: parent of modified node
3889 +
3890 + */
3891 + carry_node *node;
3892 + union {
3893 + struct {
3894 + /* (sub-)type of insertion/paste. Taken from
3895 + cop_insert_pos_type. */
3896 + __u8 type;
3897 + /* various operation flags. Taken from
3898 + cop_insert_flag. */
3899 + __u8 flags;
3900 + carry_insert_data *d;
3901 + carry_node *child;
3902 + znode *brother;
3903 + } insert, paste, extent;
3904 +
3905 + struct {
3906 + int is_cut;
3907 + union {
3908 + carry_kill_data *kill;
3909 + carry_cut_data *cut;
3910 + } u;
3911 + } cut_or_kill;
3912 +
3913 + struct {
3914 + carry_node *left;
3915 + } update;
3916 + struct {
3917 + /* changed child */
3918 + carry_node *child;
3919 + /* bitmask of changes. See &cop_modify_flag */
3920 + __u32 flag;
3921 + } modify;
3922 + struct {
3923 + /* flags to deletion operation. Are taken from
3924 + cop_delete_flag */
3925 + __u32 flags;
3926 + /* child to delete from parent. If this is
3927 + NULL, delete op->node. */
3928 + carry_node *child;
3929 + } delete;
3930 + struct {
3931 + /* various operation flags. Taken from
3932 + cop_insert_flag. */
3933 + __u32 flags;
3934 + flow_t *flow;
3935 + coord_t *insert_point;
3936 + reiser4_item_data *data;
3937 + /* flow insertion is limited by number of new blocks
3938 + added in that operation which do not get any data
3939 + but part of flow. This limit is set by macro
3940 + CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
3941 + of nodes added already during one carry_flow */
3942 + int new_nodes;
3943 + } insert_flow;
3944 + } u;
3945 +} carry_op;
3946 +
3947 +/* &carry_op_pool - preallocated pool of carry operations, and nodes */
3948 +typedef struct carry_pool {
3949 + carry_op op[CARRIES_POOL_SIZE];
3950 + reiser4_pool op_pool;
3951 + carry_node node[NODES_LOCKED_POOL_SIZE];
3952 + reiser4_pool node_pool;
3953 +} carry_pool;
3954 +
3955 +/* &carry_tree_level - carry process on given level
3956 +
3957 + Description of balancing process on the given level.
3958 +
3959 + No need for locking here, as carry_tree_level is essentially per
3960 + thread thing (for now).
3961 +
3962 +*/
3963 +struct carry_level {
3964 + /* this level may be restarted */
3965 + __u32 restartable:1;
3966 + /* list of carry nodes on this level, ordered by key order */
3967 + struct list_head nodes;
3968 + struct list_head ops;
3969 + /* pool where new objects are allocated from */
3970 + carry_pool *pool;
3971 + int ops_num;
3972 + int nodes_num;
3973 + /* new root created on this level, if any */
3974 + znode *new_root;
3975 + /* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.)
3976 + when they want ->tracked to automagically wander to the node where
3977 + insertion point moved after insert or paste.
3978 + */
3979 + carry_track_type track_type;
3980 + /* lock handle supplied by user that we are tracking. See
3981 + above. */
3982 + lock_handle *tracked;
3983 +};
3984 +
3985 +/* information carry passes to plugin methods that may add new operations to
3986 + the @todo queue */
3987 +struct carry_plugin_info {
3988 + carry_level *doing;
3989 + carry_level *todo;
3990 +};
3991 +
3992 +int reiser4_carry(carry_level * doing, carry_level * done);
3993 +
3994 +carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
3995 + carry_node * reference);
3996 +carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
3997 + carry_node * reference);
3998 +
3999 +extern carry_node *insert_carry_node(carry_level * doing,
4000 + carry_level * todo, const znode * node);
4001 +
4002 +extern carry_pool *init_carry_pool(int);
4003 +extern void done_carry_pool(carry_pool * pool);
4004 +
4005 +extern void init_carry_level(carry_level * level, carry_pool * pool);
4006 +
4007 +extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4008 + znode * node, int apply_to_parent);
4009 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4010 + znode * node, int apply_to_parent_p);
4011 +
4012 +carry_node *add_new_znode(znode * brother, carry_node * reference,
4013 + carry_level * doing, carry_level * todo);
4014 +
4015 +carry_node *find_carry_node(carry_level * level, const znode * node);
4016 +
4017 +extern znode *reiser4_carry_real(const carry_node * node);
4018 +
4019 +/* helper macros to iterate over carry queues */
4020 +
4021 +#define carry_node_next( node ) \
4022 + list_entry((node)->header.level_linkage.next, carry_node, \
4023 + header.level_linkage)
4024 +
4025 +#define carry_node_prev( node ) \
4026 + list_entry((node)->header.level_linkage.prev, carry_node, \
4027 + header.level_linkage)
4028 +
4029 +#define carry_node_front( level ) \
4030 + list_entry((level)->nodes.next, carry_node, header.level_linkage)
4031 +
4032 +#define carry_node_back( level ) \
4033 + list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4034 +
4035 +#define carry_node_end( level, node ) \
4036 + (&(level)->nodes == &(node)->header.level_linkage)
4037 +
4038 +/* macro to iterate over all operations in a @level */
4039 +#define for_all_ops( level /* carry level (of type carry_level *) */, \
4040 + op /* pointer to carry operation, modified by loop (of \
4041 + * type carry_op *) */, \
4042 + tmp /* pointer to carry operation (of type carry_op *), \
4043 + * used to make iterator stable in the face of \
4044 + * deletions from the level */ ) \
4045 +for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4046 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4047 + &op->header.level_linkage != &level->ops; \
4048 + op = tmp, \
4049 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4050 +
4051 +#if 0
4052 +for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4053 + tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4054 + ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4055 + op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4056 +#endif
4057 +
4058 +/* macro to iterate over all nodes in a @level */
4059 +#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4060 + node /* pointer to carry node, modified by loop (of \
4061 + * type carry_node *) */, \
4062 + tmp /* pointer to carry node (of type carry_node *), \
4063 + * used to make iterator stable in the face of \
4064 + * deletions from the level */ ) \
4065 +for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4066 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4067 + &node->header.level_linkage != &level->nodes; \
4068 + node = tmp, \
4069 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4070 +
4071 +#if 0
4072 +for( node = carry_node_front( level ), \
4073 + tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4074 + node = tmp, tmp = carry_node_next( node ) )
4075 +#endif
4076 +
4077 +/* macro to iterate over all nodes in a @level in reverse order
4078 +
4079 + This is used, because nodes are unlocked in reversed order of locking */
4080 +#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4081 + node /* pointer to carry node, modified by loop \
4082 + * (of type carry_node *) */, \
4083 + tmp /* pointer to carry node (of type carry_node \
4084 + * *), used to make iterator stable in the \
4085 + * face of deletions from the level */ ) \
4086 +for( node = carry_node_back( level ), \
4087 + tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4088 + node = tmp, tmp = carry_node_prev( node ) )
4089 +
4090 +/* __FS_REISER4_CARRY_H__ */
4091 +#endif
4092 +
4093 +/* Make Linus happy.
4094 + Local variables:
4095 + c-indentation-style: "K&R"
4096 + mode-name: "LC"
4097 + c-basic-offset: 8
4098 + tab-width: 8
4099 + fill-column: 120
4100 + scroll-step: 1
4101 + End:
4102 +*/
4103 diff -urN linux-2.6.20.orig/fs/reiser4/carry_ops.c linux-2.6.20/fs/reiser4/carry_ops.c
4104 --- linux-2.6.20.orig/fs/reiser4/carry_ops.c 1970-01-01 03:00:00.000000000 +0300
4105 +++ linux-2.6.20/fs/reiser4/carry_ops.c 2007-05-06 14:50:43.694974475 +0400
4106 @@ -0,0 +1,2131 @@
4107 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4108 +
4109 +/* implementation of carry operations */
4110 +
4111 +#include "forward.h"
4112 +#include "debug.h"
4113 +#include "key.h"
4114 +#include "coord.h"
4115 +#include "plugin/item/item.h"
4116 +#include "plugin/node/node.h"
4117 +#include "jnode.h"
4118 +#include "znode.h"
4119 +#include "block_alloc.h"
4120 +#include "tree_walk.h"
4121 +#include "pool.h"
4122 +#include "tree_mod.h"
4123 +#include "carry.h"
4124 +#include "carry_ops.h"
4125 +#include "tree.h"
4126 +#include "super.h"
4127 +#include "reiser4.h"
4128 +
4129 +#include <linux/types.h>
4130 +#include <linux/err.h>
4131 +
4132 +static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4133 + carry_level * doing, carry_level * todo,
4134 + unsigned int including_insert_coord_p);
4135 +
4136 +extern int lock_carry_node(carry_level * level, carry_node * node);
4137 +extern int lock_carry_node_tail(carry_node * node);
4138 +
4139 +/* find left neighbor of a carry node
4140 +
4141 + Look for left neighbor of @node and add it to the @doing queue. See
4142 + comments in the body.
4143 +
4144 +*/
4145 +static carry_node *find_left_neighbor(carry_op * op /* node to find left
4146 + * neighbor of */ ,
4147 + carry_level * doing /* level to scan */ )
4148 +{
4149 + int result;
4150 + carry_node *node;
4151 + carry_node *left;
4152 + int flags;
4153 + reiser4_tree *tree;
4154 +
4155 + node = op->node;
4156 +
4157 + tree = current_tree;
4158 + read_lock_tree(tree);
4159 + /* first, check whether left neighbor is already in a @doing queue */
4160 + if (reiser4_carry_real(node)->left != NULL) {
4161 + /* NOTE: there is locking subtlety here. Look into
4162 + * find_right_neighbor() for more info */
4163 + if (find_carry_node(doing,
4164 + reiser4_carry_real(node)->left) != NULL) {
4165 + read_unlock_tree(tree);
4166 + left = node;
4167 + do {
4168 + left = list_entry(left->header.level_linkage.prev,
4169 + carry_node, header.level_linkage);
4170 + assert("nikita-3408", !carry_node_end(doing,
4171 + left));
4172 + } while (reiser4_carry_real(left) ==
4173 + reiser4_carry_real(node));
4174 + return left;
4175 + }
4176 + }
4177 + read_unlock_tree(tree);
4178 +
4179 + left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4180 + if (IS_ERR(left))
4181 + return left;
4182 +
4183 + left->node = node->node;
4184 + left->free = 1;
4185 +
4186 + flags = GN_TRY_LOCK;
4187 + if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4188 + flags |= GN_NO_ALLOC;
4189 +
4190 + /* then, feeling lucky, peek left neighbor in the cache. */
4191 + result = reiser4_get_left_neighbor(&left->lock_handle,
4192 + reiser4_carry_real(node),
4193 + ZNODE_WRITE_LOCK, flags);
4194 + if (result == 0) {
4195 + /* ok, node found and locked. */
4196 + result = lock_carry_node_tail(left);
4197 + if (result != 0)
4198 + left = ERR_PTR(result);
4199 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4200 + /* node is leftmost node in a tree, or neighbor wasn't in
4201 + cache, or there is an extent on the left. */
4202 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4203 + left = NULL;
4204 + } else if (doing->restartable) {
4205 + /* if left neighbor is locked, and level is restartable, add
4206 + new node to @doing and restart. */
4207 + assert("nikita-913", node->parent != 0);
4208 + assert("nikita-914", node->node != NULL);
4209 + left->left = 1;
4210 + left->free = 0;
4211 + left = ERR_PTR(-E_REPEAT);
4212 + } else {
4213 + /* left neighbor is locked, level cannot be restarted. Just
4214 + ignore left neighbor. */
4215 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4216 + left = NULL;
4217 + }
4218 + return left;
4219 +}
4220 +
4221 +/* find right neighbor of a carry node
4222 +
4223 + Look for right neighbor of @node and add it to the @doing queue. See
4224 + comments in the body.
4225 +
4226 +*/
4227 +static carry_node *find_right_neighbor(carry_op * op /* node to find right
4228 + * neighbor of */ ,
4229 + carry_level * doing /* level to scan */ )
4230 +{
4231 + int result;
4232 + carry_node *node;
4233 + carry_node *right;
4234 + lock_handle lh;
4235 + int flags;
4236 + reiser4_tree *tree;
4237 +
4238 + init_lh(&lh);
4239 +
4240 + node = op->node;
4241 +
4242 + tree = current_tree;
4243 + read_lock_tree(tree);
4244 + /* first, check whether right neighbor is already in a @doing queue */
4245 + if (reiser4_carry_real(node)->right != NULL) {
4246 + /*
4247 + * Tree lock is taken here anyway, because, even if _outcome_
4248 + * of (find_carry_node() != NULL) doesn't depends on
4249 + * concurrent updates to ->right, find_carry_node() cannot
4250 + * work with second argument NULL. Hence, following comment is
4251 + * of historic importance only.
4252 + *
4253 + * Subtle:
4254 + *
4255 + * Q: why don't we need tree lock here, looking for the right
4256 + * neighbor?
4257 + *
4258 + * A: even if value of node->real_node->right were changed
4259 + * during find_carry_node() execution, outcome of execution
4260 + * wouldn't change, because (in short) other thread cannot add
4261 + * elements to the @doing, and if node->real_node->right
4262 + * already was in @doing, value of node->real_node->right
4263 + * couldn't change, because node cannot be inserted between
4264 + * locked neighbors.
4265 + */
4266 + if (find_carry_node(doing,
4267 + reiser4_carry_real(node)->right) != NULL) {
4268 + read_unlock_tree(tree);
4269 + /*
4270 + * What we are doing here (this is also applicable to
4271 + * the find_left_neighbor()).
4272 + *
4273 + * tree_walk.c code requires that insertion of a
4274 + * pointer to a child, modification of parent pointer
4275 + * in the child, and insertion of the child into
4276 + * sibling list are atomic (see
4277 + * plugin/item/internal.c:create_hook_internal()).
4278 + *
4279 + * carry allocates new node long before pointer to it
4280 + * is inserted into parent and, actually, long before
4281 + * parent is even known. Such allocated-but-orphaned
4282 + * nodes are only trackable through carry level lists.
4283 + *
4284 + * Situation that is handled here is following: @node
4285 + * has valid ->right pointer, but there is
4286 + * allocated-but-orphaned node in the carry queue that
4287 + * is logically between @node and @node->right. Here
4288 + * we are searching for it. Critical point is that
4289 + * this is only possible if @node->right is also in
4290 + * the carry queue (this is checked above), because
4291 + * this is the only way new orphaned node could be
4292 + * inserted between them (before inserting new node,
4293 + * make_space() first tries to shift to the right, so,
4294 + * right neighbor will be locked and queued).
4295 + *
4296 + */
4297 + right = node;
4298 + do {
4299 + right = list_entry(right->header.level_linkage.next,
4300 + carry_node, header.level_linkage);
4301 + assert("nikita-3408", !carry_node_end(doing,
4302 + right));
4303 + } while (reiser4_carry_real(right) ==
4304 + reiser4_carry_real(node));
4305 + return right;
4306 + }
4307 + }
4308 + read_unlock_tree(tree);
4309 +
4310 + flags = GN_CAN_USE_UPPER_LEVELS;
4311 + if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4312 + flags = GN_NO_ALLOC;
4313 +
4314 + /* then, try to lock right neighbor */
4315 + init_lh(&lh);
4316 + result = reiser4_get_right_neighbor(&lh,
4317 + reiser4_carry_real(node),
4318 + ZNODE_WRITE_LOCK, flags);
4319 + if (result == 0) {
4320 + /* ok, node found and locked. */
4321 + right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4322 + if (!IS_ERR(right)) {
4323 + right->node = lh.node;
4324 + move_lh(&right->lock_handle, &lh);
4325 + right->free = 1;
4326 + result = lock_carry_node_tail(right);
4327 + if (result != 0)
4328 + right = ERR_PTR(result);
4329 + }
4330 + } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4331 + /* node is rightmost node in a tree, or neighbor wasn't in
4332 + cache, or there is an extent on the right. */
4333 + right = NULL;
4334 + } else
4335 + right = ERR_PTR(result);
4336 + done_lh(&lh);
4337 + return right;
4338 +}
4339 +
4340 +/* how much free space in a @node is needed for @op
4341 +
4342 + How much space in @node is required for completion of @op, where @op is
4343 + insert or paste operation.
4344 +*/
4345 +static unsigned int space_needed_for_op(znode * node /* znode data are
4346 + * inserted or
4347 + * pasted in */ ,
4348 + carry_op * op /* carry
4349 + operation */ )
4350 +{
4351 + assert("nikita-919", op != NULL);
4352 +
4353 + switch (op->op) {
4354 + default:
4355 + impossible("nikita-1701", "Wrong opcode");
4356 + case COP_INSERT:
4357 + return space_needed(node, NULL, op->u.insert.d->data, 1);
4358 + case COP_PASTE:
4359 + return space_needed(node, op->u.insert.d->coord,
4360 + op->u.insert.d->data, 0);
4361 + }
4362 +}
4363 +
4364 +/* how much space in @node is required to insert or paste @data at
4365 + @coord. */
4366 +unsigned int space_needed(const znode * node /* node data are inserted or
4367 + * pasted in */ ,
4368 + const coord_t * coord /* coord where data are
4369 + * inserted or pasted
4370 + * at */ ,
4371 + const reiser4_item_data * data /* data to insert or
4372 + * paste */ ,
4373 + int insertion /* non-0 is inserting, 0---paste */ )
4374 +{
4375 + int result;
4376 + item_plugin *iplug;
4377 +
4378 + assert("nikita-917", node != NULL);
4379 + assert("nikita-918", node_plugin_by_node(node) != NULL);
4380 + assert("vs-230", !insertion || (coord == NULL));
4381 +
4382 + result = 0;
4383 + iplug = data->iplug;
4384 + if (iplug->b.estimate != NULL) {
4385 + /* ask item plugin how much space is needed to insert this
4386 + item */
4387 + result += iplug->b.estimate(insertion ? NULL : coord, data);
4388 + } else {
4389 + /* reasonable default */
4390 + result += data->length;
4391 + }
4392 + if (insertion) {
4393 + node_plugin *nplug;
4394 +
4395 + nplug = node->nplug;
4396 + /* and add node overhead */
4397 + if (nplug->item_overhead != NULL) {
4398 + result += nplug->item_overhead(node, NULL);
4399 + }
4400 + }
4401 + return result;
4402 +}
4403 +
4404 +/* find &coord in parent where pointer to new child is to be stored. */
4405 +static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4406 + * insert pointer to new
4407 + * child */ )
4408 +{
4409 + int result;
4410 + znode *node;
4411 + znode *child;
4412 +
4413 + assert("nikita-941", op != NULL);
4414 + assert("nikita-942", op->op == COP_INSERT);
4415 +
4416 + node = reiser4_carry_real(op->node);
4417 + assert("nikita-943", node != NULL);
4418 + assert("nikita-944", node_plugin_by_node(node) != NULL);
4419 +
4420 + child = reiser4_carry_real(op->u.insert.child);
4421 + result =
4422 + find_new_child_ptr(node, child, op->u.insert.brother,
4423 + op->u.insert.d->coord);
4424 +
4425 + build_child_ptr_data(child, op->u.insert.d->data);
4426 + return result;
4427 +}
4428 +
4429 +/* additional amount of free space in @node required to complete @op */
4430 +static int free_space_shortage(znode * node /* node to check */ ,
4431 + carry_op * op /* operation being performed */ )
4432 +{
4433 + assert("nikita-1061", node != NULL);
4434 + assert("nikita-1062", op != NULL);
4435 +
4436 + switch (op->op) {
4437 + default:
4438 + impossible("nikita-1702", "Wrong opcode");
4439 + case COP_INSERT:
4440 + case COP_PASTE:
4441 + return space_needed_for_op(node, op) - znode_free_space(node);
4442 + case COP_EXTENT:
4443 + /* when inserting extent shift data around until insertion
4444 + point is utmost in the node. */
4445 + if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4446 + return +1;
4447 + else
4448 + return -1;
4449 + }
4450 +}
4451 +
4452 +/* helper function: update node pointer in operation after insertion
4453 + point was probably shifted into @target. */
4454 +static znode *sync_op(carry_op * op, carry_node * target)
4455 +{
4456 + znode *insertion_node;
4457 +
4458 + /* reget node from coord: shift might move insertion coord to
4459 + the neighbor */
4460 + insertion_node = op->u.insert.d->coord->node;
4461 + /* if insertion point was actually moved into new node,
4462 + update carry node pointer in operation. */
4463 + if (insertion_node != reiser4_carry_real(op->node)) {
4464 + op->node = target;
4465 + assert("nikita-2540",
4466 + reiser4_carry_real(target) == insertion_node);
4467 + }
4468 + assert("nikita-2541",
4469 + reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4470 + return insertion_node;
4471 +}
4472 +
4473 +/*
4474 + * complete make_space() call: update tracked lock handle if necessary. See
4475 + * comments for fs/reiser4/carry.h:carry_track_type
4476 + */
4477 +static int
4478 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4479 +{
4480 + int result;
4481 + carry_track_type tracking;
4482 + znode *node;
4483 +
4484 + tracking = doing->track_type;
4485 + node = op->u.insert.d->coord->node;
4486 +
4487 + if (tracking == CARRY_TRACK_NODE ||
4488 + (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4489 + /* inserting or pasting into node different from
4490 + original. Update lock handle supplied by caller. */
4491 + assert("nikita-1417", doing->tracked != NULL);
4492 + done_lh(doing->tracked);
4493 + init_lh(doing->tracked);
4494 + result = longterm_lock_znode(doing->tracked, node,
4495 + ZNODE_WRITE_LOCK,
4496 + ZNODE_LOCK_HIPRI);
4497 + } else
4498 + result = 0;
4499 + return result;
4500 +}
4501 +
4502 +/* This is insertion policy function. It shifts data to the left and right
4503 + neighbors of insertion coord and allocates new nodes until there is enough
4504 + free space to complete @op.
4505 +
4506 + See comments in the body.
4507 +
4508 + Assumes that the node format favors insertions at the right end of the node
4509 + as node40 does.
4510 +
4511 + See carry_flow() on detail about flow insertion
4512 +*/
4513 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4514 + carry_level * doing /* current carry queue */ ,
4515 + carry_level * todo /* carry queue on the parent level */ )
4516 +{
4517 + znode *node;
4518 + int result;
4519 + int not_enough_space;
4520 + int blk_alloc;
4521 + znode *orig_node;
4522 + __u32 flags;
4523 +
4524 + coord_t *coord;
4525 +
4526 + assert("nikita-890", op != NULL);
4527 + assert("nikita-891", todo != NULL);
4528 + assert("nikita-892",
4529 + op->op == COP_INSERT ||
4530 + op->op == COP_PASTE || op->op == COP_EXTENT);
4531 + assert("nikita-1607",
4532 + reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4533 +
4534 + flags = op->u.insert.flags;
4535 +
4536 + /* NOTE check that new node can only be allocated after checking left
4537 + * and right neighbors. This is necessary for proper work of
4538 + * find_{left,right}_neighbor(). */
4539 + assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4540 + flags & COPI_DONT_SHIFT_LEFT));
4541 + assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4542 + flags & COPI_DONT_SHIFT_RIGHT));
4543 +
4544 + coord = op->u.insert.d->coord;
4545 + orig_node = node = coord->node;
4546 +
4547 + assert("nikita-908", node != NULL);
4548 + assert("nikita-909", node_plugin_by_node(node) != NULL);
4549 +
4550 + result = 0;
4551 + /* If there is not enough space in a node, try to shift something to
4552 + the left neighbor. This is a bit tricky, as locking to the left is
4553 + low priority. This is handled by restart logic in carry().
4554 + */
4555 + not_enough_space = free_space_shortage(node, op);
4556 + if (not_enough_space <= 0)
4557 + /* it is possible that carry was called when there actually
4558 + was enough space in the node. For example, when inserting
4559 + leftmost item so that delimiting keys have to be updated.
4560 + */
4561 + return make_space_tail(op, doing, orig_node);
4562 + if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4563 + carry_node *left;
4564 + /* make note in statistics of an attempt to move
4565 + something into the left neighbor */
4566 + left = find_left_neighbor(op, doing);
4567 + if (unlikely(IS_ERR(left))) {
4568 + if (PTR_ERR(left) == -E_REPEAT)
4569 + return -E_REPEAT;
4570 + else {
4571 + /* some error other than restart request
4572 + occurred. This shouldn't happen. Issue a
4573 + warning and continue as if left neighbor
4574 + weren't existing.
4575 + */
4576 + warning("nikita-924",
4577 + "Error accessing left neighbor: %li",
4578 + PTR_ERR(left));
4579 + }
4580 + } else if (left != NULL) {
4581 +
4582 + /* shift everything possible on the left of and
4583 + including insertion coord into the left neighbor */
4584 + result = carry_shift_data(LEFT_SIDE, coord,
4585 + reiser4_carry_real(left),
4586 + doing, todo,
4587 + flags & COPI_GO_LEFT);
4588 +
4589 + /* reget node from coord: shift_left() might move
4590 + insertion coord to the left neighbor */
4591 + node = sync_op(op, left);
4592 +
4593 + not_enough_space = free_space_shortage(node, op);
4594 + /* There is not enough free space in @node, but
4595 + may be, there is enough free space in
4596 + @left. Various balancing decisions are valid here.
4597 + The same for the shifiting to the right.
4598 + */
4599 + }
4600 + }
4601 + /* If there still is not enough space, shift to the right */
4602 + if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4603 + carry_node *right;
4604 +
4605 + right = find_right_neighbor(op, doing);
4606 + if (IS_ERR(right)) {
4607 + warning("nikita-1065",
4608 + "Error accessing right neighbor: %li",
4609 + PTR_ERR(right));
4610 + } else if (right != NULL) {
4611 + /* node containing insertion point, and its right
4612 + neighbor node are write locked by now.
4613 +
4614 + shift everything possible on the right of but
4615 + excluding insertion coord into the right neighbor
4616 + */
4617 + result = carry_shift_data(RIGHT_SIDE, coord,
4618 + reiser4_carry_real(right),
4619 + doing, todo,
4620 + flags & COPI_GO_RIGHT);
4621 + /* reget node from coord: shift_right() might move
4622 + insertion coord to the right neighbor */
4623 + node = sync_op(op, right);
4624 + not_enough_space = free_space_shortage(node, op);
4625 + }
4626 + }
4627 + /* If there is still not enough space, allocate new node(s).
4628 +
4629 + We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4630 + the carry operation flags (currently this is needed during flush
4631 + only).
4632 + */
4633 + for (blk_alloc = 0;
4634 + not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4635 + !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4636 + carry_node *fresh; /* new node we are allocating */
4637 + coord_t coord_shadow; /* remembered insertion point before
4638 + * shifting data into new node */
4639 + carry_node *node_shadow; /* remembered insertion node before
4640 + * shifting */
4641 + unsigned int gointo; /* whether insertion point should move
4642 + * into newly allocated node */
4643 +
4644 + /* allocate new node on the right of @node. Znode and disk
4645 + fake block number for new node are allocated.
4646 +
4647 + add_new_znode() posts carry operation COP_INSERT with
4648 + COPT_CHILD option to the parent level to add
4649 + pointer to newly created node to its parent.
4650 +
4651 + Subtle point: if several new nodes are required to complete
4652 + insertion operation at this level, they will be inserted
4653 + into their parents in the order of creation, which means
4654 + that @node will be valid "cookie" at the time of insertion.
4655 +
4656 + */
4657 + fresh = add_new_znode(node, op->node, doing, todo);
4658 + if (IS_ERR(fresh))
4659 + return PTR_ERR(fresh);
4660 +
4661 + /* Try to shift into new node. */
4662 + result = lock_carry_node(doing, fresh);
4663 + zput(reiser4_carry_real(fresh));
4664 + if (result != 0) {
4665 + warning("nikita-947",
4666 + "Cannot lock new node: %i", result);
4667 + return result;
4668 + }
4669 +
4670 + /* both nodes are write locked by now.
4671 +
4672 + shift everything possible on the right of and
4673 + including insertion coord into the right neighbor.
4674 + */
4675 + coord_dup(&coord_shadow, op->u.insert.d->coord);
4676 + node_shadow = op->node;
4677 + /* move insertion point into newly created node if:
4678 +
4679 + . insertion point is rightmost in the source node, or
4680 + . this is not the first node we are allocating in a row.
4681 + */
4682 + gointo =
4683 + (blk_alloc > 0) ||
4684 + coord_is_after_rightmost(op->u.insert.d->coord);
4685 +
4686 + if (gointo &&
4687 + op->op == COP_PASTE &&
4688 + coord_is_existing_item(op->u.insert.d->coord) &&
4689 + is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
4690 + /* paste into solid (atomic) item, which can contain
4691 + only one unit, so we need to shift it right, where
4692 + insertion point supposed to be */
4693 +
4694 + assert("edward-1444", op->u.insert.d->data->iplug ==
4695 + item_plugin_by_id(STATIC_STAT_DATA_ID));
4696 + assert("edward-1445",
4697 + op->u.insert.d->data->length >
4698 + node_plugin_by_node(coord->node)->free_space
4699 + (coord->node));
4700 +
4701 + op->u.insert.d->coord->between = BEFORE_UNIT;
4702 + }
4703 +
4704 + result = carry_shift_data(RIGHT_SIDE, coord,
4705 + reiser4_carry_real(fresh),
4706 + doing, todo, gointo);
4707 + /* if insertion point was actually moved into new node,
4708 + update carry node pointer in operation. */
4709 + node = sync_op(op, fresh);
4710 + not_enough_space = free_space_shortage(node, op);
4711 + if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4712 + /* there is not enough free in new node. Shift
4713 + insertion point back to the @shadow_node so that
4714 + next new node would be inserted between
4715 + @shadow_node and @fresh.
4716 + */
4717 + coord_normalize(&coord_shadow);
4718 + coord_dup(coord, &coord_shadow);
4719 + node = coord->node;
4720 + op->node = node_shadow;
4721 + if (1 || (flags & COPI_STEP_BACK)) {
4722 + /* still not enough space?! Maybe there is
4723 + enough space in the source node (i.e., node
4724 + data are moved from) now.
4725 + */
4726 + not_enough_space =
4727 + free_space_shortage(node, op);
4728 + }
4729 + }
4730 + }
4731 + if (not_enough_space > 0) {
4732 + if (!(flags & COPI_DONT_ALLOCATE))
4733 + warning("nikita-948", "Cannot insert new item");
4734 + result = -E_NODE_FULL;
4735 + }
4736 + assert("nikita-1622", ergo(result == 0,
4737 + reiser4_carry_real(op->node) == coord->node));
4738 + assert("nikita-2616", coord == op->u.insert.d->coord);
4739 + if (result == 0)
4740 + result = make_space_tail(op, doing, orig_node);
4741 + return result;
4742 +}
4743 +
4744 +/* insert_paste_common() - common part of insert and paste operations
4745 +
4746 + This function performs common part of COP_INSERT and COP_PASTE.
4747 +
4748 + There are two ways in which insertion/paste can be requested:
4749 +
4750 + . by directly supplying reiser4_item_data. In this case, op ->
4751 + u.insert.type is set to COPT_ITEM_DATA.
4752 +
4753 + . by supplying child pointer to which is to inserted into parent. In this
4754 + case op -> u.insert.type == COPT_CHILD.
4755 +
4756 + . by supplying key of new item/unit. This is currently only used during
4757 + extent insertion
4758 +
4759 + This is required, because when new node is allocated we don't know at what
4760 + position pointer to it is to be stored in the parent. Actually, we don't
4761 + even know what its parent will be, because parent can be re-balanced
4762 + concurrently and new node re-parented, and because parent can be full and
4763 + pointer to the new node will go into some other node.
4764 +
4765 + insert_paste_common() resolves pointer to child node into position in the
4766 + parent by calling find_new_child_coord(), that fills
4767 + reiser4_item_data. After this, insertion/paste proceeds uniformly.
4768 +
4769 + Another complication is with finding free space during pasting. It may
4770 + happen that while shifting items to the neighbors and newly allocated
4771 + nodes, insertion coord can no longer be in the item we wanted to paste
4772 + into. At this point, paste becomes (morphs) into insert. Moreover free
4773 + space analysis has to be repeated, because amount of space required for
4774 + insertion is different from that of paste (item header overhead, etc).
4775 +
4776 + This function "unifies" different insertion modes (by resolving child
4777 + pointer or key into insertion coord), and then calls make_space() to free
4778 + enough space in the node by shifting data to the left and right and by
4779 + allocating new nodes if necessary. Carry operation knows amount of space
4780 + required for its completion. After enough free space is obtained, caller of
4781 + this function (carry_{insert,paste,etc.}) performs actual insertion/paste
4782 + by calling item plugin method.
4783 +
4784 +*/
4785 +static int insert_paste_common(carry_op * op /* carry operation being
4786 + * performed */ ,
4787 + carry_level * doing /* current carry level */ ,
4788 + carry_level * todo /* next carry level */ ,
4789 + carry_insert_data * cdata /* pointer to
4790 + * cdata */ ,
4791 + coord_t * coord /* insertion/paste coord */ ,
4792 + reiser4_item_data * data /* data to be
4793 + * inserted/pasted */ )
4794 +{
4795 + assert("nikita-981", op != NULL);
4796 + assert("nikita-980", todo != NULL);
4797 + assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
4798 + || (op->op == COP_EXTENT));
4799 +
4800 + if (op->u.insert.type == COPT_PASTE_RESTARTED) {
4801 + /* nothing to do. Fall through to make_space(). */
4802 + ;
4803 + } else if (op->u.insert.type == COPT_KEY) {
4804 + node_search_result intra_node;
4805 + znode *node;
4806 + /* Problem with doing batching at the lowest level, is that
4807 + operations here are given by coords where modification is
4808 + to be performed, and one modification can invalidate coords
4809 + of all following operations.
4810 +
4811 + So, we are implementing yet another type for operation that
4812 + will use (the only) "locator" stable across shifting of
4813 + data between nodes, etc.: key (COPT_KEY).
4814 +
4815 + This clause resolves key to the coord in the node.
4816 +
4817 + But node can change also. Probably some pieces have to be
4818 + added to the lock_carry_node(), to lock node by its key.
4819 +
4820 + */
4821 + /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
4822 + if you need something else. */
4823 + op->u.insert.d->coord = coord;
4824 + node = reiser4_carry_real(op->node);
4825 + intra_node = node_plugin_by_node(node)->lookup
4826 + (node, op->u.insert.d->key, FIND_EXACT,
4827 + op->u.insert.d->coord);
4828 + if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
4829 + warning("nikita-1715", "Intra node lookup failure: %i",
4830 + intra_node);
4831 + return intra_node;
4832 + }
4833 + } else if (op->u.insert.type == COPT_CHILD) {
4834 + /* if we are asked to insert pointer to the child into
4835 + internal node, first convert pointer to the child into
4836 + coord within parent node.
4837 + */
4838 + znode *child;
4839 + int result;
4840 +
4841 + op->u.insert.d = cdata;
4842 + op->u.insert.d->coord = coord;
4843 + op->u.insert.d->data = data;
4844 + op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4845 + result = find_new_child_coord(op);
4846 + child = reiser4_carry_real(op->u.insert.child);
4847 + if (result != NS_NOT_FOUND) {
4848 + warning("nikita-993",
4849 + "Cannot find a place for child pointer: %i",
4850 + result);
4851 + return result;
4852 + }
4853 + /* This only happens when we did multiple insertions at
4854 + the previous level, trying to insert single item and
4855 + it so happened, that insertion of pointers to all new
4856 + nodes before this one already caused parent node to
4857 + split (may be several times).
4858 +
4859 + I am going to come up with better solution.
4860 +
4861 + You are not expected to understand this.
4862 + -- v6root/usr/sys/ken/slp.c
4863 +
4864 + Basically, what happens here is the following: carry came
4865 + to the parent level and is about to insert internal item
4866 + pointing to the child node that it just inserted in the
4867 + level below. Position where internal item is to be inserted
4868 + was found by find_new_child_coord() above, but node of the
4869 + current carry operation (that is, parent node of child
4870 + inserted on the previous level), was determined earlier in
4871 + the lock_carry_level/lock_carry_node. It could so happen
4872 + that other carry operations already performed on the parent
4873 + level already split parent node, so that insertion point
4874 + moved into another node. Handle this by creating new carry
4875 + node for insertion point if necessary.
4876 + */
4877 + if (reiser4_carry_real(op->node) !=
4878 + op->u.insert.d->coord->node) {
4879 + pool_ordering direction;
4880 + znode *z1;
4881 + znode *z2;
4882 + reiser4_key k1;
4883 + reiser4_key k2;
4884 +
4885 + /*
4886 + * determine in what direction insertion point
4887 + * moved. Do this by comparing delimiting keys.
4888 + */
4889 + z1 = op->u.insert.d->coord->node;
4890 + z2 = reiser4_carry_real(op->node);
4891 + if (keyle(leftmost_key_in_node(z1, &k1),
4892 + leftmost_key_in_node(z2, &k2)))
4893 + /* insertion point moved to the left */
4894 + direction = POOLO_BEFORE;
4895 + else
4896 + /* insertion point moved to the right */
4897 + direction = POOLO_AFTER;
4898 +
4899 + op->node = reiser4_add_carry_skip(doing,
4900 + direction, op->node);
4901 + if (IS_ERR(op->node))
4902 + return PTR_ERR(op->node);
4903 + op->node->node = op->u.insert.d->coord->node;
4904 + op->node->free = 1;
4905 + result = lock_carry_node(doing, op->node);
4906 + if (result != 0)
4907 + return result;
4908 + }
4909 +
4910 + /*
4911 + * set up key of an item being inserted: we are inserting
4912 + * internal item and its key is (by the very definition of
4913 + * search tree) is leftmost key in the child node.
4914 + */
4915 + write_lock_dk(znode_get_tree(child));
4916 + op->u.insert.d->key = leftmost_key_in_node(child,
4917 + znode_get_ld_key(child));
4918 + write_unlock_dk(znode_get_tree(child));
4919 + op->u.insert.d->data->arg = op->u.insert.brother;
4920 + } else {
4921 + assert("vs-243", op->u.insert.d->coord != NULL);
4922 + op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4923 + }
4924 +
4925 + /* find free space. */
4926 + return make_space(op, doing, todo);
4927 +}
4928 +
4929 +/* handle carry COP_INSERT operation.
4930 +
4931 + Insert new item into node. New item can be given in one of two ways:
4932 +
4933 + - by passing &tree_coord and &reiser4_item_data as part of @op. This is
4934 + only applicable at the leaf/twig level.
4935 +
4936 + - by passing a child node pointer to which is to be inserted by this
4937 + operation.
4938 +
4939 +*/
4940 +static int carry_insert(carry_op * op /* operation to perform */ ,
4941 + carry_level * doing /* queue of operations @op
4942 + * is part of */ ,
4943 + carry_level * todo /* queue where new operations
4944 + * are accumulated */ )
4945 +{
4946 + znode *node;
4947 + carry_insert_data cdata;
4948 + coord_t coord;
4949 + reiser4_item_data data;
4950 + carry_plugin_info info;
4951 + int result;
4952 +
4953 + assert("nikita-1036", op != NULL);
4954 + assert("nikita-1037", todo != NULL);
4955 + assert("nikita-1038", op->op == COP_INSERT);
4956 +
4957 + coord_init_zero(&coord);
4958 +
4959 + /* perform common functionality of insert and paste. */
4960 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
4961 + if (result != 0)
4962 + return result;
4963 +
4964 + node = op->u.insert.d->coord->node;
4965 + assert("nikita-1039", node != NULL);
4966 + assert("nikita-1040", node_plugin_by_node(node) != NULL);
4967 +
4968 + assert("nikita-949",
4969 + space_needed_for_op(node, op) <= znode_free_space(node));
4970 +
4971 + /* ask node layout to create new item. */
4972 + info.doing = doing;
4973 + info.todo = todo;
4974 + result = node_plugin_by_node(node)->create_item
4975 + (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
4976 + &info);
4977 + doing->restartable = 0;
4978 + znode_make_dirty(node);
4979 +
4980 + return result;
4981 +}
4982 +
4983 +/*
4984 + * Flow insertion code. COP_INSERT_FLOW is special tree operation that is
4985 + * supplied with a "flow" (that is, a stream of data) and inserts it into tree
4986 + * by slicing into multiple items.
4987 + */
4988 +
4989 +#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
4990 +#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
4991 +#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
4992 +
4993 +static size_t item_data_overhead(carry_op * op)
4994 +{
4995 + if (flow_insert_data(op)->iplug->b.estimate == NULL)
4996 + return 0;
4997 + return (flow_insert_data(op)->iplug->b.
4998 + estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
4999 + flow_insert_data(op)->length);
5000 +}
5001 +
5002 +/* FIXME-VS: this is called several times during one make_flow_for_insertion
5003 + and it will always return the same result. Some optimization could be made
5004 + by calculating this value once at the beginning and passing it around. That
5005 + would reduce some flexibility in future changes
5006 +*/
5007 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5008 +static size_t flow_insertion_overhead(carry_op * op)
5009 +{
5010 + znode *node;
5011 + size_t insertion_overhead;
5012 +
5013 + node = flow_insert_point(op)->node;
5014 + insertion_overhead = 0;
5015 + if (node->nplug->item_overhead &&
5016 + !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5017 + flow_insert_data(op)))
5018 + insertion_overhead =
5019 + node->nplug->item_overhead(node, NULL) +
5020 + item_data_overhead(op);
5021 + return insertion_overhead;
5022 +}
5023 +
5024 +/* how many bytes of flow does fit to the node */
5025 +static int what_can_fit_into_node(carry_op * op)
5026 +{
5027 + size_t free, overhead;
5028 +
5029 + overhead = flow_insertion_overhead(op);
5030 + free = znode_free_space(flow_insert_point(op)->node);
5031 + if (free <= overhead)
5032 + return 0;
5033 + free -= overhead;
5034 + /* FIXME: flow->length is loff_t only to not get overflowed in case of expandign truncate */
5035 + if (free < op->u.insert_flow.flow->length)
5036 + return free;
5037 + return (int)op->u.insert_flow.flow->length;
5038 +}
5039 +
5040 +/* in make_space_for_flow_insertion we need to check either whether whole flow
5041 + fits into a node or whether minimal fraction of flow fits into a node */
5042 +static int enough_space_for_whole_flow(carry_op * op)
5043 +{
5044 + return (unsigned)what_can_fit_into_node(op) ==
5045 + op->u.insert_flow.flow->length;
5046 +}
5047 +
5048 +#define MIN_FLOW_FRACTION 1
5049 +static int enough_space_for_min_flow_fraction(carry_op * op)
5050 +{
5051 + assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5052 +
5053 + return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5054 +}
5055 +
5056 +/* this returns 0 if left neighbor was obtained successfully and everything
5057 + upto insertion point including it were shifted and left neighbor still has
5058 + some free space to put minimal fraction of flow into it */
5059 +static int
5060 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5061 +{
5062 + carry_node *left;
5063 + znode *orig;
5064 +
5065 + left = find_left_neighbor(op, doing);
5066 + if (unlikely(IS_ERR(left))) {
5067 + warning("vs-899",
5068 + "make_space_by_shift_left: "
5069 + "error accessing left neighbor: %li", PTR_ERR(left));
5070 + return 1;
5071 + }
5072 + if (left == NULL)
5073 + /* left neighbor either does not exist or is unformatted
5074 + node */
5075 + return 1;
5076 +
5077 + orig = flow_insert_point(op)->node;
5078 + /* try to shift content of node @orig from its head upto insert point
5079 + including insertion point into the left neighbor */
5080 + carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5081 + reiser4_carry_real(left), doing, todo,
5082 + 1 /* including insert point */);
5083 + if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5084 + /* insertion point did not move */
5085 + return 1;
5086 + }
5087 +
5088 + /* insertion point is set after last item in the node */
5089 + assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5090 +
5091 + if (!enough_space_for_min_flow_fraction(op)) {
5092 + /* insertion point node does not have enough free space to put
5093 + even minimal portion of flow into it, therefore, move
5094 + insertion point back to orig node (before first item) */
5095 + coord_init_before_first_item(flow_insert_point(op), orig);
5096 + return 1;
5097 + }
5098 +
5099 + /* part of flow is to be written to the end of node */
5100 + op->node = left;
5101 + return 0;
5102 +}
5103 +
5104 +/* this returns 0 if right neighbor was obtained successfully and everything to
5105 + the right of insertion point was shifted to it and node got enough free
5106 + space to put minimal fraction of flow into it */
5107 +static int
5108 +make_space_by_shift_right(carry_op * op, carry_level * doing,
5109 + carry_level * todo)
5110 +{
5111 + carry_node *right;
5112 +
5113 + right = find_right_neighbor(op, doing);
5114 + if (unlikely(IS_ERR(right))) {
5115 + warning("nikita-1065", "shift_right_excluding_insert_point: "
5116 + "error accessing right neighbor: %li", PTR_ERR(right));
5117 + return 1;
5118 + }
5119 + if (right) {
5120 + /* shift everything possible on the right of but excluding
5121 + insertion coord into the right neighbor */
5122 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5123 + reiser4_carry_real(right), doing, todo,
5124 + 0 /* not including insert point */);
5125 + } else {
5126 + /* right neighbor either does not exist or is unformatted
5127 + node */
5128 + ;
5129 + }
5130 + if (coord_is_after_rightmost(flow_insert_point(op))) {
5131 + if (enough_space_for_min_flow_fraction(op)) {
5132 + /* part of flow is to be written to the end of node */
5133 + return 0;
5134 + }
5135 + }
5136 +
5137 + /* new node is to be added if insert point node did not get enough
5138 + space for whole flow */
5139 + return 1;
5140 +}
5141 +
5142 +/* this returns 0 when insert coord is set at the node end and fraction of flow
5143 + fits into that node */
5144 +static int
5145 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5146 +{
5147 + int result;
5148 + znode *node;
5149 + carry_node *new;
5150 +
5151 + node = flow_insert_point(op)->node;
5152 +
5153 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5154 + return RETERR(-E_NODE_FULL);
5155 + /* add new node after insert point node */
5156 + new = add_new_znode(node, op->node, doing, todo);
5157 + if (unlikely(IS_ERR(new))) {
5158 + return PTR_ERR(new);
5159 + }
5160 + result = lock_carry_node(doing, new);
5161 + zput(reiser4_carry_real(new));
5162 + if (unlikely(result)) {
5163 + return result;
5164 + }
5165 + op->u.insert_flow.new_nodes++;
5166 + if (!coord_is_after_rightmost(flow_insert_point(op))) {
5167 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5168 + reiser4_carry_real(new), doing, todo,
5169 + 0 /* not including insert point */);
5170 + assert("vs-901",
5171 + coord_is_after_rightmost(flow_insert_point(op)));
5172 +
5173 + if (enough_space_for_min_flow_fraction(op)) {
5174 + return 0;
5175 + }
5176 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5177 + return RETERR(-E_NODE_FULL);
5178 +
5179 + /* add one more new node */
5180 + new = add_new_znode(node, op->node, doing, todo);
5181 + if (unlikely(IS_ERR(new))) {
5182 + return PTR_ERR(new);
5183 + }
5184 + result = lock_carry_node(doing, new);
5185 + zput(reiser4_carry_real(new));
5186 + if (unlikely(result)) {
5187 + return result;
5188 + }
5189 + op->u.insert_flow.new_nodes++;
5190 + }
5191 +
5192 + /* move insertion point to new node */
5193 + coord_init_before_first_item(flow_insert_point(op),
5194 + reiser4_carry_real(new));
5195 + op->node = new;
5196 + return 0;
5197 +}
5198 +
5199 +static int
5200 +make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5201 + carry_level * todo)
5202 +{
5203 + __u32 flags = op->u.insert_flow.flags;
5204 +
5205 + if (enough_space_for_whole_flow(op)) {
5206 + /* whole flow fits into insert point node */
5207 + return 0;
5208 + }
5209 +
5210 + if (!(flags & COPI_DONT_SHIFT_LEFT)
5211 + && (make_space_by_shift_left(op, doing, todo) == 0)) {
5212 + /* insert point is shifted to left neighbor of original insert
5213 + point node and is set after last unit in that node. It has
5214 + enough space to fit at least minimal fraction of flow. */
5215 + return 0;
5216 + }
5217 +
5218 + if (enough_space_for_whole_flow(op)) {
5219 + /* whole flow fits into insert point node */
5220 + return 0;
5221 + }
5222 +
5223 + if (!(flags & COPI_DONT_SHIFT_RIGHT)
5224 + && (make_space_by_shift_right(op, doing, todo) == 0)) {
5225 + /* insert point is still set to the same node, but there is
5226 + nothing to the right of insert point. */
5227 + return 0;
5228 + }
5229 +
5230 + if (enough_space_for_whole_flow(op)) {
5231 + /* whole flow fits into insert point node */
5232 + return 0;
5233 + }
5234 +
5235 + return make_space_by_new_nodes(op, doing, todo);
5236 +}
5237 +
5238 +/* implements COP_INSERT_FLOW operation */
5239 +static int
5240 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5241 +{
5242 + int result;
5243 + flow_t *f;
5244 + coord_t *insert_point;
5245 + node_plugin *nplug;
5246 + carry_plugin_info info;
5247 + znode *orig_node;
5248 + lock_handle *orig_lh;
5249 +
5250 + f = op->u.insert_flow.flow;
5251 + result = 0;
5252 +
5253 + /* carry system needs this to work */
5254 + info.doing = doing;
5255 + info.todo = todo;
5256 +
5257 + orig_node = flow_insert_point(op)->node;
5258 + orig_lh = doing->tracked;
5259 +
5260 + while (f->length) {
5261 + result = make_space_for_flow_insertion(op, doing, todo);
5262 + if (result)
5263 + break;
5264 +
5265 + insert_point = flow_insert_point(op);
5266 + nplug = node_plugin_by_node(insert_point->node);
5267 +
5268 + /* compose item data for insertion/pasting */
5269 + flow_insert_data(op)->data = f->data;
5270 + flow_insert_data(op)->length = what_can_fit_into_node(op);
5271 +
5272 + if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5273 + /* insert point is set to item of file we are writing to and we have to append to it */
5274 + assert("vs-903", insert_point->between == AFTER_UNIT);
5275 + nplug->change_item_size(insert_point,
5276 + flow_insert_data(op)->length);
5277 + flow_insert_data(op)->iplug->b.paste(insert_point,
5278 + flow_insert_data
5279 + (op), &info);
5280 + } else {
5281 + /* new item must be inserted */
5282 + pos_in_node_t new_pos;
5283 + flow_insert_data(op)->length += item_data_overhead(op);
5284 +
5285 + /* FIXME-VS: this is because node40_create_item changes
5286 + insert_point for obscure reasons */
5287 + switch (insert_point->between) {
5288 + case AFTER_ITEM:
5289 + new_pos = insert_point->item_pos + 1;
5290 + break;
5291 + case EMPTY_NODE:
5292 + new_pos = 0;
5293 + break;
5294 + case BEFORE_ITEM:
5295 + assert("vs-905", insert_point->item_pos == 0);
5296 + new_pos = 0;
5297 + break;
5298 + default:
5299 + impossible("vs-906",
5300 + "carry_insert_flow: invalid coord");
5301 + new_pos = 0;
5302 + break;
5303 + }
5304 +
5305 + nplug->create_item(insert_point, &f->key,
5306 + flow_insert_data(op), &info);
5307 + coord_set_item_pos(insert_point, new_pos);
5308 + }
5309 + coord_init_after_item_end(insert_point);
5310 + doing->restartable = 0;
5311 + znode_make_dirty(insert_point->node);
5312 +
5313 + move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5314 + }
5315 +
5316 + if (orig_node != flow_insert_point(op)->node) {
5317 + /* move lock to new insert point */
5318 + done_lh(orig_lh);
5319 + init_lh(orig_lh);
5320 + result =
5321 + longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5322 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5323 + }
5324 +
5325 + return result;
5326 +}
5327 +
5328 +/* implements COP_DELETE operation
5329 +
5330 + Remove pointer to @op -> u.delete.child from it's parent.
5331 +
5332 + This function also handles killing of a tree root is last pointer from it
5333 + was removed. This is complicated by our handling of "twig" level: root on
5334 + twig level is never killed.
5335 +
5336 +*/
5337 +static int carry_delete(carry_op * op /* operation to be performed */ ,
5338 + carry_level * doing UNUSED_ARG /* current carry
5339 + * level */ ,
5340 + carry_level * todo /* next carry level */ )
5341 +{
5342 + int result;
5343 + coord_t coord;
5344 + coord_t coord2;
5345 + znode *parent;
5346 + znode *child;
5347 + carry_plugin_info info;
5348 + reiser4_tree *tree;
5349 +
5350 + /*
5351 + * This operation is called to delete internal item pointing to the
5352 + * child node that was removed by carry from the tree on the previous
5353 + * tree level.
5354 + */
5355 +
5356 + assert("nikita-893", op != NULL);
5357 + assert("nikita-894", todo != NULL);
5358 + assert("nikita-895", op->op == COP_DELETE);
5359 +
5360 + coord_init_zero(&coord);
5361 + coord_init_zero(&coord2);
5362 +
5363 + parent = reiser4_carry_real(op->node);
5364 + child = op->u.delete.child ?
5365 + reiser4_carry_real(op->u.delete.child) : op->node->node;
5366 + tree = znode_get_tree(child);
5367 + read_lock_tree(tree);
5368 +
5369 + /*
5370 + * @parent was determined when carry entered parent level
5371 + * (lock_carry_level/lock_carry_node). Since then, actual parent of
5372 + * @child node could change due to other carry operations performed on
5373 + * the parent level. Check for this.
5374 + */
5375 +
5376 + if (znode_parent(child) != parent) {
5377 + /* NOTE-NIKITA add stat counter for this. */
5378 + parent = znode_parent(child);
5379 + assert("nikita-2581", find_carry_node(doing, parent));
5380 + }
5381 + read_unlock_tree(tree);
5382 +
5383 + assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5384 +
5385 + /* Twig level horrors: tree should be of height at least 2. So, last
5386 + pointer from the root at twig level is preserved even if child is
5387 + empty. This is ugly, but so it was architectured.
5388 + */
5389 +
5390 + if (znode_is_root(parent) &&
5391 + znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5392 + node_num_items(parent) == 1) {
5393 + /* Delimiting key manipulations. */
5394 + write_lock_dk(tree);
5395 + znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5396 + znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5397 + ZF_SET(child, JNODE_DKSET);
5398 + write_unlock_dk(tree);
5399 +
5400 + /* @child escaped imminent death! */
5401 + ZF_CLR(child, JNODE_HEARD_BANSHEE);
5402 + return 0;
5403 + }
5404 +
5405 + /* convert child pointer to the coord_t */
5406 + result = find_child_ptr(parent, child, &coord);
5407 + if (result != NS_FOUND) {
5408 + warning("nikita-994", "Cannot find child pointer: %i", result);
5409 + print_coord_content("coord", &coord);
5410 + return result;
5411 + }
5412 +
5413 + coord_dup(&coord2, &coord);
5414 + info.doing = doing;
5415 + info.todo = todo;
5416 + {
5417 + /*
5418 + * Actually kill internal item: prepare structure with
5419 + * arguments for ->cut_and_kill() method...
5420 + */
5421 +
5422 + struct carry_kill_data kdata;
5423 + kdata.params.from = &coord;
5424 + kdata.params.to = &coord2;
5425 + kdata.params.from_key = NULL;
5426 + kdata.params.to_key = NULL;
5427 + kdata.params.smallest_removed = NULL;
5428 + kdata.params.truncate = 1;
5429 + kdata.flags = op->u.delete.flags;
5430 + kdata.inode = NULL;
5431 + kdata.left = NULL;
5432 + kdata.right = NULL;
5433 + kdata.buf = NULL;
5434 + /* ... and call it. */
5435 + result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5436 + &info);
5437 + }
5438 + doing->restartable = 0;
5439 +
5440 + /* check whether root should be killed violently */
5441 + if (znode_is_root(parent) &&
5442 + /* don't kill roots at and lower than twig level */
5443 + znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5444 + node_num_items(parent) == 1) {
5445 + result = reiser4_kill_tree_root(coord.node);
5446 + }
5447 +
5448 + return result < 0 ? : 0;
5449 +}
5450 +
5451 +/* implements COP_CUT operation
5452 +
5453 + Cuts part or whole content of node.
5454 +
5455 +*/
5456 +static int carry_cut(carry_op * op /* operation to be performed */ ,
5457 + carry_level * doing /* current carry level */ ,
5458 + carry_level * todo /* next carry level */ )
5459 +{
5460 + int result;
5461 + carry_plugin_info info;
5462 + node_plugin *nplug;
5463 +
5464 + assert("nikita-896", op != NULL);
5465 + assert("nikita-897", todo != NULL);
5466 + assert("nikita-898", op->op == COP_CUT);
5467 +
5468 + info.doing = doing;
5469 + info.todo = todo;
5470 +
5471 + nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5472 + if (op->u.cut_or_kill.is_cut)
5473 + result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5474 + else
5475 + result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5476 +
5477 + doing->restartable = 0;
5478 + return result < 0 ? : 0;
5479 +}
5480 +
5481 +/* helper function for carry_paste(): returns true if @op can be continued as
5482 + paste */
5483 +static int
5484 +can_paste(coord_t * icoord, const reiser4_key * key,
5485 + const reiser4_item_data * data)
5486 +{
5487 + coord_t circa;
5488 + item_plugin *new_iplug;
5489 + item_plugin *old_iplug;
5490 + int result = 0; /* to keep gcc shut */
5491 +
5492 + assert("", icoord->between != AT_UNIT);
5493 +
5494 + /* obviously, one cannot paste when node is empty---there is nothing
5495 + to paste into. */
5496 + if (node_is_empty(icoord->node))
5497 + return 0;
5498 + /* if insertion point is at the middle of the item, then paste */
5499 + if (!coord_is_between_items(icoord))
5500 + return 1;
5501 + coord_dup(&circa, icoord);
5502 + circa.between = AT_UNIT;
5503 +
5504 + old_iplug = item_plugin_by_coord(&circa);
5505 + new_iplug = data->iplug;
5506 +
5507 + /* check whether we can paste to the item @icoord is "at" when we
5508 + ignore ->between field */
5509 + if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5510 + result = 1;
5511 + } else if (icoord->between == BEFORE_UNIT
5512 + || icoord->between == BEFORE_ITEM) {
5513 + /* otherwise, try to glue to the item at the left, if any */
5514 + coord_dup(&circa, icoord);
5515 + if (coord_set_to_left(&circa)) {
5516 + result = 0;
5517 + coord_init_before_item(icoord);
5518 + } else {
5519 + old_iplug = item_plugin_by_coord(&circa);
5520 + result = (old_iplug == new_iplug)
5521 + && item_can_contain_key(icoord, key, data);
5522 + if (result) {
5523 + coord_dup(icoord, &circa);
5524 + icoord->between = AFTER_UNIT;
5525 + }
5526 + }
5527 + } else if (icoord->between == AFTER_UNIT
5528 + || icoord->between == AFTER_ITEM) {
5529 + coord_dup(&circa, icoord);
5530 + /* otherwise, try to glue to the item at the right, if any */
5531 + if (coord_set_to_right(&circa)) {
5532 + result = 0;
5533 + coord_init_after_item(icoord);
5534 + } else {
5535 + int (*cck) (const coord_t *, const reiser4_key *,
5536 + const reiser4_item_data *);
5537 +
5538 + old_iplug = item_plugin_by_coord(&circa);
5539 +
5540 + cck = old_iplug->b.can_contain_key;
5541 + if (cck == NULL)
5542 + /* item doesn't define ->can_contain_key
5543 + method? So it is not expandable. */
5544 + result = 0;
5545 + else {
5546 + result = (old_iplug == new_iplug)
5547 + && cck(&circa /*icoord */ , key, data);
5548 + if (result) {
5549 + coord_dup(icoord, &circa);
5550 + icoord->between = BEFORE_UNIT;
5551 + }
5552 + }
5553 + }
5554 + } else
5555 + impossible("nikita-2513", "Nothing works");
5556 + if (result) {
5557 + if (icoord->between == BEFORE_ITEM) {
5558 + assert("vs-912", icoord->unit_pos == 0);
5559 + icoord->between = BEFORE_UNIT;
5560 + } else if (icoord->between == AFTER_ITEM) {
5561 + coord_init_after_item_end(icoord);
5562 + }
5563 + }
5564 + return result;
5565 +}
5566 +
5567 +/* implements COP_PASTE operation
5568 +
5569 + Paste data into existing item. This is complicated by the fact that after
5570 + we shifted something to the left or right neighbors trying to free some
5571 + space, item we were supposed to paste into can be in different node than
5572 + insertion coord. If so, we are no longer doing paste, but insert. See
5573 + comments in insert_paste_common().
5574 +
5575 +*/
5576 +static int carry_paste(carry_op * op /* operation to be performed */ ,
5577 + carry_level * doing UNUSED_ARG /* current carry
5578 + * level */ ,
5579 + carry_level * todo /* next carry level */ )
5580 +{
5581 + znode *node;
5582 + carry_insert_data cdata;
5583 + coord_t dcoord;
5584 + reiser4_item_data data;
5585 + int result;
5586 + int real_size;
5587 + item_plugin *iplug;
5588 + carry_plugin_info info;
5589 + coord_t *coord;
5590 +
5591 + assert("nikita-982", op != NULL);
5592 + assert("nikita-983", todo != NULL);
5593 + assert("nikita-984", op->op == COP_PASTE);
5594 +
5595 + coord_init_zero(&dcoord);
5596 +
5597 + result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5598 + if (result != 0)
5599 + return result;
5600 +
5601 + coord = op->u.insert.d->coord;
5602 +
5603 + /* handle case when op -> u.insert.coord doesn't point to the item
5604 + of required type. restart as insert. */
5605 + if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5606 + op->op = COP_INSERT;
5607 + op->u.insert.type = COPT_PASTE_RESTARTED;
5608 + result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5609 +
5610 + return result;
5611 + }
5612 +
5613 + node = coord->node;
5614 + iplug = item_plugin_by_coord(coord);
5615 + assert("nikita-992", iplug != NULL);
5616 +
5617 + assert("nikita-985", node != NULL);
5618 + assert("nikita-986", node_plugin_by_node(node) != NULL);
5619 +
5620 + assert("nikita-987",
5621 + space_needed_for_op(node, op) <= znode_free_space(node));
5622 +
5623 + assert("nikita-1286", coord_is_existing_item(coord));
5624 +
5625 + /*
5626 + * if item is expanded as a result of this operation, we should first
5627 + * change item size, than call ->b.paste item method. If item is
5628 + * shrunk, it should be done other way around: first call ->b.paste
5629 + * method, then reduce item size.
5630 + */
5631 +
5632 + real_size = space_needed_for_op(node, op);
5633 + if (real_size > 0)
5634 + node->nplug->change_item_size(coord, real_size);
5635 +
5636 + doing->restartable = 0;
5637 + info.doing = doing;
5638 + info.todo = todo;
5639 +
5640 + result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5641 +
5642 + if (real_size < 0)
5643 + node->nplug->change_item_size(coord, real_size);
5644 +
5645 + /* if we pasted at the beginning of the item, update item's key. */
5646 + if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5647 + node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5648 +
5649 + znode_make_dirty(node);
5650 + return result;
5651 +}
5652 +
5653 +/* handle carry COP_EXTENT operation. */
5654 +static int carry_extent(carry_op * op /* operation to perform */ ,
5655 + carry_level * doing /* queue of operations @op
5656 + * is part of */ ,
5657 + carry_level * todo /* queue where new operations
5658 + * are accumulated */ )
5659 +{
5660 + znode *node;
5661 + carry_insert_data cdata;
5662 + coord_t coord;
5663 + reiser4_item_data data;
5664 + carry_op *delete_dummy;
5665 + carry_op *insert_extent;
5666 + int result;
5667 + carry_plugin_info info;
5668 +
5669 + assert("nikita-1751", op != NULL);
5670 + assert("nikita-1752", todo != NULL);
5671 + assert("nikita-1753", op->op == COP_EXTENT);
5672 +
5673 + /* extent insertion overview:
5674 +
5675 + extents live on the TWIG LEVEL, which is level one above the leaf
5676 + one. This complicates extent insertion logic somewhat: it may
5677 + happen (and going to happen all the time) that in logical key
5678 + ordering extent has to be placed between items I1 and I2, located
5679 + at the leaf level, but I1 and I2 are in the same formatted leaf
5680 + node N1. To insert extent one has to
5681 +
5682 + (1) reach node N1 and shift data between N1, its neighbors and
5683 + possibly newly allocated nodes until I1 and I2 fall into different
5684 + nodes. Since I1 and I2 are still neighboring items in logical key
5685 + order, they will be necessary utmost items in their respective
5686 + nodes.
5687 +
5688 + (2) After this new extent item is inserted into node on the twig
5689 + level.
5690 +
5691 + Fortunately this process can reuse almost all code from standard
5692 + insertion procedure (viz. make_space() and insert_paste_common()),
5693 + due to the following observation: make_space() only shifts data up
5694 + to and excluding or including insertion point. It never
5695 + "over-moves" through insertion point. Thus, one can use
5696 + make_space() to perform step (1). All required for this is just to
5697 + instruct free_space_shortage() to keep make_space() shifting data
5698 + until insertion point is at the node border.
5699 +
5700 + */
5701 +
5702 + /* perform common functionality of insert and paste. */
5703 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5704 + if (result != 0)
5705 + return result;
5706 +
5707 + node = op->u.extent.d->coord->node;
5708 + assert("nikita-1754", node != NULL);
5709 + assert("nikita-1755", node_plugin_by_node(node) != NULL);
5710 + assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5711 +
5712 + /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5713 + extent fits between items. */
5714 +
5715 + info.doing = doing;
5716 + info.todo = todo;
5717 +
5718 + /* there is another complication due to placement of extents on the
5719 + twig level: extents are "rigid" in the sense that key-range
5720 + occupied by extent cannot grow indefinitely to the right as it is
5721 + for the formatted leaf nodes. Because of this when search finds two
5722 + adjacent extents on the twig level, it has to "drill" to the leaf
5723 + level, creating new node. Here we are removing this node.
5724 + */
5725 + if (node_is_empty(node)) {
5726 + delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
5727 + if (IS_ERR(delete_dummy))
5728 + return PTR_ERR(delete_dummy);
5729 + delete_dummy->u.delete.child = NULL;
5730 + delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
5731 + ZF_SET(node, JNODE_HEARD_BANSHEE);
5732 + }
5733 +
5734 + /* proceed with inserting extent item into parent. We are definitely
5735 + inserting rather than pasting if we get that far. */
5736 + insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
5737 + if (IS_ERR(insert_extent))
5738 + /* @delete_dummy will be automatically destroyed on the level
5739 + exiting */
5740 + return PTR_ERR(insert_extent);
5741 + /* NOTE-NIKITA insertion by key is simplest option here. Another
5742 + possibility is to insert on the left or right of already existing
5743 + item.
5744 + */
5745 + insert_extent->u.insert.type = COPT_KEY;
5746 + insert_extent->u.insert.d = op->u.extent.d;
5747 + assert("nikita-1719", op->u.extent.d->key != NULL);
5748 + insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
5749 + insert_extent->u.insert.flags =
5750 + znode_get_tree(node)->carry.new_extent_flags;
5751 +
5752 + /*
5753 + * if carry was asked to track lock handle we should actually track
5754 + * lock handle on the twig node rather than on the leaf where
5755 + * operation was started from. Transfer tracked lock handle.
5756 + */
5757 + if (doing->track_type) {
5758 + assert("nikita-3242", doing->tracked != NULL);
5759 + assert("nikita-3244", todo->tracked == NULL);
5760 + todo->tracked = doing->tracked;
5761 + todo->track_type = CARRY_TRACK_NODE;
5762 + doing->tracked = NULL;
5763 + doing->track_type = 0;
5764 + }
5765 +
5766 + return 0;
5767 +}
5768 +
5769 +/* update key in @parent between pointers to @left and @right.
5770 +
5771 + Find coords of @left and @right and update delimiting key between them.
5772 + This is helper function called by carry_update(). Finds position of
5773 + internal item involved. Updates item key. Updates delimiting keys of child
5774 + nodes involved.
5775 +*/
5776 +static int update_delimiting_key(znode * parent /* node key is updated
5777 + * in */ ,
5778 + znode * left /* child of @parent */ ,
5779 + znode * right /* child of @parent */ ,
5780 + carry_level * doing /* current carry
5781 + * level */ ,
5782 + carry_level * todo /* parent carry
5783 + * level */ ,
5784 + const char **error_msg /* place to
5785 + * store error
5786 + * message */ )
5787 +{
5788 + coord_t left_pos;
5789 + coord_t right_pos;
5790 + int result;
5791 + reiser4_key ldkey;
5792 + carry_plugin_info info;
5793 +
5794 + assert("nikita-1177", right != NULL);
5795 +	/* find position of the right child in a parent */
5796 + result = find_child_ptr(parent, right, &right_pos);
5797 + if (result != NS_FOUND) {
5798 + *error_msg = "Cannot find position of right child";
5799 + return result;
5800 + }
5801 +
5802 + if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
5803 + /* find position of the left child in a parent */
5804 + result = find_child_ptr(parent, left, &left_pos);
5805 + if (result != NS_FOUND) {
5806 + *error_msg = "Cannot find position of left child";
5807 + return result;
5808 + }
5809 + assert("nikita-1355", left_pos.node != NULL);
5810 + } else
5811 + left_pos.node = NULL;
5812 +
5813 + /* check that they are separated by exactly one key and are basically
5814 + sane */
5815 + if (REISER4_DEBUG) {
5816 + if ((left_pos.node != NULL)
5817 + && !coord_is_existing_unit(&left_pos)) {
5818 + *error_msg = "Left child is bastard";
5819 + return RETERR(-EIO);
5820 + }
5821 + if (!coord_is_existing_unit(&right_pos)) {
5822 + *error_msg = "Right child is bastard";
5823 + return RETERR(-EIO);
5824 + }
5825 + if (left_pos.node != NULL &&
5826 + !coord_are_neighbors(&left_pos, &right_pos)) {
5827 + *error_msg = "Children are not direct siblings";
5828 + return RETERR(-EIO);
5829 + }
5830 + }
5831 + *error_msg = NULL;
5832 +
5833 + info.doing = doing;
5834 + info.todo = todo;
5835 +
5836 + /*
5837 + * If child node is not empty, new key of internal item is a key of
5838 + * leftmost item in the child node. If the child is empty, take its
5839 + * right delimiting key as a new key of the internal item. Precise key
5840 + * in the latter case is not important per se, because the child (and
5841 + * the internal item) are going to be killed shortly anyway, but we
5842 + * have to preserve correct order of keys in the parent node.
5843 + */
5844 +
5845 + if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
5846 + leftmost_key_in_node(right, &ldkey);
5847 + else {
5848 + read_lock_dk(znode_get_tree(parent));
5849 + ldkey = *znode_get_rd_key(right);
5850 + read_unlock_dk(znode_get_tree(parent));
5851 + }
5852 + node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
5853 + doing->restartable = 0;
5854 + znode_make_dirty(parent);
5855 + return 0;
5856 +}
5857 +
5858 +/* implements COP_UPDATE operation
5859 +
5860 + Update delimiting keys.
5861 +
5862 +*/
5863 +static int carry_update(carry_op * op /* operation to be performed */ ,
5864 + carry_level * doing /* current carry level */ ,
5865 + carry_level * todo /* next carry level */ )
5866 +{
5867 + int result;
5868 + carry_node *missing UNUSED_ARG;
5869 + znode *left;
5870 + znode *right;
5871 + carry_node *lchild;
5872 + carry_node *rchild;
5873 + const char *error_msg;
5874 + reiser4_tree *tree;
5875 +
5876 + /*
5877 + * This operation is called to update key of internal item. This is
5878 +	 * necessary when carry shifted or cut data on the child
5879 + * level. Arguments of this operation are:
5880 + *
5881 + * @right --- child node. Operation should update key of internal
5882 + * item pointing to @right.
5883 + *
5884 + * @left --- left neighbor of @right. This parameter is optional.
5885 + */
5886 +
5887 + assert("nikita-902", op != NULL);
5888 + assert("nikita-903", todo != NULL);
5889 + assert("nikita-904", op->op == COP_UPDATE);
5890 +
5891 + lchild = op->u.update.left;
5892 + rchild = op->node;
5893 +
5894 + if (lchild != NULL) {
5895 + assert("nikita-1001", lchild->parent);
5896 + assert("nikita-1003", !lchild->left);
5897 + left = reiser4_carry_real(lchild);
5898 + } else
5899 + left = NULL;
5900 +
5901 + tree = znode_get_tree(rchild->node);
5902 + read_lock_tree(tree);
5903 + right = znode_parent(rchild->node);
5904 + read_unlock_tree(tree);
5905 +
5906 + if (right != NULL) {
5907 + result = update_delimiting_key(right,
5908 + lchild ? lchild->node : NULL,
5909 + rchild->node,
5910 + doing, todo, &error_msg);
5911 + } else {
5912 + error_msg = "Cannot find node to update key in";
5913 + result = RETERR(-EIO);
5914 + }
5915 + /* operation will be reposted to the next level by the
5916 + ->update_item_key() method of node plugin, if necessary. */
5917 +
5918 + if (result != 0) {
5919 + warning("nikita-999", "Error updating delimiting key: %s (%i)",
5920 + error_msg ? : "", result);
5921 + }
5922 + return result;
5923 +}
5924 +
5925 +/* move items from @node during carry */
5926 +static int carry_shift_data(sideof side /* in what direction to move data */ ,
5927 + coord_t * insert_coord /* coord where new item
5928 + * is to be inserted */ ,
5929 + znode * node /* node which data are moved from */ ,
5930 + carry_level * doing /* active carry queue */ ,
5931 + carry_level * todo /* carry queue where new
5932 + * operations are to be put
5933 + * in */ ,
5934 + unsigned int including_insert_coord_p /* true if
5935 + * @insertion_coord
5936 + * can be moved */ )
5937 +{
5938 + int result;
5939 + znode *source;
5940 + carry_plugin_info info;
5941 + node_plugin *nplug;
5942 +
5943 + source = insert_coord->node;
5944 +
5945 + info.doing = doing;
5946 + info.todo = todo;
5947 +
5948 + nplug = node_plugin_by_node(node);
5949 + result = nplug->shift(insert_coord, node,
5950 + (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
5951 + (int)including_insert_coord_p, &info);
5952 + /* the only error ->shift() method of node plugin can return is
5953 + -ENOMEM due to carry node/operation allocation. */
5954 + assert("nikita-915", result >= 0 || result == -ENOMEM);
5955 + if (result > 0) {
5956 + /*
5957 + * if some number of bytes was actually shifted, mark nodes
5958 + * dirty, and carry level as non-restartable.
5959 + */
5960 + doing->restartable = 0;
5961 + znode_make_dirty(source);
5962 + znode_make_dirty(node);
5963 + }
5964 +
5965 + assert("nikita-2077", coord_check(insert_coord));
5966 + return 0;
5967 +}
5968 +
5969 +typedef carry_node *(*carry_iterator) (carry_node * node);
5970 +static carry_node *find_dir_carry(carry_node * node, carry_level * level,
5971 + carry_iterator iterator);
5972 +
5973 +static carry_node *pool_level_list_prev(carry_node *node)
5974 +{
5975 + return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
5976 +}
5977 +
5978 +/* look for the left neighbor of given carry node in a carry queue.
5979 +
5980 + This is used by find_left_neighbor(), but I am not sure that this
5981 + really gives any advantage. More statistics required.
5982 +
5983 +*/
5984 +carry_node *find_left_carry(carry_node * node /* node to find left neighbor
5985 + * of */ ,
5986 + carry_level * level /* level to scan */ )
5987 +{
5988 + return find_dir_carry(node, level,
5989 + (carry_iterator) pool_level_list_prev);
5990 +}
5991 +
5992 +static carry_node *pool_level_list_next(carry_node *node)
5993 +{
5994 + return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
5995 +}
5996 +
5997 +/* look for the right neighbor of given carry node in a
5998 + carry queue.
5999 +
6000 + This is used by find_right_neighbor(), but I am not sure that this
6001 + really gives any advantage. More statistics required.
6002 +
6003 +*/
6004 +carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6005 + * of */ ,
6006 + carry_level * level /* level to scan */ )
6007 +{
6008 + return find_dir_carry(node, level,
6009 + (carry_iterator) pool_level_list_next);
6010 +}
6011 +
6012 +/* look for the left or right neighbor of given carry node in a carry
6013 + queue.
6014 +
6015 + Helper function used by find_{left|right}_carry().
6016 +*/
6017 +static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6018 + * from */ ,
6019 + carry_level * level /* level to scan */ ,
6020 + carry_iterator iterator /* operation to
6021 + * move to the next
6022 + * node */ )
6023 +{
6024 + carry_node *neighbor;
6025 +
6026 + assert("nikita-1059", node != NULL);
6027 + assert("nikita-1060", level != NULL);
6028 +
6029 + /* scan list of carry nodes on this list dir-ward, skipping all
6030 + carry nodes referencing the same znode. */
6031 + neighbor = node;
6032 + while (1) {
6033 + neighbor = iterator(neighbor);
6034 + if (carry_node_end(level, neighbor))
6035 + /* list head is reached */
6036 + return NULL;
6037 + if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6038 + return neighbor;
6039 + }
6040 +}
6041 +
6042 +/*
6043 + * Memory reservation estimation.
6044 + *
6045 + * Carry process proceeds through tree levels upwards. Carry assumes that it
6046 + * takes tree in consistent state (e.g., that search tree invariants hold),
6047 + * and leaves tree consistent after it finishes. This means that when some
6048 + * error occurs carry cannot simply return if there are pending carry
6049 + * operations. Generic solution for this problem is carry-undo either as
6050 + * transaction manager feature (requiring checkpoints and isolation), or
6051 + * through some carry specific mechanism.
6052 + *
6053 + * Our current approach is to panic if carry hits an error while tree is
6054 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6055 + * this "memory reservation" mechanism was added.
6056 + *
6057 + * Memory reservation is implemented by perthread-pages.diff patch from
6058 + * core-patches. Its API is defined in <linux/gfp.h>
6059 + *
6060 + * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6061 + * void perthread_pages_release(int nrpages);
6062 + * int perthread_pages_count(void);
6063 + *
6064 + * carry estimates its worst case memory requirements at the entry, reserves
6065 + * enough memory, and releases unused pages before returning.
6066 + *
6067 + * Code below estimates worst case memory requirements for a given carry
6068 + * queue. This is done by summing worst case memory requirements for each
6069 + * operation in the queue.
6070 + *
6071 + */
6072 +
6073 +/*
6074 + * Memory requirements of many operations depend on the tree
6075 + * height. For example, item insertion requires new node to be inserted at
6076 + * each tree level in the worst case. What tree height should be used for
6077 + * estimation? Current tree height is wrong, because tree height can change
6078 + * between the time when estimation was done and the time when operation is
6079 + * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6080 + * is also not desirable, because it would lead to the huge over-estimation
6081 + * all the time. Plausible solution is "capped tree height": if current tree
6082 + * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
6083 + * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
6084 + * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
6085 + * to be increased even more during short interval of time.
6086 + */
6087 +#define TREE_HEIGHT_CAP (5)
6088 +
6089 +/* return capped tree height for the @tree. See comment above. */
6090 +static int cap_tree_height(reiser4_tree * tree)
6091 +{
6092 + return max_t(int, tree->height, TREE_HEIGHT_CAP);
6093 +}
6094 +
6095 +/* return capped tree height for the current tree. */
6096 +static int capped_height(void)
6097 +{
6098 + return cap_tree_height(current_tree);
6099 +}
6100 +
6101 +/* return number of pages required to store given number of bytes */
6102 +static int bytes_to_pages(int bytes)
6103 +{
6104 + return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6105 +}
6106 +
6107 +/* how many pages are required to allocate znodes during item insertion. */
6108 +static int carry_estimate_znodes(void)
6109 +{
6110 + /*
6111 +	 * Note that we have a problem here: there is no way to
6112 + * reserve pages specifically for the given slab. This means that
6113 + * these pages can be hijacked for some other end.
6114 + */
6115 +
6116 + /* in the worst case we need 3 new znode on each tree level */
6117 + return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6118 +}
6119 +
6120 +/*
6121 + * how many pages are required to load bitmaps. One bitmap per level.
6122 + */
6123 +static int carry_estimate_bitmaps(void)
6124 +{
6125 + if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6126 + int bytes;
6127 +
6128 +		bytes = capped_height() * (0 +	/* bnode should be added, but it is private to
6129 + * bitmap.c, skip for now. */
6130 + 2 * sizeof(jnode)); /* working and commit jnodes */
6131 + return bytes_to_pages(bytes) + 2; /* and their contents */
6132 + } else
6133 + /* bitmaps were pre-loaded during mount */
6134 + return 0;
6135 +}
6136 +
6137 +/* worst case item insertion memory requirements */
6138 +static int carry_estimate_insert(carry_op * op, carry_level * level)
6139 +{
6140 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6141 + capped_height() + /* new block on each level */
6142 + 1 + /* and possibly extra new block at the leaf level */
6143 + 3; /* loading of leaves into memory */
6144 +}
6145 +
6146 +/* worst case item deletion memory requirements */
6147 +static int carry_estimate_delete(carry_op * op, carry_level * level)
6148 +{
6149 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6150 + 3; /* loading of leaves into memory */
6151 +}
6152 +
6153 +/* worst case tree cut memory requirements */
6154 +static int carry_estimate_cut(carry_op * op, carry_level * level)
6155 +{
6156 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6157 + 3; /* loading of leaves into memory */
6158 +}
6159 +
6160 +/* worst case memory requirements of pasting into item */
6161 +static int carry_estimate_paste(carry_op * op, carry_level * level)
6162 +{
6163 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6164 + capped_height() + /* new block on each level */
6165 + 1 + /* and possibly extra new block at the leaf level */
6166 + 3; /* loading of leaves into memory */
6167 +}
6168 +
6169 +/* worst case memory requirements of extent insertion */
6170 +static int carry_estimate_extent(carry_op * op, carry_level * level)
6171 +{
6172 + return carry_estimate_insert(op, level) + /* insert extent */
6173 + carry_estimate_delete(op, level); /* kill leaf */
6174 +}
6175 +
6176 +/* worst case memory requirements of key update */
6177 +static int carry_estimate_update(carry_op * op, carry_level * level)
6178 +{
6179 + return 0;
6180 +}
6181 +
6182 +/* worst case memory requirements of flow insertion */
6183 +static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6184 +{
6185 + int newnodes;
6186 +
6187 + newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6188 + CARRY_FLOW_NEW_NODES_LIMIT);
6189 + /*
6190 + * roughly estimate insert_flow as a sequence of insertions.
6191 + */
6192 + return newnodes * carry_estimate_insert(op, level);
6193 +}
6194 +
6195 +/* This is dispatch table for carry operations. It can be trivially
6196 + abstracted into useful plugin: tunable balancing policy is a good
6197 + thing. */
6198 +carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6199 + [COP_INSERT] = {
6200 + .handler = carry_insert,
6201 + .estimate = carry_estimate_insert}
6202 + ,
6203 + [COP_DELETE] = {
6204 + .handler = carry_delete,
6205 + .estimate = carry_estimate_delete}
6206 + ,
6207 + [COP_CUT] = {
6208 + .handler = carry_cut,
6209 + .estimate = carry_estimate_cut}
6210 + ,
6211 + [COP_PASTE] = {
6212 + .handler = carry_paste,
6213 + .estimate = carry_estimate_paste}
6214 + ,
6215 + [COP_EXTENT] = {
6216 + .handler = carry_extent,
6217 + .estimate = carry_estimate_extent}
6218 + ,
6219 + [COP_UPDATE] = {
6220 + .handler = carry_update,
6221 + .estimate = carry_estimate_update}
6222 + ,
6223 + [COP_INSERT_FLOW] = {
6224 + .handler = carry_insert_flow,
6225 + .estimate = carry_estimate_insert_flow}
6226 +};
6227 +
6228 +/* Make Linus happy.
6229 + Local variables:
6230 + c-indentation-style: "K&R"
6231 + mode-name: "LC"
6232 + c-basic-offset: 8
6233 + tab-width: 8
6234 + fill-column: 120
6235 + scroll-step: 1
6236 + End:
6237 +*/
6238 diff -urN linux-2.6.20.orig/fs/reiser4/carry_ops.h linux-2.6.20/fs/reiser4/carry_ops.h
6239 --- linux-2.6.20.orig/fs/reiser4/carry_ops.h 1970-01-01 03:00:00.000000000 +0300
6240 +++ linux-2.6.20/fs/reiser4/carry_ops.h 2007-05-06 14:50:43.694974475 +0400
6241 @@ -0,0 +1,42 @@
6242 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6243 +
6244 +/* implementation of carry operations. See carry_ops.c for details. */
6245 +
6246 +#if !defined( __CARRY_OPS_H__ )
6247 +#define __CARRY_OPS_H__
6248 +
6249 +#include "forward.h"
6250 +#include "znode.h"
6251 +#include "carry.h"
6252 +
6253 +/* carry operation handlers */
6254 +typedef struct carry_op_handler {
6255 + /* perform operation */
6256 + int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6257 + /* estimate memory requirements for @op */
6258 + int (*estimate) (carry_op * op, carry_level * level);
6259 +} carry_op_handler;
6260 +
6261 +/* This is dispatch table for carry operations. It can be trivially
6262 + abstracted into useful plugin: tunable balancing policy is a good
6263 + thing. */
6264 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6265 +
6266 +unsigned int space_needed(const znode * node, const coord_t * coord,
6267 + const reiser4_item_data * data, int inserting);
6268 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6269 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6270 +
6271 +/* __CARRY_OPS_H__ */
6272 +#endif
6273 +
6274 +/* Make Linus happy.
6275 + Local variables:
6276 + c-indentation-style: "K&R"
6277 + mode-name: "LC"
6278 + c-basic-offset: 8
6279 + tab-width: 8
6280 + fill-column: 120
6281 + scroll-step: 1
6282 + End:
6283 +*/
6284 diff -urN linux-2.6.20.orig/fs/reiser4/context.c linux-2.6.20/fs/reiser4/context.c
6285 --- linux-2.6.20.orig/fs/reiser4/context.c 1970-01-01 03:00:00.000000000 +0300
6286 +++ linux-2.6.20/fs/reiser4/context.c 2007-05-06 14:50:43.694974475 +0400
6287 @@ -0,0 +1,288 @@
6288 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6289 +
6290 +/* Manipulation of reiser4_context */
6291 +
6292 +/*
6293 + * global context used during system call. Variable of this type is allocated
6294 + * on the stack at the beginning of the reiser4 part of the system call and
6295 + * pointer to it is stored in the current->fs_context. This allows us to avoid
6296 + * passing pointer to current transaction and current lockstack (both in
6297 + * one-to-one mapping with threads) all over the call chain.
6298 + *
6299 + * It's kind of like those global variables the prof used to tell you not to
6300 + * use in CS1, except thread specific.;-) Nikita, this was a good idea.
6301 + *
6302 + * In some situations it is desirable to have ability to enter reiser4_context
6303 + * more than once for the same thread (nested contexts). For example, there
6304 + * are some functions that can be called either directly from VFS/VM or from
6305 + * already active reiser4 context (->writepage, for example).
6306 + *
6307 + * In such situations "child" context acts like dummy: all activity is
6308 + * actually performed in the top level context, and get_current_context()
6309 + * always returns top level context.
6310 + * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6311 + * nested any way.
6312 + *
6313 + * Note that there is an important difference between reiser4 uses
6314 + * ->fs_context and the way other file systems use it. Other file systems
6315 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6316 + * (this is why ->fs_context was initially called ->journal_info). This means,
6317 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6318 + * to the file system, they assume that some transaction is already underway,
6319 + * and usually bail out, because starting nested transaction would most likely
6320 + * lead to the deadlock. This gives false positives with reiser4, because we
6321 + * set ->fs_context before starting transaction.
6322 + */
6323 +
6324 +#include "debug.h"
6325 +#include "super.h"
6326 +#include "context.h"
6327 +
6328 +#include <linux/writeback.h> /* balance_dirty_pages() */
6329 +#include <linux/hardirq.h>
6330 +
6331 +static void _reiser4_init_context(reiser4_context * context,
6332 + struct super_block *super)
6333 +{
6334 + memset(context, 0, sizeof(*context));
6335 +
6336 + context->super = super;
6337 + context->magic = context_magic;
6338 + context->outer = current->journal_info;
6339 + current->journal_info = (void *)context;
6340 + context->nr_children = 0;
6341 + context->gfp_mask = GFP_KERNEL;
6342 +
6343 + init_lock_stack(&context->stack);
6344 +
6345 + reiser4_txn_begin(context);
6346 +
6347 + /* initialize head of tap list */
6348 + INIT_LIST_HEAD(&context->taps);
6349 +#if REISER4_DEBUG
6350 + context->task = current;
6351 +#endif
6352 + grab_space_enable();
6353 +}
6354 +
6355 +/* initialize context and bind it to the current thread
6356 +
6357 + This function should be called at the beginning of reiser4 part of
6358 + syscall.
6359 +*/
6360 +reiser4_context * reiser4_init_context(struct super_block * super)
6361 +{
6362 + reiser4_context *context;
6363 +
6364 + assert("nikita-2662", !in_interrupt() && !in_irq());
6365 + assert("nikita-3357", super != NULL);
6366 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6367 +
6368 + context = get_current_context_check();
6369 + if (context && context->super == super) {
6370 + context = (reiser4_context *) current->journal_info;
6371 + context->nr_children++;
6372 + return context;
6373 + }
6374 +
6375 + context = kmalloc(sizeof(*context), GFP_KERNEL);
6376 + if (context == NULL)
6377 + return ERR_PTR(RETERR(-ENOMEM));
6378 +
6379 + _reiser4_init_context(context, super);
6380 + return context;
6381 +}
6382 +
6383 +/* this is used in scan_mgr which is called with spinlock held and in
6384 + reiser4_fill_super magic */
6385 +void init_stack_context(reiser4_context *context, struct super_block *super)
6386 +{
6387 + assert("nikita-2662", !in_interrupt() && !in_irq());
6388 + assert("nikita-3357", super != NULL);
6389 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6390 + assert("vs-12", !is_in_reiser4_context());
6391 +
6392 + _reiser4_init_context(context, super);
6393 + context->on_stack = 1;
6394 + return;
6395 +}
6396 +
6397 +/* cast lock stack embedded into reiser4 context up to its container */
6398 +reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6399 +{
6400 + return container_of(owner, reiser4_context, stack);
6401 +}
6402 +
6403 +/* true if there is already _any_ reiser4 context for the current thread */
6404 +int is_in_reiser4_context(void)
6405 +{
6406 + reiser4_context *ctx;
6407 +
6408 + ctx = current->journal_info;
6409 + return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6410 +}
6411 +
6412 +/*
6413 + * call balance dirty pages for the current context.
6414 + *
6415 + * File system is expected to call balance_dirty_pages_ratelimited() whenever
6416 + * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6417 + * write---this covers vast majority of all dirty traffic), but we cannot do
6418 + * this immediately when formatted node is dirtied, because long term lock is
6419 + * usually held at that time. To work around this, dirtying of formatted node
6420 + * simply increases ->nr_marked_dirty counter in the current reiser4
6421 + * context. When we are about to leave this context,
6422 + * balance_dirty_pages_ratelimited() is called, if necessary.
6423 + *
6424 + * This introduces another problem: sometimes we do not want to run
6425 + * balance_dirty_pages_ratelimited() when leaving a context, for example
6426 + * because some important lock (like ->i_mutex on the parent directory) is
6427 + * held. To achieve this, ->nobalance flag can be set in the current context.
6428 + */
6429 +static void balance_dirty_pages_at(reiser4_context *context)
6430 +{
6431 + reiser4_super_info_data *sbinfo = get_super_private(context->super);
6432 +
6433 + /*
6434 + * call balance_dirty_pages_ratelimited() to process formatted nodes
6435 + * dirtied during this system call. Do that only if we are not in mount
6436 + * and there were nodes dirtied in this context and we are not in
6437 + * writepage (to avoid deadlock) and not in pdflush
6438 + */
6439 + if (sbinfo != NULL && sbinfo->fake != NULL &&
6440 + context->nr_marked_dirty != 0 &&
6441 + !(current->flags & PF_MEMALLOC) &&
6442 + !current_is_pdflush())
6443 + balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6444 +}
6445 +
6446 +/* release resources associated with context.
6447 +
6448 + This function should be called at the end of "session" with reiser4,
6449 + typically just before leaving reiser4 driver back to VFS.
6450 +
6451 + This is good place to put some degugging consistency checks, like that
6452 + thread released all locks and closed transcrash etc.
6453 +
6454 +*/
6455 +static void reiser4_done_context(reiser4_context * context /* context being released */ )
6456 +{
6457 + assert("nikita-860", context != NULL);
6458 + assert("nikita-859", context->magic == context_magic);
6459 + assert("vs-646", (reiser4_context *) current->journal_info == context);
6460 + assert("zam-686", !in_interrupt() && !in_irq());
6461 +
6462 + /* only do anything when leaving top-level reiser4 context. All nested
6463 + * contexts are just dummies. */
6464 + if (context->nr_children == 0) {
6465 + assert("jmacd-673", context->trans == NULL);
6466 + assert("jmacd-1002", lock_stack_isclean(&context->stack));
6467 + assert("nikita-1936", reiser4_no_counters_are_held());
6468 + assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6469 + assert("zam-1004", ergo(get_super_private(context->super),
6470 + get_super_private(context->super)->delete_mutex_owner !=
6471 + current));
6472 +
6473 + /* release all grabbed but as yet unused blocks */
6474 + if (context->grabbed_blocks != 0)
6475 + all_grabbed2free();
6476 +
6477 + /*
6478 + * synchronize against longterm_unlock_znode():
6479 + * wake_up_requestor() wakes up requestors without holding
6480 + * zlock (otherwise they will immediately bump into that lock
6481 + * after wake up on another CPU). To work around (rare)
6482 + * situation where requestor has been woken up asynchronously
6483 + * and managed to run until completion (and destroy its
6484 + * context and lock stack) before wake_up_requestor() called
6485 + * wake_up() on it, wake_up_requestor() synchronize on lock
6486 + * stack spin lock. It has actually been observed that spin
6487 + * lock _was_ locked at this point, because
6488 + * wake_up_requestor() took interrupt.
6489 + */
6490 + spin_lock_stack(&context->stack);
6491 + spin_unlock_stack(&context->stack);
6492 +
6493 + assert("zam-684", context->nr_children == 0);
6494 + /* restore original ->fs_context value */
6495 + current->journal_info = context->outer;
6496 + if (context->on_stack == 0)
6497 + kfree(context);
6498 + } else {
6499 + context->nr_children--;
6500 +#if REISER4_DEBUG
6501 + assert("zam-685", context->nr_children >= 0);
6502 +#endif
6503 + }
6504 +}
6505 +
6506 +/*
6507 + * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6508 + * transaction. Call done_context() to do context related book-keeping.
6509 + */
6510 +void reiser4_exit_context(reiser4_context * context)
6511 +{
6512 + assert("nikita-3021", reiser4_schedulable());
6513 +
6514 + if (context->nr_children == 0) {
6515 + if (!context->nobalance) {
6516 + reiser4_txn_restart(context);
6517 + balance_dirty_pages_at(context);
6518 + }
6519 +
6520 + /* if filesystem is mounted with -o sync or -o dirsync - commit
6521 + transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
6522 + commiting on exit_context when inode semaphore is held and
6523 + to have ktxnmgrd to do commit instead to get better
6524 + concurrent filesystem accesses. But, when one mounts with -o
6525 + sync, he cares more about reliability than about
6526 + performance. So, for now we have this simple mount -o sync
6527 + support. */
6528 + if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6529 + txn_atom *atom;
6530 +
6531 + atom = get_current_atom_locked_nocheck();
6532 + if (atom) {
6533 + atom->flags |= ATOM_FORCE_COMMIT;
6534 + context->trans->flags &= ~TXNH_DONT_COMMIT;
6535 + spin_unlock_atom(atom);
6536 + }
6537 + }
6538 + reiser4_txn_end(context);
6539 + }
6540 + reiser4_done_context(context);
6541 +}
6542 +
6543 +void reiser4_ctx_gfp_mask_set(void)
6544 +{
6545 + reiser4_context *ctx;
6546 +
6547 + ctx = get_current_context();
6548 + if (ctx->entd == 0 &&
6549 + list_empty(&ctx->stack.locks) &&
6550 + ctx->trans->atom == NULL)
6551 + ctx->gfp_mask = GFP_KERNEL;
6552 + else
6553 + ctx->gfp_mask = GFP_NOFS;
6554 +}
6555 +
6556 +void reiser4_ctx_gfp_mask_force (gfp_t mask)
6557 +{
6558 + reiser4_context *ctx;
6559 + ctx = get_current_context();
6560 +
6561 + assert("edward-1454", ctx != NULL);
6562 +
6563 + ctx->gfp_mask = mask;
6564 +}
6565 +
6566 +/*
6567 + * Local variables:
6568 + * c-indentation-style: "K&R"
6569 + * mode-name: "LC"
6570 + * c-basic-offset: 8
6571 + * tab-width: 8
6572 + * fill-column: 120
6573 + * scroll-step: 1
6574 + * End:
6575 + */
6576 diff -urN linux-2.6.20.orig/fs/reiser4/context.h linux-2.6.20/fs/reiser4/context.h
6577 --- linux-2.6.20.orig/fs/reiser4/context.h 1970-01-01 03:00:00.000000000 +0300
6578 +++ linux-2.6.20/fs/reiser4/context.h 2007-05-06 14:50:43.698975725 +0400
6579 @@ -0,0 +1,228 @@
6580 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6581 + * reiser4/README */
6582 +
6583 +/* Reiser4 context. See context.c for details. */
6584 +
6585 +#if !defined( __REISER4_CONTEXT_H__ )
6586 +#define __REISER4_CONTEXT_H__
6587 +
6588 +#include "forward.h"
6589 +#include "debug.h"
6590 +#include "dformat.h"
6591 +#include "tap.h"
6592 +#include "lock.h"
6593 +
6594 +#include <linux/types.h> /* for __u?? */
6595 +#include <linux/fs.h> /* for struct super_block */
6596 +#include <linux/spinlock.h>
6597 +#include <linux/sched.h> /* for struct task_struct */
6598 +
6599 +/* reiser4 per-thread context */
6600 +struct reiser4_context {
6601 + /* magic constant. For identification of reiser4 contexts. */
6602 + __u32 magic;
6603 +
6604 + /* current lock stack. See lock.[ch]. This is where list of all
6605 + locks taken by current thread is kept. This is also used in
6606 + deadlock detection. */
6607 + lock_stack stack;
6608 +
6609 + /* current transcrash. */
6610 + txn_handle *trans;
6611 + /* transaction handle embedded into reiser4_context. ->trans points
6612 + * here by default. */
6613 + txn_handle trans_in_ctx;
6614 +
6615 + /* super block we are working with. To get the current tree
6616 + use &get_super_private (reiser4_get_current_sb ())->tree. */
6617 + struct super_block *super;
6618 +
6619 + /* parent fs activation */
6620 + struct fs_activation *outer;
6621 +
6622 + /* per-thread grabbed (for further allocation) blocks counter */
6623 + reiser4_block_nr grabbed_blocks;
6624 +
6625 + /* list of taps currently monitored. See tap.c */
6626 + struct list_head taps;
6627 +
6628 + /* grabbing space is enabled */
6629 + unsigned int grab_enabled:1;
6630 + /* should be set when we are write dirty nodes to disk in jnode_flush or
6631 + * reiser4_write_logs() */
6632 + unsigned int writeout_mode:1;
6633 + /* true, if current thread is an ent thread */
6634 + unsigned int entd:1;
6635 + /* true, if balance_dirty_pages() should not be run when leaving this
6636 + * context. This is used to avoid lengthly balance_dirty_pages()
6637 + * operation when holding some important resource, like directory
6638 + * ->i_mutex */
6639 + unsigned int nobalance:1;
6640 +
6641 + /* this bit is used on reiser4_done_context to decide whether context is
6642 + kmalloc-ed and has to be kfree-ed */
6643 + unsigned int on_stack:1;
6644 +
6645 + /* count non-trivial jnode_set_dirty() calls */
6646 + unsigned long nr_marked_dirty;
6647 +
6648 + /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
6649 + * reiser4_writepages for each of dirty inodes. Reiser4_writepages
6650 + * captures pages. When number of pages captured in one
6651 + * reiser4_sync_inodes reaches some threshold - some atoms get
6652 + * flushed */
6653 + int nr_captured;
6654 + int nr_children; /* number of child contexts */
6655 +#if REISER4_DEBUG
6656 + /* debugging information about reiser4 locks held by the current
6657 + * thread */
6658 + reiser4_lock_counters_info locks;
6659 + struct task_struct *task; /* so we can easily find owner of the stack */
6660 +
6661 + /*
6662 + * disk space grabbing debugging support
6663 + */
6664 + /* how many disk blocks were grabbed by the first call to
6665 + * reiser4_grab_space() in this context */
6666 + reiser4_block_nr grabbed_initially;
6667 +
6668 + /* list of all threads doing flush currently */
6669 + struct list_head flushers_link;
6670 + /* information about last error encountered by reiser4 */
6671 + err_site err;
6672 +#endif
6673 + void *vp;
6674 + gfp_t gfp_mask;
6675 +};
6676 +
6677 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6678 +
6679 +/* Debugging helps. */
6680 +#if REISER4_DEBUG
6681 +extern void print_contexts(void);
6682 +#endif
6683 +
6684 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6685 +#define current_blocksize reiser4_get_current_sb()->s_blocksize
6686 +#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6687 +
6688 +extern reiser4_context *reiser4_init_context(struct super_block *);
6689 +extern void init_stack_context(reiser4_context *, struct super_block *);
6690 +extern void reiser4_exit_context(reiser4_context *);
6691 +
6692 +/* magic constant we store in reiser4_context allocated at the stack. Used to
6693 + catch accesses to staled or uninitialized contexts. */
6694 +#define context_magic ((__u32) 0x4b1b5d0b)
6695 +
6696 +extern int is_in_reiser4_context(void);
6697 +
6698 +/*
6699 + * return reiser4_context for the thread @tsk
6700 + */
6701 +static inline reiser4_context *get_context(const struct task_struct *tsk)
6702 +{
6703 + assert("vs-1682",
6704 + ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6705 + return (reiser4_context *) tsk->journal_info;
6706 +}
6707 +
6708 +/*
6709 + * return reiser4 context of the current thread, or NULL if there is none.
6710 + */
6711 +static inline reiser4_context *get_current_context_check(void)
6712 +{
6713 + if (is_in_reiser4_context())
6714 + return get_context(current);
6715 + else
6716 + return NULL;
6717 +}
6718 +
6719 +static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
6720 +
6721 +/* return context associated with current thread */
6722 +static inline reiser4_context *get_current_context(void)
6723 +{
6724 + return get_context(current);
6725 +}
6726 +
6727 +static inline gfp_t reiser4_ctx_gfp_mask_get(void)
6728 +{
6729 + reiser4_context *ctx;
6730 +
6731 + ctx = get_current_context_check();
6732 + return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
6733 +}
6734 +
6735 +void reiser4_ctx_gfp_mask_set(void);
6736 +void reiser4_ctx_gfp_mask_force (gfp_t mask);
6737 +
6738 +/*
6739 + * true if current thread is in the write-out mode. Thread enters write-out
6740 + * mode during jnode_flush and reiser4_write_logs().
6741 + */
6742 +static inline int is_writeout_mode(void)
6743 +{
6744 + return get_current_context()->writeout_mode;
6745 +}
6746 +
6747 +/*
6748 + * enter write-out mode
6749 + */
6750 +static inline void writeout_mode_enable(void)
6751 +{
6752 + assert("zam-941", !get_current_context()->writeout_mode);
6753 + get_current_context()->writeout_mode = 1;
6754 +}
6755 +
6756 +/*
6757 + * leave write-out mode
6758 + */
6759 +static inline void writeout_mode_disable(void)
6760 +{
6761 + assert("zam-942", get_current_context()->writeout_mode);
6762 + get_current_context()->writeout_mode = 0;
6763 +}
6764 +
6765 +static inline void grab_space_enable(void)
6766 +{
6767 + get_current_context()->grab_enabled = 1;
6768 +}
6769 +
6770 +static inline void grab_space_disable(void)
6771 +{
6772 + get_current_context()->grab_enabled = 0;
6773 +}
6774 +
6775 +static inline void grab_space_set_enabled(int enabled)
6776 +{
6777 + get_current_context()->grab_enabled = enabled;
6778 +}
6779 +
6780 +static inline int is_grab_enabled(reiser4_context * ctx)
6781 +{
6782 + return ctx->grab_enabled;
6783 +}
6784 +
6785 +/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
6786 + * flush would be performed when it is closed. This is necessary when handle
6787 + * has to be closed under some coarse semaphore, like i_mutex of
6788 + * directory. Commit will be performed by ktxnmgrd. */
6789 +static inline void context_set_commit_async(reiser4_context * context)
6790 +{
6791 + context->nobalance = 1;
6792 + context->trans->flags |= TXNH_DONT_COMMIT;
6793 +}
6794 +
6795 +/* __REISER4_CONTEXT_H__ */
6796 +#endif
6797 +
6798 +/* Make Linus happy.
6799 + Local variables:
6800 + c-indentation-style: "K&R"
6801 + mode-name: "LC"
6802 + c-basic-offset: 8
6803 + tab-width: 8
6804 + fill-column: 120
6805 + scroll-step: 1
6806 + End:
6807 +*/
6808 diff -urN linux-2.6.20.orig/fs/reiser4/coord.c linux-2.6.20/fs/reiser4/coord.c
6809 --- linux-2.6.20.orig/fs/reiser4/coord.c 1970-01-01 03:00:00.000000000 +0300
6810 +++ linux-2.6.20/fs/reiser4/coord.c 2007-05-06 14:50:43.698975725 +0400
6811 @@ -0,0 +1,935 @@
6812 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6813 +
6814 +#include "forward.h"
6815 +#include "debug.h"
6816 +#include "dformat.h"
6817 +#include "tree.h"
6818 +#include "plugin/item/item.h"
6819 +#include "znode.h"
6820 +#include "coord.h"
6821 +
6822 +/* Internal constructor. */
6823 +static inline void
6824 +coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
6825 + pos_in_node_t unit_pos, between_enum between)
6826 +{
6827 + coord->node = (znode *) node;
6828 + coord_set_item_pos(coord, item_pos);
6829 + coord->unit_pos = unit_pos;
6830 + coord->between = between;
6831 + ON_DEBUG(coord->plug_v = 0);
6832 + ON_DEBUG(coord->body_v = 0);
6833 +
6834 + /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
6835 +}
6836 +
6837 +/* after shifting of node content, coord previously set properly may become
6838 + invalid, try to "normalize" it. */
6839 +void coord_normalize(coord_t * coord)
6840 +{
6841 + znode *node;
6842 +
6843 + node = coord->node;
6844 + assert("vs-683", node);
6845 +
6846 + coord_clear_iplug(coord);
6847 +
6848 + if (node_is_empty(node)) {
6849 + coord_init_first_unit(coord, node);
6850 + } else if ((coord->between == AFTER_ITEM)
6851 + || (coord->between == AFTER_UNIT)) {
6852 + return;
6853 + } else if (coord->item_pos == coord_num_items(coord)
6854 + && coord->between == BEFORE_ITEM) {
6855 + coord_dec_item_pos(coord);
6856 + coord->between = AFTER_ITEM;
6857 + } else if (coord->unit_pos == coord_num_units(coord)
6858 + && coord->between == BEFORE_UNIT) {
6859 + coord->unit_pos--;
6860 + coord->between = AFTER_UNIT;
6861 + } else if (coord->item_pos == coord_num_items(coord)
6862 + && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
6863 + coord_dec_item_pos(coord);
6864 + coord->unit_pos = 0;
6865 + coord->between = AFTER_ITEM;
6866 + }
6867 +}
6868 +
6869 +/* Copy a coordinate. */
6870 +void coord_dup(coord_t * coord, const coord_t * old_coord)
6871 +{
6872 + assert("jmacd-9800", coord_check(old_coord));
6873 + coord_dup_nocheck(coord, old_coord);
6874 +}
6875 +
6876 +/* Copy a coordinate without check. Useful when old_coord->node is not
6877 + loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
6878 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
6879 +{
6880 + coord->node = old_coord->node;
6881 + coord_set_item_pos(coord, old_coord->item_pos);
6882 + coord->unit_pos = old_coord->unit_pos;
6883 + coord->between = old_coord->between;
6884 + coord->iplugid = old_coord->iplugid;
6885 + ON_DEBUG(coord->plug_v = old_coord->plug_v);
6886 + ON_DEBUG(coord->body_v = old_coord->body_v);
6887 +}
6888 +
6889 +/* Initialize an invalid coordinate. */
6890 +void coord_init_invalid(coord_t * coord, const znode * node)
6891 +{
6892 + coord_init_values(coord, node, 0, 0, INVALID_COORD);
6893 +}
6894 +
6895 +void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
6896 +{
6897 + coord_init_values(coord, node, 0, 0, AT_UNIT);
6898 +}
6899 +
6900 +/* Initialize a coordinate to point at the first unit of the first item. If the node is
6901 + empty, it is positioned at the EMPTY_NODE. */
6902 +void coord_init_first_unit(coord_t * coord, const znode * node)
6903 +{
6904 + int is_empty = node_is_empty(node);
6905 +
6906 + coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
6907 +
6908 + assert("jmacd-9801", coord_check(coord));
6909 +}
6910 +
6911 +/* Initialize a coordinate to point at the last unit of the last item. If the node is
6912 + empty, it is positioned at the EMPTY_NODE. */
6913 +void coord_init_last_unit(coord_t * coord, const znode * node)
6914 +{
6915 + int is_empty = node_is_empty(node);
6916 +
6917 + coord_init_values(coord, node,
6918 + (is_empty ? 0 : node_num_items(node) - 1), 0,
6919 + (is_empty ? EMPTY_NODE : AT_UNIT));
6920 + if (!is_empty)
6921 + coord->unit_pos = coord_last_unit_pos(coord);
6922 + assert("jmacd-9802", coord_check(coord));
6923 +}
6924 +
6925 +/* Initialize a coordinate to before the first item. If the node is empty, it is
6926 + positioned at the EMPTY_NODE. */
6927 +void coord_init_before_first_item(coord_t * coord, const znode * node)
6928 +{
6929 + int is_empty = node_is_empty(node);
6930 +
6931 + coord_init_values(coord, node, 0, 0,
6932 + (is_empty ? EMPTY_NODE : BEFORE_UNIT));
6933 +
6934 + assert("jmacd-9803", coord_check(coord));
6935 +}
6936 +
6937 +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
6938 + at the EMPTY_NODE. */
6939 +void coord_init_after_last_item(coord_t * coord, const znode * node)
6940 +{
6941 + int is_empty = node_is_empty(node);
6942 +
6943 + coord_init_values(coord, node,
6944 + (is_empty ? 0 : node_num_items(node) - 1), 0,
6945 + (is_empty ? EMPTY_NODE : AFTER_ITEM));
6946 +
6947 + assert("jmacd-9804", coord_check(coord));
6948 +}
6949 +
6950 +/* Initialize a coordinate to after last unit in the item. Coord must be set
6951 + already to existing item */
6952 +void coord_init_after_item_end(coord_t * coord)
6953 +{
6954 + coord->between = AFTER_UNIT;
6955 + coord->unit_pos = coord_last_unit_pos(coord);
6956 +}
6957 +
6958 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
6959 +void coord_init_before_item(coord_t * coord)
6960 +{
6961 + coord->unit_pos = 0;
6962 + coord->between = BEFORE_ITEM;
6963 +}
6964 +
6965 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
6966 +void coord_init_after_item(coord_t * coord)
6967 +{
6968 + coord->unit_pos = 0;
6969 + coord->between = AFTER_ITEM;
6970 +}
6971 +
6972 +/* Initialize a coordinate by 0s. Used in places where init_coord was used and
6973 + it was not clear how actually */
6974 +void coord_init_zero(coord_t * coord)
6975 +{
6976 + memset(coord, 0, sizeof(*coord));
6977 +}
6978 +
6979 +/* Return the number of units at the present item. Asserts coord_is_existing_item(). */
6980 +unsigned coord_num_units(const coord_t * coord)
6981 +{
6982 + assert("jmacd-9806", coord_is_existing_item(coord));
6983 +
6984 + return item_plugin_by_coord(coord)->b.nr_units(coord);
6985 +}
6986 +
6987 +/* Returns true if the coord was initializewd by coord_init_invalid (). */
6988 +/* Audited by: green(2002.06.15) */
6989 +int coord_is_invalid(const coord_t * coord)
6990 +{
6991 + return coord->between == INVALID_COORD;
6992 +}
6993 +
6994 +/* Returns true if the coordinate is positioned at an existing item, not before or after
6995 + an item. It may be placed at, before, or after any unit within the item, whether
6996 + existing or not. */
6997 +int coord_is_existing_item(const coord_t * coord)
6998 +{
6999 + switch (coord->between) {
7000 + case EMPTY_NODE:
7001 + case BEFORE_ITEM:
7002 + case AFTER_ITEM:
7003 + case INVALID_COORD:
7004 + return 0;
7005 +
7006 + case BEFORE_UNIT:
7007 + case AT_UNIT:
7008 + case AFTER_UNIT:
7009 + return coord->item_pos < coord_num_items(coord);
7010 + }
7011 +
7012 + impossible("jmacd-9900", "unreachable coord: %p", coord);
7013 + return 0;
7014 +}
7015 +
7016 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7017 + unit. */
7018 +/* Audited by: green(2002.06.15) */
7019 +int coord_is_existing_unit(const coord_t * coord)
7020 +{
7021 + switch (coord->between) {
7022 + case EMPTY_NODE:
7023 + case BEFORE_UNIT:
7024 + case AFTER_UNIT:
7025 + case BEFORE_ITEM:
7026 + case AFTER_ITEM:
7027 + case INVALID_COORD:
7028 + return 0;
7029 +
7030 + case AT_UNIT:
7031 + return (coord->item_pos < coord_num_items(coord)
7032 + && coord->unit_pos < coord_num_units(coord));
7033 + }
7034 +
7035 + impossible("jmacd-9902", "unreachable");
7036 + return 0;
7037 +}
7038 +
7039 +/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7040 + true for empty nodes nor coordinates positioned before the first item. */
7041 +/* Audited by: green(2002.06.15) */
7042 +int coord_is_leftmost_unit(const coord_t * coord)
7043 +{
7044 + return (coord->between == AT_UNIT && coord->item_pos == 0
7045 + && coord->unit_pos == 0);
7046 +}
7047 +
7048 +#if REISER4_DEBUG
7049 +/* For assertions only, checks for a valid coordinate. */
7050 +int coord_check(const coord_t * coord)
7051 +{
7052 + if (coord->node == NULL) {
7053 + return 0;
7054 + }
7055 + if (znode_above_root(coord->node))
7056 + return 1;
7057 +
7058 + switch (coord->between) {
7059 + default:
7060 + case INVALID_COORD:
7061 + return 0;
7062 + case EMPTY_NODE:
7063 + if (!node_is_empty(coord->node)) {
7064 + return 0;
7065 + }
7066 + return coord->item_pos == 0 && coord->unit_pos == 0;
7067 +
7068 + case BEFORE_UNIT:
7069 + case AFTER_UNIT:
7070 + if (node_is_empty(coord->node) && (coord->item_pos == 0)
7071 + && (coord->unit_pos == 0))
7072 + return 1;
7073 + case AT_UNIT:
7074 + break;
7075 + case AFTER_ITEM:
7076 + case BEFORE_ITEM:
7077 + /* before/after item should not set unit_pos. */
7078 + if (coord->unit_pos != 0) {
7079 + return 0;
7080 + }
7081 + break;
7082 + }
7083 +
7084 + if (coord->item_pos >= node_num_items(coord->node)) {
7085 + return 0;
7086 + }
7087 +
7088 + /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7089 + between is set either AFTER_ITEM or BEFORE_ITEM */
7090 + if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7091 + return 1;
7092 +
7093 + if (coord_is_iplug_set(coord) &&
7094 + coord->unit_pos >
7095 + item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7096 + return 0;
7097 + }
7098 + return 1;
7099 +}
7100 +#endif
7101 +
7102 +/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7103 + Returns 1 if the new position is does not exist. */
7104 +static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7105 +{
7106 + /* If the node is invalid, leave it. */
7107 + if (coord->between == INVALID_COORD) {
7108 + return 1;
7109 + }
7110 +
7111 + /* If the node is empty, set it appropriately. */
7112 + if (items == 0) {
7113 + coord->between = EMPTY_NODE;
7114 + coord_set_item_pos(coord, 0);
7115 + coord->unit_pos = 0;
7116 + return 1;
7117 + }
7118 +
7119 + /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7120 + if (coord->between == EMPTY_NODE) {
7121 + coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7122 + coord_set_item_pos(coord, 0);
7123 + coord->unit_pos = 0;
7124 + return 0;
7125 + }
7126 +
7127 + /* If the item_pos is out-of-range, set it appropriatly. */
7128 + if (coord->item_pos >= items) {
7129 + coord->between = AFTER_ITEM;
7130 + coord_set_item_pos(coord, items - 1);
7131 + coord->unit_pos = 0;
7132 + /* If is_next, return 1 (can't go any further). */
7133 + return is_next;
7134 + }
7135 +
7136 + return 0;
7137 +}
7138 +
7139 +/* Advances the coordinate by one unit to the right. If empty, no change. If
7140 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7141 + existing unit. */
7142 +int coord_next_unit(coord_t * coord)
7143 +{
7144 + unsigned items = coord_num_items(coord);
7145 +
7146 + if (coord_adjust_items(coord, items, 1) == 1) {
7147 + return 1;
7148 + }
7149 +
7150 + switch (coord->between) {
7151 + case BEFORE_UNIT:
7152 + /* Now it is positioned at the same unit. */
7153 + coord->between = AT_UNIT;
7154 + return 0;
7155 +
7156 + case AFTER_UNIT:
7157 + case AT_UNIT:
7158 + /* If it was at or after a unit and there are more units in this item,
7159 + advance to the next one. */
7160 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7161 + coord->unit_pos += 1;
7162 + coord->between = AT_UNIT;
7163 + return 0;
7164 + }
7165 +
7166 + /* Otherwise, it is crossing an item boundary and treated as if it was
7167 + after the current item. */
7168 + coord->between = AFTER_ITEM;
7169 + coord->unit_pos = 0;
7170 + /* FALLTHROUGH */
7171 +
7172 + case AFTER_ITEM:
7173 + /* Check for end-of-node. */
7174 + if (coord->item_pos == items - 1) {
7175 + return 1;
7176 + }
7177 +
7178 + coord_inc_item_pos(coord);
7179 + coord->unit_pos = 0;
7180 + coord->between = AT_UNIT;
7181 + return 0;
7182 +
7183 + case BEFORE_ITEM:
7184 + /* The adjust_items checks ensure that we are valid here. */
7185 + coord->unit_pos = 0;
7186 + coord->between = AT_UNIT;
7187 + return 0;
7188 +
7189 + case INVALID_COORD:
7190 + case EMPTY_NODE:
7191 + /* Handled in coord_adjust_items(). */
7192 + break;
7193 + }
7194 +
7195 + impossible("jmacd-9902", "unreachable");
7196 + return 0;
7197 +}
7198 +
7199 +/* Advances the coordinate by one item to the right. If empty, no change. If
7200 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7201 + an existing item. */
7202 +int coord_next_item(coord_t * coord)
7203 +{
7204 + unsigned items = coord_num_items(coord);
7205 +
7206 + if (coord_adjust_items(coord, items, 1) == 1) {
7207 + return 1;
7208 + }
7209 +
7210 + switch (coord->between) {
7211 + case AFTER_UNIT:
7212 + case AT_UNIT:
7213 + case BEFORE_UNIT:
7214 + case AFTER_ITEM:
7215 + /* Check for end-of-node. */
7216 + if (coord->item_pos == items - 1) {
7217 + coord->between = AFTER_ITEM;
7218 + coord->unit_pos = 0;
7219 + coord_clear_iplug(coord);
7220 + return 1;
7221 + }
7222 +
7223 + /* Anywhere in an item, go to the next one. */
7224 + coord->between = AT_UNIT;
7225 + coord_inc_item_pos(coord);
7226 + coord->unit_pos = 0;
7227 + return 0;
7228 +
7229 + case BEFORE_ITEM:
7230 + /* The out-of-range check ensures that we are valid here. */
7231 + coord->unit_pos = 0;
7232 + coord->between = AT_UNIT;
7233 + return 0;
7234 + case INVALID_COORD:
7235 + case EMPTY_NODE:
7236 + /* Handled in coord_adjust_items(). */
7237 + break;
7238 + }
7239 +
7240 + impossible("jmacd-9903", "unreachable");
7241 + return 0;
7242 +}
7243 +
7244 +/* Advances the coordinate by one unit to the left. If empty, no change. If
7245 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7246 + is an existing unit. */
7247 +int coord_prev_unit(coord_t * coord)
7248 +{
7249 + unsigned items = coord_num_items(coord);
7250 +
7251 + if (coord_adjust_items(coord, items, 0) == 1) {
7252 + return 1;
7253 + }
7254 +
7255 + switch (coord->between) {
7256 + case AT_UNIT:
7257 + case BEFORE_UNIT:
7258 + if (coord->unit_pos > 0) {
7259 + coord->unit_pos -= 1;
7260 + coord->between = AT_UNIT;
7261 + return 0;
7262 + }
7263 +
7264 + if (coord->item_pos == 0) {
7265 + coord->between = BEFORE_ITEM;
7266 + return 1;
7267 + }
7268 +
7269 + coord_dec_item_pos(coord);
7270 + coord->unit_pos = coord_last_unit_pos(coord);
7271 + coord->between = AT_UNIT;
7272 + return 0;
7273 +
7274 + case AFTER_UNIT:
7275 + /* What if unit_pos is out-of-range? */
7276 + assert("jmacd-5442",
7277 + coord->unit_pos <= coord_last_unit_pos(coord));
7278 + coord->between = AT_UNIT;
7279 + return 0;
7280 +
7281 + case BEFORE_ITEM:
7282 + if (coord->item_pos == 0) {
7283 + return 1;
7284 + }
7285 +
7286 + coord_dec_item_pos(coord);
7287 + /* FALLTHROUGH */
7288 +
7289 + case AFTER_ITEM:
7290 + coord->between = AT_UNIT;
7291 + coord->unit_pos = coord_last_unit_pos(coord);
7292 + return 0;
7293 +
7294 + case INVALID_COORD:
7295 + case EMPTY_NODE:
7296 + break;
7297 + }
7298 +
7299 + impossible("jmacd-9904", "unreachable");
7300 + return 0;
7301 +}
7302 +
7303 +/* Advances the coordinate by one item to the left. If empty, no change. If
7304 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7305 + is an existing item. */
7306 +int coord_prev_item(coord_t * coord)
7307 +{
7308 + unsigned items = coord_num_items(coord);
7309 +
7310 + if (coord_adjust_items(coord, items, 0) == 1) {
7311 + return 1;
7312 + }
7313 +
7314 + switch (coord->between) {
7315 + case AT_UNIT:
7316 + case AFTER_UNIT:
7317 + case BEFORE_UNIT:
7318 + case BEFORE_ITEM:
7319 +
7320 + if (coord->item_pos == 0) {
7321 + coord->between = BEFORE_ITEM;
7322 + coord->unit_pos = 0;
7323 + return 1;
7324 + }
7325 +
7326 + coord_dec_item_pos(coord);
7327 + coord->unit_pos = 0;
7328 + coord->between = AT_UNIT;
7329 + return 0;
7330 +
7331 + case AFTER_ITEM:
7332 + coord->between = AT_UNIT;
7333 + coord->unit_pos = 0;
7334 + return 0;
7335 +
7336 + case INVALID_COORD:
7337 + case EMPTY_NODE:
7338 + break;
7339 + }
7340 +
7341 + impossible("jmacd-9905", "unreachable");
7342 + return 0;
7343 +}
7344 +
7345 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7346 +void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7347 +{
7348 + assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7349 + if (dir == LEFT_SIDE) {
7350 + coord_init_first_unit(coord, node);
7351 + } else {
7352 + coord_init_last_unit(coord, node);
7353 + }
7354 +}
7355 +
7356 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7357 + argument. */
7358 +/* Audited by: green(2002.06.15) */
7359 +int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7360 +{
7361 + assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7362 + if (dir == LEFT_SIDE) {
7363 + return coord_is_before_leftmost(coord);
7364 + } else {
7365 + return coord_is_after_rightmost(coord);
7366 + }
7367 +}
7368 +
7369 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7370 +/* Audited by: green(2002.06.15) */
7371 +int coord_sideof_unit(coord_t * coord, sideof dir)
7372 +{
7373 + assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7374 + if (dir == LEFT_SIDE) {
7375 + return coord_prev_unit(coord);
7376 + } else {
7377 + return coord_next_unit(coord);
7378 + }
7379 +}
7380 +
7381 +#if REISER4_DEBUG
7382 +int coords_equal(const coord_t * c1, const coord_t * c2)
7383 +{
7384 + assert("nikita-2840", c1 != NULL);
7385 + assert("nikita-2841", c2 != NULL);
7386 +
7387 + return
7388 + c1->node == c2->node &&
7389 + c1->item_pos == c2->item_pos &&
7390 + c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7391 +}
7392 +#endif /* REISER4_DEBUG */
7393 +
7394 +/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost
7395 + return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */
7396 +/* Audited by: green(2002.06.15) */
7397 +coord_wrt_node coord_wrt(const coord_t * coord)
7398 +{
7399 + if (coord_is_before_leftmost(coord)) {
7400 + return COORD_ON_THE_LEFT;
7401 + }
7402 +
7403 + if (coord_is_after_rightmost(coord)) {
7404 + return COORD_ON_THE_RIGHT;
7405 + }
7406 +
7407 + return COORD_INSIDE;
7408 +}
7409 +
7410 +/* Returns true if the coordinate is positioned after the last item or after the last unit
7411 + of the last item or it is an empty node. */
7412 +/* Audited by: green(2002.06.15) */
7413 +int coord_is_after_rightmost(const coord_t * coord)
7414 +{
7415 + assert("jmacd-7313", coord_check(coord));
7416 +
7417 + switch (coord->between) {
7418 + case INVALID_COORD:
7419 + case AT_UNIT:
7420 + case BEFORE_UNIT:
7421 + case BEFORE_ITEM:
7422 + return 0;
7423 +
7424 + case EMPTY_NODE:
7425 + return 1;
7426 +
7427 + case AFTER_ITEM:
7428 + return (coord->item_pos == node_num_items(coord->node) - 1);
7429 +
7430 + case AFTER_UNIT:
7431 + return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7432 + coord->unit_pos == coord_last_unit_pos(coord));
7433 + }
7434 +
7435 + impossible("jmacd-9908", "unreachable");
7436 + return 0;
7437 +}
7438 +
7439 +/* Returns true if the coordinate is positioned before the first item or it is an empty
7440 + node. */
7441 +int coord_is_before_leftmost(const coord_t * coord)
7442 +{
7443 + /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7444 + necessary to check if coord is set before leftmost
7445 + assert ("jmacd-7313", coord_check (coord)); */
7446 + switch (coord->between) {
7447 + case INVALID_COORD:
7448 + case AT_UNIT:
7449 + case AFTER_ITEM:
7450 + case AFTER_UNIT:
7451 + return 0;
7452 +
7453 + case EMPTY_NODE:
7454 + return 1;
7455 +
7456 + case BEFORE_ITEM:
7457 + case BEFORE_UNIT:
7458 + return (coord->item_pos == 0) && (coord->unit_pos == 0);
7459 + }
7460 +
7461 + impossible("jmacd-9908", "unreachable");
7462 + return 0;
7463 +}
7464 +
7465 +/* Returns true if the coordinate is positioned after a item, before a item, after the
7466 + last unit of an item, before the first unit of an item, or at an empty node. */
7467 +/* Audited by: green(2002.06.15) */
7468 +int coord_is_between_items(const coord_t * coord)
7469 +{
7470 + assert("jmacd-7313", coord_check(coord));
7471 +
7472 + switch (coord->between) {
7473 + case INVALID_COORD:
7474 + case AT_UNIT:
7475 + return 0;
7476 +
7477 + case AFTER_ITEM:
7478 + case BEFORE_ITEM:
7479 + case EMPTY_NODE:
7480 + return 1;
7481 +
7482 + case BEFORE_UNIT:
7483 + return coord->unit_pos == 0;
7484 +
7485 + case AFTER_UNIT:
7486 + return coord->unit_pos == coord_last_unit_pos(coord);
7487 + }
7488 +
7489 + impossible("jmacd-9908", "unreachable");
7490 + return 0;
7491 +}
7492 +
7493 +#if REISER4_DEBUG
7494 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
7495 + before-after or item boundaries. */
7496 +int coord_are_neighbors(coord_t * c1, coord_t * c2)
7497 +{
7498 + coord_t *left;
7499 + coord_t *right;
7500 +
7501 + assert("nikita-1241", c1 != NULL);
7502 + assert("nikita-1242", c2 != NULL);
7503 + assert("nikita-1243", c1->node == c2->node);
7504 + assert("nikita-1244", coord_is_existing_unit(c1));
7505 + assert("nikita-1245", coord_is_existing_unit(c2));
7506 +
7507 + left = right = NULL;
7508 + switch (coord_compare(c1, c2)) {
7509 + case COORD_CMP_ON_LEFT:
7510 + left = c1;
7511 + right = c2;
7512 + break;
7513 + case COORD_CMP_ON_RIGHT:
7514 + left = c2;
7515 + right = c1;
7516 + break;
7517 + case COORD_CMP_SAME:
7518 + return 0;
7519 + default:
7520 + wrong_return_value("nikita-1246", "compare_coords()");
7521 + }
7522 + assert("vs-731", left && right);
7523 + if (left->item_pos == right->item_pos) {
7524 + return left->unit_pos + 1 == right->unit_pos;
7525 + } else if (left->item_pos + 1 == right->item_pos) {
7526 + return (left->unit_pos == coord_last_unit_pos(left))
7527 + && (right->unit_pos == 0);
7528 + } else {
7529 + return 0;
7530 + }
7531 +}
7532 +#endif /* REISER4_DEBUG */
7533 +
7534 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7535 + COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7536 +/* Audited by: green(2002.06.15) */
7537 +coord_cmp coord_compare(coord_t * c1, coord_t * c2)
7538 +{
7539 + assert("vs-209", c1->node == c2->node);
7540 + assert("vs-194", coord_is_existing_unit(c1)
7541 + && coord_is_existing_unit(c2));
7542 +
7543 + if (c1->item_pos > c2->item_pos)
7544 + return COORD_CMP_ON_RIGHT;
7545 + if (c1->item_pos < c2->item_pos)
7546 + return COORD_CMP_ON_LEFT;
7547 + if (c1->unit_pos > c2->unit_pos)
7548 + return COORD_CMP_ON_RIGHT;
7549 + if (c1->unit_pos < c2->unit_pos)
7550 + return COORD_CMP_ON_LEFT;
7551 + return COORD_CMP_SAME;
7552 +}
7553 +
7554 +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
7555 + non-zero if there is no position to the right. */
7556 +int coord_set_to_right(coord_t * coord)
7557 +{
7558 + unsigned items = coord_num_items(coord);
7559 +
7560 + if (coord_adjust_items(coord, items, 1) == 1) {
7561 + return 1;
7562 + }
7563 +
7564 + switch (coord->between) {
7565 + case AT_UNIT:
7566 + return 0;
7567 +
7568 + case BEFORE_ITEM:
7569 + case BEFORE_UNIT:
7570 + coord->between = AT_UNIT;
7571 + return 0;
7572 +
7573 + case AFTER_UNIT:
7574 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7575 + coord->unit_pos += 1;
7576 + coord->between = AT_UNIT;
7577 + return 0;
7578 + } else {
7579 +
7580 + coord->unit_pos = 0;
7581 +
7582 + if (coord->item_pos == items - 1) {
7583 + coord->between = AFTER_ITEM;
7584 + return 1;
7585 + }
7586 +
7587 + coord_inc_item_pos(coord);
7588 + coord->between = AT_UNIT;
7589 + return 0;
7590 + }
7591 +
7592 + case AFTER_ITEM:
7593 + if (coord->item_pos == items - 1) {
7594 + return 1;
7595 + }
7596 +
7597 + coord_inc_item_pos(coord);
7598 + coord->unit_pos = 0;
7599 + coord->between = AT_UNIT;
7600 + return 0;
7601 +
7602 + case EMPTY_NODE:
7603 + return 1;
7604 +
7605 + case INVALID_COORD:
7606 + break;
7607 + }
7608 +
7609 + impossible("jmacd-9920", "unreachable");
7610 + return 0;
7611 +}
7612 +
7613 +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
7614 + non-zero if there is no position to the left. */
7615 +int coord_set_to_left(coord_t * coord)
7616 +{
7617 + unsigned items = coord_num_items(coord);
7618 +
7619 + if (coord_adjust_items(coord, items, 0) == 1) {
7620 + return 1;
7621 + }
7622 +
7623 + switch (coord->between) {
7624 + case AT_UNIT:
7625 + return 0;
7626 +
7627 + case AFTER_UNIT:
7628 + coord->between = AT_UNIT;
7629 + return 0;
7630 +
7631 + case AFTER_ITEM:
7632 + coord->between = AT_UNIT;
7633 + coord->unit_pos = coord_last_unit_pos(coord);
7634 + return 0;
7635 +
7636 + case BEFORE_UNIT:
7637 + if (coord->unit_pos > 0) {
7638 + coord->unit_pos -= 1;
7639 + coord->between = AT_UNIT;
7640 + return 0;
7641 + } else {
7642 +
7643 + if (coord->item_pos == 0) {
7644 + coord->between = BEFORE_ITEM;
7645 + return 1;
7646 + }
7647 +
7648 + coord->unit_pos = coord_last_unit_pos(coord);
7649 + coord_dec_item_pos(coord);
7650 + coord->between = AT_UNIT;
7651 + return 0;
7652 + }
7653 +
7654 + case BEFORE_ITEM:
7655 + if (coord->item_pos == 0) {
7656 + return 1;
7657 + }
7658 +
7659 + coord_dec_item_pos(coord);
7660 + coord->unit_pos = coord_last_unit_pos(coord);
7661 + coord->between = AT_UNIT;
7662 + return 0;
7663 +
7664 + case EMPTY_NODE:
7665 + return 1;
7666 +
7667 + case INVALID_COORD:
7668 + break;
7669 + }
7670 +
7671 + impossible("jmacd-9920", "unreachable");
7672 + return 0;
7673 +}
7674 +
7675 +static const char *coord_tween_tostring(between_enum n)
7676 +{
7677 + switch (n) {
7678 + case BEFORE_UNIT:
7679 + return "before unit";
7680 + case BEFORE_ITEM:
7681 + return "before item";
7682 + case AT_UNIT:
7683 + return "at unit";
7684 + case AFTER_UNIT:
7685 + return "after unit";
7686 + case AFTER_ITEM:
7687 + return "after item";
7688 + case EMPTY_NODE:
7689 + return "empty node";
7690 + case INVALID_COORD:
7691 + return "invalid";
7692 + default:
7693 + {
7694 + static char buf[30];
7695 +
7696 + sprintf(buf, "unknown: %i", n);
7697 + return buf;
7698 + }
7699 + }
7700 +}
7701 +
7702 +void print_coord(const char *mes, const coord_t * coord, int node)
7703 +{
7704 + if (coord == NULL) {
7705 + printk("%s: null\n", mes);
7706 + return;
7707 + }
7708 + printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7709 + mes, coord->item_pos, coord->unit_pos,
7710 + coord_tween_tostring(coord->between), coord->iplugid);
7711 +}
7712 +
7713 +int
7714 +item_utmost_child_real_block(const coord_t * coord, sideof side,
7715 + reiser4_block_nr * blk)
7716 +{
7717 + return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7718 + side,
7719 + blk);
7720 +}
7721 +
7722 +int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
7723 +{
7724 + return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
7725 +}
7726 +
7727 +/* @count bytes of flow @f got written, update correspondingly f->length,
7728 + f->data and f->key */
7729 +void move_flow_forward(flow_t * f, unsigned count)
7730 +{
7731 + if (f->data)
7732 + f->data += count;
7733 + f->length -= count;
7734 + set_key_offset(&f->key, get_key_offset(&f->key) + count);
7735 +}
7736 +
7737 +/*
7738 + Local variables:
7739 + c-indentation-style: "K&R"
7740 + mode-name: "LC"
7741 + c-basic-offset: 8
7742 + tab-width: 8
7743 + fill-column: 120
7744 + scroll-step: 1
7745 + End:
7746 +*/
7747 diff -urN linux-2.6.20.orig/fs/reiser4/coord.h linux-2.6.20/fs/reiser4/coord.h
7748 --- linux-2.6.20.orig/fs/reiser4/coord.h 1970-01-01 03:00:00.000000000 +0300
7749 +++ linux-2.6.20/fs/reiser4/coord.h 2007-05-06 14:50:43.698975725 +0400
7750 @@ -0,0 +1,389 @@
7751 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7752 +
7753 +/* Coords */
7754 +
7755 +#if !defined( __REISER4_COORD_H__ )
7756 +#define __REISER4_COORD_H__
7757 +
7758 +#include "forward.h"
7759 +#include "debug.h"
7760 +#include "dformat.h"
7761 +#include "key.h"
7762 +
7763 +/* insertions happen between coords in the tree, so we need some means
7764 + of specifying the sense of betweenness. */
7765 +typedef enum {
7766 + BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */
7767 + AT_UNIT,
7768 + AFTER_UNIT,
7769 + BEFORE_ITEM,
7770 + AFTER_ITEM,
7771 + INVALID_COORD,
7772 + EMPTY_NODE,
7773 +} between_enum;
7774 +
7775 +/* location of coord w.r.t. its node */
7776 +typedef enum {
7777 + COORD_ON_THE_LEFT = -1,
7778 + COORD_ON_THE_RIGHT = +1,
7779 + COORD_INSIDE = 0
7780 +} coord_wrt_node;
7781 +
7782 +typedef enum {
7783 + COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
7784 +} coord_cmp;
7785 +
7786 +struct coord {
7787 + /* node in a tree */
7788 + /* 0 */ znode *node;
7789 +
7790 + /* position of item within node */
7791 + /* 4 */ pos_in_node_t item_pos;
7792 + /* position of unit within item */
7793 + /* 6 */ pos_in_node_t unit_pos;
7794 + /* optimization: plugin of item is stored in coord_t. Until this was
7795 + implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
7796 + is invalidated (set to 0xff) on each modification of ->item_pos,
7797 + and all such modifications are funneled through coord_*_item_pos()
7798 + functions below.
7799 + */
7800 + /* 8 */ char iplugid;
7801 + /* position of coord w.r.t. to neighboring items and/or units.
7802 + Values are taken from &between_enum above.
7803 + */
7804 + /* 9 */ char between;
7805 + /* padding. It will be added by the compiler anyway to conform to the
7806 + * C language alignment requirements. We keep it here to be on the
7807 + * safe side and to have a clear picture of the memory layout of this
7808 + * structure. */
7809 + /* 10 */ __u16 pad;
7810 + /* 12 */ int offset;
7811 +#if REISER4_DEBUG
7812 + unsigned long plug_v;
7813 + unsigned long body_v;
7814 +#endif
7815 +};
7816 +
7817 +#define INVALID_PLUGID ((char)((1 << 8) - 1))
7818 +#define INVALID_OFFSET -1
7819 +
7820 +static inline void coord_clear_iplug(coord_t * coord)
7821 +{
7822 + assert("nikita-2835", coord != NULL);
7823 + coord->iplugid = INVALID_PLUGID;
7824 + coord->offset = INVALID_OFFSET;
7825 +}
7826 +
7827 +static inline int coord_is_iplug_set(const coord_t * coord)
7828 +{
7829 + assert("nikita-2836", coord != NULL);
7830 + return coord->iplugid != INVALID_PLUGID;
7831 +}
7832 +
7833 +static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
7834 +{
7835 + assert("nikita-2478", coord != NULL);
7836 + coord->item_pos = pos;
7837 + coord_clear_iplug(coord);
7838 +}
7839 +
7840 +static inline void coord_dec_item_pos(coord_t * coord)
7841 +{
7842 + assert("nikita-2480", coord != NULL);
7843 + --coord->item_pos;
7844 + coord_clear_iplug(coord);
7845 +}
7846 +
7847 +static inline void coord_inc_item_pos(coord_t * coord)
7848 +{
7849 + assert("nikita-2481", coord != NULL);
7850 + ++coord->item_pos;
7851 + coord_clear_iplug(coord);
7852 +}
7853 +
7854 +static inline void coord_add_item_pos(coord_t * coord, int delta)
7855 +{
7856 + assert("nikita-2482", coord != NULL);
7857 + coord->item_pos += delta;
7858 + coord_clear_iplug(coord);
7859 +}
7860 +
7861 +static inline void coord_invalid_item_pos(coord_t * coord)
7862 +{
7863 + assert("nikita-2832", coord != NULL);
7864 + coord->item_pos = (unsigned short)~0;
7865 + coord_clear_iplug(coord);
7866 +}
7867 +
7868 +/* Reverse a direction. */
7869 +static inline sideof sideof_reverse(sideof side)
7870 +{
7871 + return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
7872 +}
7873 +
7874 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
7875 +
7876 + "first" and "last"
7877 + "next" and "prev"
7878 + "before" and "after"
7879 + "leftmost" and "rightmost"
7880 +
7881 + But I think the chosen names are decent the way they are.
7882 +*/
7883 +
7884 +/* COORD INITIALIZERS */
7885 +
7886 +/* Initialize an invalid coordinate. */
7887 +extern void coord_init_invalid(coord_t * coord, const znode * node);
7888 +
7889 +extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
7890 +
7891 +/* Initialize a coordinate to point at the first unit of the first item. If the node is
7892 + empty, it is positioned at the EMPTY_NODE. */
7893 +extern void coord_init_first_unit(coord_t * coord, const znode * node);
7894 +
7895 +/* Initialize a coordinate to point at the last unit of the last item. If the node is
7896 + empty, it is positioned at the EMPTY_NODE. */
7897 +extern void coord_init_last_unit(coord_t * coord, const znode * node);
7898 +
7899 +/* Initialize a coordinate to before the first item. If the node is empty, it is
7900 + positioned at the EMPTY_NODE. */
7901 +extern void coord_init_before_first_item(coord_t * coord, const znode * node);
7902 +
7903 +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7904 + at the EMPTY_NODE. */
7905 +extern void coord_init_after_last_item(coord_t * coord, const znode * node);
7906 +
7907 +/* Initialize a coordinate to after last unit in the item. Coord must be set
7908 + already to existing item */
7909 +void coord_init_after_item_end(coord_t * coord);
7910 +
7911 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7912 +void coord_init_before_item(coord_t *);
7913 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7914 +void coord_init_after_item(coord_t *);
7915 +
7916 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7917 +extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
7918 + sideof dir);
7919 +
7920 +/* Initialize a coordinate by 0s. Used in places where init_coord was used and
7921 + it was not clear how actually
7922 + FIXME-VS: added by vs (2002, june, 8) */
7923 +extern void coord_init_zero(coord_t * coord);
7924 +
7925 +/* COORD METHODS */
7926 +
7927 +/* after shifting of node content, coord previously set properly may become
7928 + invalid, try to "normalize" it. */
7929 +void coord_normalize(coord_t * coord);
7930 +
7931 +/* Copy a coordinate. */
7932 +extern void coord_dup(coord_t * coord, const coord_t * old_coord);
7933 +
7934 +/* Copy a coordinate without check. */
7935 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
7936 +
7937 +unsigned coord_num_units(const coord_t * coord);
7938 +
7939 +/* Return the last valid unit number at the present item (i.e.,
7940 + coord_num_units() - 1). */
7941 +static inline unsigned coord_last_unit_pos(const coord_t * coord)
7942 +{
7943 + return coord_num_units(coord) - 1;
7944 +}
7945 +
7946 +#if REISER4_DEBUG
7947 +/* For assertions only, checks for a valid coordinate. */
7948 +extern int coord_check(const coord_t * coord);
7949 +
7950 +extern unsigned long znode_times_locked(const znode * z);
7951 +
7952 +static inline void coord_update_v(coord_t * coord)
7953 +{
7954 + coord->plug_v = coord->body_v = znode_times_locked(coord->node);
7955 +}
7956 +#endif
7957 +
7958 +extern int coords_equal(const coord_t * c1, const coord_t * c2);
7959 +
7960 +extern void print_coord(const char *mes, const coord_t * coord, int print_node);
7961 +
7962 +/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost
7963 + return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */
7964 +extern coord_wrt_node coord_wrt(const coord_t * coord);
7965 +
7966 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
7967 + before-after or item boundaries. */
7968 +extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
7969 +
7970 +/* Assuming two coordinates are positioned in the same node, return NCOORD_CMP_ON_RIGHT,
7971 + NCOORD_CMP_ON_LEFT, or NCOORD_CMP_SAME depending on c1's position relative to c2. */
7972 +extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
7973 +
7974 +/* COORD PREDICATES */
7975 +
7976 +/* Returns true if the coord was initializewd by coord_init_invalid (). */
7977 +extern int coord_is_invalid(const coord_t * coord);
7978 +
7979 +/* Returns true if the coordinate is positioned at an existing item, not before or after
7980 + an item. It may be placed at, before, or after any unit within the item, whether
7981 + existing or not. If this is true you can call methods of the item plugin. */
7982 +extern int coord_is_existing_item(const coord_t * coord);
7983 +
7984 +/* Returns true if the coordinate is positioned after a item, before a item, after the
7985 + last unit of an item, before the first unit of an item, or at an empty node. */
7986 +extern int coord_is_between_items(const coord_t * coord);
7987 +
7988 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7989 + unit. */
7990 +extern int coord_is_existing_unit(const coord_t * coord);
7991 +
7992 +/* Returns true if the coordinate is positioned at an empty node. */
7993 +extern int coord_is_empty(const coord_t * coord);
7994 +
7995 +/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7996 + true for empty nodes nor coordinates positioned before the first item. */
7997 +extern int coord_is_leftmost_unit(const coord_t * coord);
7998 +
7999 +/* Returns true if the coordinate is positioned after the last item or after the last unit
8000 + of the last item or it is an empty node. */
8001 +extern int coord_is_after_rightmost(const coord_t * coord);
8002 +
8003 +/* Returns true if the coordinate is positioned before the first item or it is an empty
8004 + node. */
8005 +extern int coord_is_before_leftmost(const coord_t * coord);
8006 +
8007 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8008 + argument. */
8009 +extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8010 +
8011 +/* COORD MODIFIERS */
8012 +
8013 +/* Advances the coordinate by one unit to the right. If empty, no change. If
8014 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8015 + an existing unit. */
8016 +extern int coord_next_unit(coord_t * coord);
8017 +
8018 +/* Advances the coordinate by one item to the right. If empty, no change. If
8019 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8020 + an existing item. */
8021 +extern int coord_next_item(coord_t * coord);
8022 +
8023 +/* Advances the coordinate by one unit to the left. If empty, no change. If
8024 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8025 + is an existing unit. */
8026 +extern int coord_prev_unit(coord_t * coord);
8027 +
8028 +/* Advances the coordinate by one item to the left. If empty, no change. If
8029 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8030 + is an existing item. */
8031 +extern int coord_prev_item(coord_t * coord);
8032 +
8033 +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8034 + non-zero if there is no position to the right. */
8035 +extern int coord_set_to_right(coord_t * coord);
8036 +
8037 +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8038 + non-zero if there is no position to the left. */
8039 +extern int coord_set_to_left(coord_t * coord);
8040 +
8041 +/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8042 + and non-zero if the unit did not exist. */
8043 +extern int coord_set_after_unit(coord_t * coord);
8044 +
8045 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8046 +extern int coord_sideof_unit(coord_t * coord, sideof dir);
8047 +
8048 +/* iterate over all units in @node */
8049 +#define for_all_units( coord, node ) \
8050 + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8051 + coord_next_unit( coord ) == 0 ; )
8052 +
8053 +/* iterate over all items in @node */
8054 +#define for_all_items( coord, node ) \
8055 + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8056 + coord_next_item( coord ) == 0 ; )
8057 +
8058 +/* COORD/ITEM METHODS */
8059 +
8060 +extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8061 + reiser4_block_nr * blk);
8062 +extern int item_utmost_child(const coord_t * coord, sideof side,
8063 + jnode ** child);
8064 +
8065 +/* a flow is a sequence of bytes being written to or read from the tree. The
8066 + tree will slice the flow into items while storing it into nodes, but all of
8067 + that is hidden from anything outside the tree. */
8068 +
8069 +struct flow {
8070 + reiser4_key key; /* key of start of flow's sequence of bytes */
8071 + loff_t length; /* length of flow's sequence of bytes */
8072 + char *data; /* start of flow's sequence of bytes */
8073 + int user; /* if 1 data is user space, 0 - kernel space */
8074 + rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8075 +};
8076 +
8077 +void move_flow_forward(flow_t * f, unsigned count);
8078 +
8079 +/* &reiser4_item_data - description of data to be inserted or pasted
8080 +
8081 + Q: articulate the reasons for the difference between this and flow.
8082 +
8083 + A: Becides flow we insert into tree other things: stat data, directory
8084 + entry, etc. To insert them into tree one has to provide this structure. If
8085 + one is going to insert flow - he can use insert_flow, where this structure
8086 + does not have to be created
8087 +*/
8088 +struct reiser4_item_data {
8089 + /* actual data to be inserted. If NULL, ->create_item() will not
8090 + do xmemcpy itself, leaving this up to the caller. This can
8091 + save some amount of unnecessary memory copying, for example,
8092 + during insertion of stat data.
8093 +
8094 + */
8095 + char *data;
8096 + /* 1 if 'char * data' contains pointer to user space and 0 if it is
8097 + kernel space */
8098 + int user;
8099 + /* amount of data we are going to insert or paste */
8100 + int length;
8101 + /* "Arg" is opaque data that is passed down to the
8102 + ->create_item() method of node layout, which in turn
8103 + hands it to the ->create_hook() of item being created. This
8104 + arg is currently used by:
8105 +
8106 + . ->create_hook() of internal item
8107 + (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8108 + . ->paste() method of directory item.
8109 + . ->create_hook() of extent item
8110 +
8111 + For internal item, this is left "brother" of new node being
8112 + inserted and it is used to add new node into sibling list
8113 + after parent to it was just inserted into parent.
8114 +
8115 + While ->arg does look somewhat of unnecessary compication,
8116 + it actually saves a lot of headache in many places, because
8117 + all data necessary to insert or paste new data into tree are
8118 + collected in one place, and this eliminates a lot of extra
8119 + argument passing and storing everywhere.
8120 +
8121 + */
8122 + void *arg;
8123 + /* plugin of item we are inserting */
8124 + item_plugin *iplug;
8125 +};
8126 +
8127 +/* __REISER4_COORD_H__ */
8128 +#endif
8129 +
8130 +/* Make Linus happy.
8131 + Local variables:
8132 + c-indentation-style: "K&R"
8133 + mode-name: "LC"
8134 + c-basic-offset: 8
8135 + tab-width: 8
8136 + fill-column: 120
8137 + scroll-step: 1
8138 + End:
8139 +*/
8140 diff -urN linux-2.6.20.orig/fs/reiser4/debug.c linux-2.6.20/fs/reiser4/debug.c
8141 --- linux-2.6.20.orig/fs/reiser4/debug.c 1970-01-01 03:00:00.000000000 +0300
8142 +++ linux-2.6.20/fs/reiser4/debug.c 2007-05-06 14:50:43.702976975 +0400
8143 @@ -0,0 +1,308 @@
8144 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8145 + * reiser4/README */
8146 +
8147 +/* Debugging facilities. */
8148 +
8149 +/*
8150 + * This file contains generic debugging functions used by reiser4. Roughly
8151 + * following:
8152 + *
8153 + * panicking: reiser4_do_panic(), reiser4_print_prefix().
8154 + *
8155 + * locking:
8156 + * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8157 + * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8158 + *
8159 + * error code monitoring (see comment before RETERR macro):
8160 + * reiser4_return_err(), reiser4_report_err().
8161 + *
8162 + * stack back-tracing: fill_backtrace()
8163 + *
8164 + * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8165 + * reiser4_debugtrap().
8166 + *
8167 + */
8168 +
8169 +#include "reiser4.h"
8170 +#include "context.h"
8171 +#include "super.h"
8172 +#include "txnmgr.h"
8173 +#include "znode.h"
8174 +
8175 +#include <linux/sysfs.h>
8176 +#include <linux/slab.h>
8177 +#include <linux/types.h>
8178 +#include <linux/fs.h>
8179 +#include <linux/spinlock.h>
8180 +#include <linux/kallsyms.h>
8181 +#include <linux/vmalloc.h>
8182 +#include <linux/ctype.h>
8183 +#include <linux/sysctl.h>
8184 +#include <linux/hardirq.h>
8185 +
8186 +#if 0
8187 +#if REISER4_DEBUG
8188 +static void reiser4_report_err(void);
8189 +#else
8190 +#define reiser4_report_err() noop
8191 +#endif
8192 +#endif /* 0 */
8193 +
8194 +/*
8195 + * global buffer where message given to reiser4_panic is formatted.
8196 + */
8197 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8198 +
8199 +/*
8200 + * lock protecting consistency of panic_buf under concurrent panics
8201 + */
8202 +static DEFINE_SPINLOCK(panic_guard);
8203 +
8204 +/* Your best friend. Call it on each occasion. This is called by
8205 + fs/reiser4/debug.h:reiser4_panic(). */
8206 +void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8207 +{
8208 + static int in_panic = 0;
8209 + va_list args;
8210 +
8211 + /*
8212 + * check for recursive panic.
8213 + */
8214 + if (in_panic == 0) {
8215 + in_panic = 1;
8216 +
8217 + spin_lock(&panic_guard);
8218 + va_start(args, format);
8219 + vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8220 + va_end(args);
8221 + printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8222 + spin_unlock(&panic_guard);
8223 +
8224 + /*
8225 + * if kernel debugger is configured---drop in. Early dropping
8226 + * into kgdb is not always convenient, because panic message
8227 + * is not yet printed most of the times. But:
8228 + *
8229 + * (1) message can be extracted from printk_buf[]
8230 + * (declared static inside of printk()), and
8231 + *
8232 + * (2) sometimes serial/kgdb combo dies while printing
8233 + * long panic message, so it's more prudent to break into
8234 + * debugger earlier.
8235 + *
8236 + */
8237 + DEBUGON(1);
8238 + }
8239 + /* to make gcc happy about noreturn attribute */
8240 + panic("%s", panic_buf);
8241 +}
8242 +
8243 +#if 0
8244 +void
8245 +reiser4_print_prefix(const char *level, int reperr, const char *mid,
8246 + const char *function, const char *file, int lineno)
8247 +{
8248 + const char *comm;
8249 + int pid;
8250 +
8251 + if (unlikely(in_interrupt() || in_irq())) {
8252 + comm = "interrupt";
8253 + pid = 0;
8254 + } else {
8255 + comm = current->comm;
8256 + pid = current->pid;
8257 + }
8258 + printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8259 + level, comm, pid, function, file, lineno, mid);
8260 + if (reperr)
8261 + reiser4_report_err();
8262 +}
8263 +#endif /* 0 */
8264 +
8265 +/* Preemption point: this should be called periodically during long running
8266 + operations (carry, allocate, and squeeze are best examples) */
8267 +int reiser4_preempt_point(void)
8268 +{
8269 + assert("nikita-3008", reiser4_schedulable());
8270 + cond_resched();
8271 + return signal_pending(current);
8272 +}
8273 +
8274 +#if REISER4_DEBUG
8275 +/* Debugging aid: return struct where information about locks taken by current
8276 + thread is accumulated. This can be used to formulate lock ordering
8277 + constraints and various assertions.
8278 +
8279 +*/
8280 +reiser4_lock_counters_info *reiser4_lock_counters(void)
8281 +{
8282 + reiser4_context *ctx = get_current_context();
8283 + assert("jmacd-1123", ctx != NULL);
8284 + return &ctx->locks;
8285 +}
8286 +
8287 +/*
8288 + * print human readable information about locks held by the reiser4 context.
8289 + */
8290 +static void print_lock_counters(const char *prefix,
8291 + const reiser4_lock_counters_info * info)
8292 +{
8293 + printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8294 + "jload: %i, "
8295 + "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8296 + "ktxnmgrd: %i, fq: %i\n"
8297 + "inode: %i, "
8298 + "cbk_cache: %i (r:%i,w%i), "
8299 + "eflush: %i, "
8300 + "zlock: %i,\n"
8301 + "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8302 + "d: %i, x: %i, t: %i\n", prefix,
8303 + info->spin_locked_jnode,
8304 + info->rw_locked_tree, info->read_locked_tree,
8305 + info->write_locked_tree,
8306 + info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8307 + info->spin_locked_jload,
8308 + info->spin_locked_txnh,
8309 + info->spin_locked_atom, info->spin_locked_stack,
8310 + info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8311 + info->spin_locked_fq,
8312 + info->spin_locked_inode,
8313 + info->rw_locked_cbk_cache,
8314 + info->read_locked_cbk_cache,
8315 + info->write_locked_cbk_cache,
8316 + info->spin_locked_super_eflush,
8317 + info->spin_locked_zlock,
8318 + info->spin_locked,
8319 + info->long_term_locked_znode,
8320 + info->inode_sem_r, info->inode_sem_w,
8321 + info->d_refs, info->x_refs, info->t_refs);
8322 +}
8323 +
8324 +/* check that no spinlocks are held */
8325 +int reiser4_schedulable(void)
8326 +{
8327 + if (get_current_context_check() != NULL) {
8328 + if (!LOCK_CNT_NIL(spin_locked)) {
8329 + print_lock_counters("in atomic", reiser4_lock_counters());
8330 + return 0;
8331 + }
8332 + }
8333 + might_sleep();
8334 + return 1;
8335 +}
8336 +/*
8337 + * return true, iff no locks are held.
8338 + */
8339 +int reiser4_no_counters_are_held(void)
8340 +{
8341 + reiser4_lock_counters_info *counters;
8342 +
8343 + counters = reiser4_lock_counters();
8344 + return
8345 + (counters->spin_locked_zlock == 0) &&
8346 + (counters->spin_locked_jnode == 0) &&
8347 + (counters->rw_locked_tree == 0) &&
8348 + (counters->read_locked_tree == 0) &&
8349 + (counters->write_locked_tree == 0) &&
8350 + (counters->rw_locked_dk == 0) &&
8351 + (counters->read_locked_dk == 0) &&
8352 + (counters->write_locked_dk == 0) &&
8353 + (counters->spin_locked_txnh == 0) &&
8354 + (counters->spin_locked_atom == 0) &&
8355 + (counters->spin_locked_stack == 0) &&
8356 + (counters->spin_locked_txnmgr == 0) &&
8357 + (counters->spin_locked_inode == 0) &&
8358 + (counters->spin_locked == 0) &&
8359 + (counters->long_term_locked_znode == 0) &&
8360 + (counters->inode_sem_r == 0) &&
8361 + (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8362 +}
8363 +
8364 +/*
8365 + * return true, iff transaction commit can be done under locks held by the
8366 + * current thread.
8367 + */
8368 +int reiser4_commit_check_locks(void)
8369 +{
8370 + reiser4_lock_counters_info *counters;
8371 + int inode_sem_r;
8372 + int inode_sem_w;
8373 + int result;
8374 +
8375 + /*
8376 + * inode's read/write semaphore is the only reiser4 lock that can be
8377 + * held during commit.
8378 + */
8379 +
8380 + counters = reiser4_lock_counters();
8381 + inode_sem_r = counters->inode_sem_r;
8382 + inode_sem_w = counters->inode_sem_w;
8383 +
8384 + counters->inode_sem_r = counters->inode_sem_w = 0;
8385 + result = reiser4_no_counters_are_held();
8386 + counters->inode_sem_r = inode_sem_r;
8387 + counters->inode_sem_w = inode_sem_w;
8388 + return result;
8389 +}
8390 +
8391 +/*
8392 + * fill "error site" in the current reiser4 context. See comment before RETERR
8393 + * macro for more details.
8394 + */
8395 +void reiser4_return_err(int code, const char *file, int line)
8396 +{
8397 + if (code < 0 && is_in_reiser4_context()) {
8398 + reiser4_context *ctx = get_current_context();
8399 +
8400 + if (ctx != NULL) {
8401 + ctx->err.code = code;
8402 + ctx->err.file = file;
8403 + ctx->err.line = line;
8404 + }
8405 + }
8406 +}
8407 +
8408 +#if 0
8409 +/*
8410 + * report error information recorded by reiser4_return_err().
8411 + */
8412 +static void reiser4_report_err(void)
8413 +{
8414 + reiser4_context *ctx = get_current_context_check();
8415 +
8416 + if (ctx != NULL) {
8417 + if (ctx->err.code != 0) {
8418 + printk("code: %i at %s:%i\n",
8419 + ctx->err.code, ctx->err.file, ctx->err.line);
8420 + }
8421 + }
8422 +}
8423 +#endif /* 0 */
8424 +
8425 +#endif /* REISER4_DEBUG */
8426 +
8427 +#if KERNEL_DEBUGGER
8428 +
8429 +/*
8430 + * this functions just drops into kernel debugger. It is a convenient place to
8431 + * put breakpoint in.
8432 + */
8433 +void reiser4_debugtrap(void)
8434 +{
8435 + /* do nothing. Put break point here. */
8436 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8437 + extern void breakpoint(void);
8438 + breakpoint();
8439 +#endif
8440 +}
8441 +#endif
8442 +
8443 +/* Make Linus happy.
8444 + Local variables:
8445 + c-indentation-style: "K&R"
8446 + mode-name: "LC"
8447 + c-basic-offset: 8
8448 + tab-width: 8
8449 + fill-column: 120
8450 + End:
8451 +*/
8452 diff -urN linux-2.6.20.orig/fs/reiser4/debug.h linux-2.6.20/fs/reiser4/debug.h
8453 --- linux-2.6.20.orig/fs/reiser4/debug.h 1970-01-01 03:00:00.000000000 +0300
8454 +++ linux-2.6.20/fs/reiser4/debug.h 2007-05-06 14:50:43.702976975 +0400
8455 @@ -0,0 +1,350 @@
8456 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8457 +
8458 +/* Declarations of debug macros. */
8459 +
8460 +#if !defined( __FS_REISER4_DEBUG_H__ )
8461 +#define __FS_REISER4_DEBUG_H__
8462 +
8463 +#include "forward.h"
8464 +#include "reiser4.h"
8465 +
8466 +/* generic function to produce formatted output, decorating it with
8467 + whatever standard prefixes/postfixes we want. "Fun" is a function
8468 + that will be actually called, can be printk, panic etc.
8469 + This is for use by other debugging macros, not by users. */
8470 +#define DCALL(lev, fun, reperr, label, format, ...) \
8471 +({ \
8472 + fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8473 + current->comm, current->pid, __FUNCTION__, \
8474 + __FILE__, __LINE__, label, ## __VA_ARGS__); \
8475 +})
8476 +
8477 +/*
8478 + * cause kernel to crash
8479 + */
8480 +#define reiser4_panic(mid, format, ...) \
8481 + DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8482 +
8483 +/* print message with indication of current process, file, line and
8484 + function */
8485 +#define reiser4_log(label, format, ...) \
8486 + DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8487 +
8488 +/* Assertion checked during compilation.
8489 + If "cond" is false (0) we get duplicate case label in switch.
8490 + Use this to check something like famous
8491 + cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8492 + in 3.x journal.c. If cassertion fails you get compiler error,
8493 + so no "maintainer-id".
8494 +*/
8495 +#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
8496 +
8497 +#define noop do {;} while(0)
8498 +
8499 +#if REISER4_DEBUG
8500 +/* version of info that only actually prints anything when _d_ebugging
8501 + is on */
8502 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8503 +/* macro to catch logical errors. Put it into `default' clause of
8504 + switch() statement. */
8505 +#define impossible(label, format, ...) \
8506 + reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8507 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
8508 + called. Use this for checking logical consistency and _never_ call
8509 + this to check correctness of external data: disk blocks and user-input . */
8510 +#define assert(label, cond) \
8511 +({ \
8512 + /* call_on_each_assert(); */ \
8513 + if (cond) { \
8514 + /* put negated check to avoid using !(cond) that would lose \
8515 + * warnings for things like assert(a = b); */ \
8516 + ; \
8517 + } else { \
8518 + DEBUGON(1); \
8519 + reiser4_panic(label, "assertion failed: %s", #cond); \
8520 + } \
8521 +})
8522 +
8523 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8524 +#define check_me( label, expr ) assert( label, ( expr ) )
8525 +
8526 +#define ON_DEBUG( exp ) exp
8527 +
8528 +extern int reiser4_schedulable(void);
8529 +extern void call_on_each_assert(void);
8530 +
8531 +#else
8532 +
8533 +#define dinfo( format, args... ) noop
8534 +#define impossible( label, format, args... ) noop
8535 +#define assert( label, cond ) noop
8536 +#define check_me( label, expr ) ( ( void ) ( expr ) )
8537 +#define ON_DEBUG( exp )
8538 +#define reiser4_schedulable() might_sleep()
8539 +
8540 +/* REISER4_DEBUG */
8541 +#endif
8542 +
8543 +#if REISER4_DEBUG
8544 +/* per-thread information about lock acquired by this thread. Used by lock
8545 + * ordering checking in spin_macros.h */
8546 +typedef struct reiser4_lock_counters_info {
8547 + int rw_locked_tree;
8548 + int read_locked_tree;
8549 + int write_locked_tree;
8550 +
8551 + int rw_locked_dk;
8552 + int read_locked_dk;
8553 + int write_locked_dk;
8554 +
8555 + int rw_locked_cbk_cache;
8556 + int read_locked_cbk_cache;
8557 + int write_locked_cbk_cache;
8558 +
8559 + int spin_locked_zlock;
8560 + int spin_locked_jnode;
8561 + int spin_locked_jload;
8562 + int spin_locked_txnh;
8563 + int spin_locked_atom;
8564 + int spin_locked_stack;
8565 + int spin_locked_txnmgr;
8566 + int spin_locked_ktxnmgrd;
8567 + int spin_locked_fq;
8568 + int spin_locked_inode;
8569 + int spin_locked_super_eflush;
8570 + int spin_locked;
8571 + int long_term_locked_znode;
8572 +
8573 + int inode_sem_r;
8574 + int inode_sem_w;
8575 +
8576 + int d_refs;
8577 + int x_refs;
8578 + int t_refs;
8579 +} reiser4_lock_counters_info;
8580 +
8581 +extern reiser4_lock_counters_info *reiser4_lock_counters(void);
8582 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8583 +
8584 +/* increment lock-counter @counter, if present */
8585 +#define LOCK_CNT_INC(counter) \
8586 + IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
8587 +
8588 +/* decrement lock-counter @counter, if present */
8589 +#define LOCK_CNT_DEC(counter) \
8590 + IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
8591 +
8592 +/* check that lock-counter is zero. This is for use in assertions */
8593 +#define LOCK_CNT_NIL(counter) \
8594 + IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
8595 +
8596 +/* check that lock-counter is greater than zero. This is for use in
8597 + * assertions */
8598 +#define LOCK_CNT_GTZ(counter) \
8599 + IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
8600 +#define LOCK_CNT_LT(counter,n) \
8601 + IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
8602 +
8603 +#else /* REISER4_DEBUG */
8604 +
8605 +/* no-op versions on the above */
8606 +
8607 +typedef struct reiser4_lock_counters_info {
8608 +} reiser4_lock_counters_info;
8609 +
8610 +#define reiser4_lock_counters() ((reiser4_lock_counters_info *)NULL)
8611 +#define LOCK_CNT_INC(counter) noop
8612 +#define LOCK_CNT_DEC(counter) noop
8613 +#define LOCK_CNT_NIL(counter) (1)
8614 +#define LOCK_CNT_GTZ(counter) (1)
8615 +#define LOCK_CNT_LT(counter,n) (1)
8616 +
8617 +#endif /* REISER4_DEBUG */
8618 +
8619 +#define assert_spin_not_locked(lock) BUG_ON(0)
8620 +#define assert_rw_write_locked(lock) BUG_ON(0)
8621 +#define assert_rw_read_locked(lock) BUG_ON(0)
8622 +#define assert_rw_locked(lock) BUG_ON(0)
8623 +#define assert_rw_not_write_locked(lock) BUG_ON(0)
8624 +#define assert_rw_not_read_locked(lock) BUG_ON(0)
8625 +#define assert_rw_not_locked(lock) BUG_ON(0)
8626 +
8627 +/* flags controlling debugging behavior. Are set through debug_flags=N mount
8628 + option. */
8629 +typedef enum {
8630 + /* print a lot of information during panic. When this is on all jnodes
8631 + * are listed. This can be *very* large output. Usually you don't want
8632 + * this. Especially over serial line. */
8633 + REISER4_VERBOSE_PANIC = 0x00000001,
8634 + /* print a lot of information during umount */
8635 + REISER4_VERBOSE_UMOUNT = 0x00000002,
8636 + /* print gathered statistics on umount */
8637 + REISER4_STATS_ON_UMOUNT = 0x00000004,
8638 + /* check node consistency */
8639 + REISER4_CHECK_NODE = 0x00000008
8640 +} reiser4_debug_flags;
8641 +
8642 +extern int is_in_reiser4_context(void);
8643 +
8644 +/*
8645 + * evaluate expression @e only if with reiser4 context
8646 + */
8647 +#define ON_CONTEXT(e) do { \
8648 + if(is_in_reiser4_context()) { \
8649 + e; \
8650 + } } while(0)
8651 +
8652 +/*
8653 + * evaluate expression @e only when within reiser4_context and debugging is
8654 + * on.
8655 + */
8656 +#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
8657 +
8658 +/*
8659 + * complain about unexpected function result and crash. Used in "default"
8660 + * branches of switch statements and alike to assert that invalid results are
8661 + * not silently ignored.
8662 + */
8663 +#define wrong_return_value( label, function ) \
8664 + impossible( label, "wrong return value from " function )
8665 +
8666 +/* Issue different types of reiser4 messages to the console */
8667 +#define warning( label, format, ... ) \
8668 + DCALL( KERN_WARNING, \
8669 + printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
8670 +#define notice( label, format, ... ) \
8671 + DCALL( KERN_NOTICE, \
8672 + printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
8673 +
8674 +/* mark not yet implemented functionality */
8675 +#define not_yet( label, format, ... ) \
8676 + reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
8677 +
8678 +extern void reiser4_do_panic(const char *format, ...)
8679 + __attribute__ ((noreturn, format(printf, 1, 2)));
8680 +
8681 +extern int reiser4_preempt_point(void);
8682 +extern void reiser4_print_stats(void);
8683 +
8684 +#if REISER4_DEBUG
8685 +extern int reiser4_no_counters_are_held(void);
8686 +extern int reiser4_commit_check_locks(void);
8687 +#else
8688 +#define reiser4_no_counters_are_held() (1)
8689 +#define reiser4_commit_check_locks() (1)
8690 +#endif
8691 +
8692 +/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8693 +#define IS_POW(i) \
8694 +({ \
8695 + typeof(i) __i; \
8696 + \
8697 + __i = (i); \
8698 + !(__i & (__i - 1)); \
8699 +})
8700 +
8701 +#define KERNEL_DEBUGGER (1)
8702 +
8703 +#if KERNEL_DEBUGGER
8704 +
8705 +extern void reiser4_debugtrap(void);
8706 +
8707 +/*
8708 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8709 + * kgdb is not compiled in, do nothing.
8710 + */
8711 +#define DEBUGON(cond) \
8712 +({ \
8713 + if (unlikely(cond)) \
8714 + reiser4_debugtrap(); \
8715 +})
8716 +#else
8717 +#define DEBUGON(cond) noop
8718 +#endif
8719 +
8720 +/*
8721 + * Error code tracing facility. (Idea is borrowed from XFS code.)
8722 + *
8723 + * Suppose some strange and/or unexpected code is returned from some function
8724 + * (for example, write(2) returns -EEXIST). It is possible to place a
8725 + * breakpoint in the reiser4_write(), but it is too late here. How to find out
8726 + * in what particular place -EEXIST was generated first?
8727 + *
8728 + * In reiser4 all places where actual error codes are produced (that is,
8729 + * statements of the form
8730 + *
8731 + * return -EFOO; // (1), or
8732 + *
8733 + * result = -EFOO; // (2)
8734 + *
8735 + * are replaced with
8736 + *
8737 + * return RETERR(-EFOO); // (1a), and
8738 + *
8739 + * result = RETERR(-EFOO); // (2a) respectively
8740 + *
8741 + * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
8742 + * printed in error and warning messages. Moreover, it's possible to put a
8743 + * conditional breakpoint in reiser4_return_err (low-level function called
8744 + * by RETERR() to do the actual work) to break into debugger immediately
8745 + * when particular error happens.
8746 + *
8747 + */
8748 +
8749 +#if REISER4_DEBUG
8750 +
8751 +/*
8752 + * data-type to store information about where error happened ("error site").
8753 + */
8754 +typedef struct err_site {
8755 + int code; /* error code */
8756 + const char *file; /* source file, filled by __FILE__ */
8757 + int line; /* source file line, filled by __LINE__ */
8758 +} err_site;
8759 +
8760 +extern void reiser4_return_err(int code, const char *file, int line);
8761 +
8762 +/*
8763 + * fill &get_current_context()->err_site with error information.
8764 + */
8765 +#define RETERR(code) \
8766 +({ \
8767 + typeof(code) __code; \
8768 + \
8769 + __code = (code); \
8770 + reiser4_return_err(__code, __FILE__, __LINE__); \
8771 + __code; \
8772 +})
8773 +
8774 +#else
8775 +
8776 +/*
8777 + * no-op versions of the above
8778 + */
8779 +
8780 +typedef struct err_site {
8781 +} err_site;
8782 +#define RETERR(code) code
8783 +#endif
8784 +
8785 +#if REISER4_LARGE_KEY
8786 +/*
8787 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
8788 + */
8789 +#define ON_LARGE_KEY(...) __VA_ARGS__
8790 +#else
8791 +#define ON_LARGE_KEY(...)
8792 +#endif
8793 +
8794 +/* __FS_REISER4_DEBUG_H__ */
8795 +#endif
8796 +
8797 +/* Make Linus happy.
8798 + Local variables:
8799 + c-indentation-style: "K&R"
8800 + mode-name: "LC"
8801 + c-basic-offset: 8
8802 + tab-width: 8
8803 + fill-column: 120
8804 + End:
8805 +*/
8806 diff -urN linux-2.6.20.orig/fs/reiser4/dformat.h linux-2.6.20/fs/reiser4/dformat.h
8807 --- linux-2.6.20.orig/fs/reiser4/dformat.h 1970-01-01 03:00:00.000000000 +0300
8808 +++ linux-2.6.20/fs/reiser4/dformat.h 2007-05-06 14:50:43.702976975 +0400
8809 @@ -0,0 +1,70 @@
8810 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8811 +
8812 +/* Formats of on-disk data and conversion functions. */
8813 +
8814 +/* put all item formats in the files describing the particular items,
8815 + our model is, everything you need to do to add an item to reiser4,
8816 + (excepting the changes to the plugin that uses the item which go
8817 + into the file defining that plugin), you put into one file. */
8818 +/* Data on disk are stored in little-endian format.
8819 + To declare fields of on-disk structures, use d8, d16, d32 and d64.
8820 + d??tocpu() and cputod??() to convert. */
8821 +
8822 +#if !defined( __FS_REISER4_DFORMAT_H__ )
8823 +#define __FS_REISER4_DFORMAT_H__
8824 +
8825 +#include <asm/byteorder.h>
8826 +#include <asm/unaligned.h>
8827 +#include <linux/types.h>
8828 +
8829 +typedef __u8 d8;
8830 +typedef __le16 d16;
8831 +typedef __le32 d32;
8832 +typedef __le64 d64;
8833 +
8834 +#define PACKED __attribute__((packed))
8835 +
8836 +/* data-type for block number */
8837 +typedef __u64 reiser4_block_nr;
8838 +
8839 +/* data-type for block number on disk, disk format */
8840 +typedef __le64 reiser4_dblock_nr;
8841 +
8842 +/**
8843 + * disk_addr_eq - compare disk addresses
8844 + * @b1: pointer to block number to compare
8845 + * @b2: pointer to block number to compare
8846 + *
8847 + * Returns true if disk addresses are the same
8848 + */
8849 +static inline int disk_addr_eq(const reiser4_block_nr *b1,
8850 + const reiser4_block_nr * b2)
8851 +{
8852 + assert("nikita-1033", b1 != NULL);
8853 + assert("nikita-1266", b2 != NULL);
8854 +
8855 + return !memcmp(b1, b2, sizeof *b1);
8856 +}
8857 +
8858 +/* structure of master reiser4 super block */
8859 +typedef struct reiser4_master_sb {
8860 + char magic[16]; /* "ReIsEr4" */
8861 + __le16 disk_plugin_id; /* id of disk layout plugin */
8862 + __le16 blocksize;
8863 + char uuid[16]; /* unique id */
8864 + char label[16]; /* filesystem label */
8865 + __le64 diskmap; /* location of the diskmap. 0 if not present */
8866 +} reiser4_master_sb;
8867 +
8868 +/* __FS_REISER4_DFORMAT_H__ */
8869 +#endif
8870 +
8871 +/*
8872 + * Local variables:
8873 + * c-indentation-style: "K&R"
8874 + * mode-name: "LC"
8875 + * c-basic-offset: 8
8876 + * tab-width: 8
8877 + * fill-column: 79
8878 + * End:
8879 + */
8880 diff -urN linux-2.6.20.orig/fs/reiser4/dscale.c linux-2.6.20/fs/reiser4/dscale.c
8881 --- linux-2.6.20.orig/fs/reiser4/dscale.c 1970-01-01 03:00:00.000000000 +0300
8882 +++ linux-2.6.20/fs/reiser4/dscale.c 2007-05-06 14:50:43.702976975 +0400
8883 @@ -0,0 +1,174 @@
8884 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8885 + * reiser4/README */
8886 +
8887 +/* Scalable on-disk integers */
8888 +
8889 +/*
8890 + * Various on-disk structures contain integer-like structures. Stat-data
8891 + * contain [yes, "data" is plural, check the dictionary] file size, link
8892 + * count; extent unit contains extent width etc. To accommodate for general
8893 + * case enough space is reserved to keep largest possible value. 64 bits in
8894 + * all cases above. But in overwhelming majority of cases numbers actually
8895 + * stored in these fields will be comparatively small and reserving 8 bytes is
8896 + * a waste of precious disk bandwidth.
8897 + *
8898 + * Scalable integers are one way to solve this problem. dscale_write()
8899 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
8900 + * depending on the magnitude of the value supplied. dscale_read() reads value
8901 + * previously stored by dscale_write().
8902 + *
8903 + * dscale_write() produces format not completely unlike of UTF: two highest
8904 + * bits of the first byte are used to store "tag". One of 4 possible tag
8905 + * values is chosen depending on the number being encoded:
8906 + *
8907 + * 0 ... 0x3f => 0 [table 1]
8908 + * 0x40 ... 0x3fff => 1
8909 + * 0x4000 ... 0x3fffffff => 2
8910 + * 0x40000000 ... 0xffffffffffffffff => 3
8911 + *
8912 + * (see dscale_range() function)
8913 + *
8914 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
8915 + * to be stored, so in this case there is no place in the first byte to store
8916 + * tag. For such values tag is stored in an extra 9th byte.
8917 + *
8918 + * As _highest_ bits are used for the test (which is natural) scaled integers
8919 + * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
8920 + * uses LITTLE-ENDIAN.
8921 + *
8922 + */
8923 +
8924 +#include "debug.h"
8925 +#include "dscale.h"
8926 +
8927 +/* return tag of scaled integer stored at @address */
8928 +static int gettag(const unsigned char *address)
8929 +{
8930 + /* tag is stored in two highest bits */
8931 + return (*address) >> 6;
8932 +}
8933 +
8934 +/* clear tag from value. Clear tag embedded into @value. */
8935 +static void cleartag(__u64 * value, int tag)
8936 +{
8937 + /*
8938 + * W-w-what ?!
8939 + *
8940 + * Actually, this is rather simple: @value passed here was read by
8941 + * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
8942 + * zeroes. Tag is still stored in the highest (arithmetically)
8943 + * non-zero bits of @value, but relative position of tag within __u64
8944 + * depends on @tag.
8945 + *
8946 + * For example if @tag is 0, it's stored 2 highest bits of lowest
8947 + * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
8948 + *
8949 + * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
8950 + * and its offset is (2 * 8) - 2 == 14 bits.
8951 + *
8952 + * See table 1 above for details.
8953 + *
8954 + * All these cases are captured by the formula:
8955 + */
8956 + *value &= ~(3 << (((1 << tag) << 3) - 2));
8957 + /*
8958 + * That is, clear two (3 == 0t11) bits at the offset
8959 + *
8960 + * 8 * (2 ^ tag) - 2,
8961 + *
8962 + * that is, two highest bits of (2 ^ tag)-th byte of @value.
8963 + */
8964 +}
8965 +
8966 +/* return tag for @value. See table 1 above for details. */
8967 +static int dscale_range(__u64 value)
8968 +{
8969 + if (value > 0x3fffffff)
8970 + return 3;
8971 + if (value > 0x3fff)
8972 + return 2;
8973 + if (value > 0x3f)
8974 + return 1;
8975 + return 0;
8976 +}
8977 +
8978 +/* restore value stored at @address by dscale_write() and return number of
8979 + * bytes consumed */
8980 +int dscale_read(unsigned char *address, __u64 * value)
8981 +{
8982 + int tag;
8983 +
8984 + /* read tag */
8985 + tag = gettag(address);
8986 + switch (tag) {
8987 + case 3:
8988 + /* In this case tag is stored in an extra byte, skip this byte
8989 + * and decode value stored in the next 8 bytes.*/
8990 + *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
8991 + /* worst case: 8 bytes for value itself plus one byte for
8992 + * tag. */
8993 + return 9;
8994 + case 0:
8995 + *value = get_unaligned(address);
8996 + break;
8997 + case 1:
8998 + *value = __be16_to_cpu(get_unaligned((__be16 *)address));
8999 + break;
9000 + case 2:
9001 + *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9002 + break;
9003 + default:
9004 + return RETERR(-EIO);
9005 + }
9006 + /* clear tag embedded into @value */
9007 + cleartag(value, tag);
9008 + /* number of bytes consumed is (2 ^ tag)---see table 1. */
9009 + return 1 << tag;
9010 +}
9011 +
9012 +/* store @value at @address and return number of bytes consumed */
9013 +int dscale_write(unsigned char *address, __u64 value)
9014 +{
9015 + int tag;
9016 + int shift;
9017 + __be64 v;
9018 + unsigned char *valarr;
9019 +
9020 + tag = dscale_range(value);
9021 + v = __cpu_to_be64(value);
9022 + valarr = (unsigned char *)&v;
9023 + shift = (tag == 3) ? 1 : 0;
9024 + memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9025 + *address |= (tag << 6);
9026 + return shift + (1 << tag);
9027 +}
9028 +
9029 +/* number of bytes required to store @value */
9030 +int dscale_bytes(__u64 value)
9031 +{
9032 + int bytes;
9033 +
9034 + bytes = 1 << dscale_range(value);
9035 + if (bytes == 8)
9036 + ++bytes;
9037 + return bytes;
9038 +}
9039 +
9040 +/* returns true if @value and @other require the same number of bytes to be
9041 + * stored. Used to detect when a data structure (like stat-data) has to be
9042 + * expanded or contracted. */
9043 +int dscale_fit(__u64 value, __u64 other)
9044 +{
9045 + return dscale_range(value) == dscale_range(other);
9046 +}
9047 +
9048 +/* Make Linus happy.
9049 + Local variables:
9050 + c-indentation-style: "K&R"
9051 + mode-name: "LC"
9052 + c-basic-offset: 8
9053 + tab-width: 8
9054 + fill-column: 120
9055 + scroll-step: 1
9056 + End:
9057 +*/
9058 diff -urN linux-2.6.20.orig/fs/reiser4/dscale.h linux-2.6.20/fs/reiser4/dscale.h
9059 --- linux-2.6.20.orig/fs/reiser4/dscale.h 1970-01-01 03:00:00.000000000 +0300
9060 +++ linux-2.6.20/fs/reiser4/dscale.h 2007-05-06 14:50:43.702976975 +0400
9061 @@ -0,0 +1,27 @@
9062 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9063 + * reiser4/README */
9064 +
9065 +/* Scalable on-disk integers. See dscale.c for details. */
9066 +
9067 +#if !defined( __FS_REISER4_DSCALE_H__ )
9068 +#define __FS_REISER4_DSCALE_H__
9069 +
9070 +#include "dformat.h"
9071 +
9072 +extern int dscale_read(unsigned char *address, __u64 * value);
9073 +extern int dscale_write(unsigned char *address, __u64 value);
9074 +extern int dscale_bytes(__u64 value);
9075 +extern int dscale_fit(__u64 value, __u64 other);
9076 +
9077 +/* __FS_REISER4_DSCALE_H__ */
9078 +#endif
9079 +
9080 +/* Make Linus happy.
9081 + Local variables:
9082 + c-indentation-style: "K&R"
9083 + mode-name: "LC"
9084 + c-basic-offset: 8
9085 + tab-width: 8
9086 + fill-column: 120
9087 + End:
9088 +*/
9089 diff -urN linux-2.6.20.orig/fs/reiser4/entd.c linux-2.6.20/fs/reiser4/entd.c
9090 --- linux-2.6.20.orig/fs/reiser4/entd.c 1970-01-01 03:00:00.000000000 +0300
9091 +++ linux-2.6.20/fs/reiser4/entd.c 2007-05-06 14:50:43.702976975 +0400
9092 @@ -0,0 +1,335 @@
9093 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9094 + * reiser4/README */
9095 +
9096 +/* Ent daemon. */
9097 +
9098 +#include "debug.h"
9099 +#include "txnmgr.h"
9100 +#include "tree.h"
9101 +#include "entd.h"
9102 +#include "super.h"
9103 +#include "context.h"
9104 +#include "reiser4.h"
9105 +#include "vfs_ops.h"
9106 +#include "page_cache.h"
9107 +#include "inode.h"
9108 +
9109 +#include <linux/sched.h> /* struct task_struct */
9110 +#include <linux/suspend.h>
9111 +#include <linux/kernel.h>
9112 +#include <linux/writeback.h>
9113 +#include <linux/time.h> /* INITIAL_JIFFIES */
9114 +#include <linux/backing-dev.h> /* bdi_write_congested */
9115 +#include <linux/wait.h>
9116 +#include <linux/kthread.h>
9117 +#include <linux/freezer.h>
9118 +
9119 +#define DEF_PRIORITY 12
9120 +#define MAX_ENTD_ITERS 10
9121 +
9122 +static void entd_flush(struct super_block *, struct wbq *);
9123 +static int entd(void *arg);
9124 +
9125 +/*
9126 + * set ->comm field of end thread to make its state visible to the user level
9127 + */
9128 +#define entd_set_comm(state) \
9129 + snprintf(current->comm, sizeof(current->comm), \
9130 + "ent:%s%s", super->s_id, (state))
9131 +
9132 +/**
9133 + * reiser4_init_entd - initialize entd context and start kernel daemon
9134 + * @super: super block to start ent thread for
9135 + *
9136 + * Creates entd contexts, starts kernel thread and waits until it
9137 + * initializes.
9138 + */
9139 +int reiser4_init_entd(struct super_block *super)
9140 +{
9141 + entd_context *ctx;
9142 +
9143 + assert("nikita-3104", super != NULL);
9144 +
9145 + ctx = get_entd_context(super);
9146 +
9147 + memset(ctx, 0, sizeof *ctx);
9148 + spin_lock_init(&ctx->guard);
9149 + init_waitqueue_head(&ctx->wait);
9150 +#if REISER4_DEBUG
9151 + INIT_LIST_HEAD(&ctx->flushers_list);
9152 +#endif
9153 + /* lists of writepage requests */
9154 + INIT_LIST_HEAD(&ctx->todo_list);
9155 + INIT_LIST_HEAD(&ctx->done_list);
9156 + /* start entd */
9157 + ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9158 + if (IS_ERR(ctx->tsk))
9159 + return PTR_ERR(ctx->tsk);
9160 + return 0;
9161 +}
9162 +
9163 +static void put_wbq(struct wbq *rq)
9164 +{
9165 + iput(rq->mapping->host);
9166 + complete(&rq->completion);
9167 +}
9168 +
9169 +/* ent should be locked */
9170 +static struct wbq *__get_wbq(entd_context * ent)
9171 +{
9172 + struct wbq *wbq;
9173 +
9174 + if (list_empty(&ent->todo_list))
9175 + return NULL;
9176 +
9177 + ent->nr_todo_reqs --;
9178 + wbq = list_entry(ent->todo_list.next, struct wbq, link);
9179 + list_del_init(&wbq->link);
9180 + return wbq;
9181 +}
9182 +
9183 +/* ent thread function */
9184 +static int entd(void *arg)
9185 +{
9186 + struct super_block *super;
9187 + entd_context *ent;
9188 + int done = 0;
9189 +
9190 + super = arg;
9191 + /* do_fork() just copies task_struct into the new
9192 + thread. ->fs_context shouldn't be copied of course. This shouldn't
9193 + be a problem for the rest of the code though.
9194 + */
9195 + current->journal_info = NULL;
9196 +
9197 + ent = get_entd_context(super);
9198 +
9199 + while (!done) {
9200 + try_to_freeze();
9201 +
9202 + spin_lock(&ent->guard);
9203 + while (ent->nr_todo_reqs != 0) {
9204 + struct wbq *rq;
9205 +
9206 + assert("", list_empty(&ent->done_list));
9207 +
9208 + /* take request from the queue head */
9209 + rq = __get_wbq(ent);
9210 + assert("", rq != NULL);
9211 + ent->cur_request = rq;
9212 + spin_unlock(&ent->guard);
9213 +
9214 + entd_set_comm("!");
9215 + entd_flush(super, rq);
9216 +
9217 + put_wbq(rq);
9218 +
9219 + /*
9220 + * wakeup all requestors and iput their inodes
9221 + */
9222 + spin_lock(&ent->guard);
9223 + while (!list_empty(&ent->done_list)) {
9224 + rq = list_entry(ent->done_list.next, struct wbq, link);
9225 + list_del_init(&rq->link);
9226 + ent->nr_done_reqs --;
9227 + spin_unlock(&ent->guard);
9228 + assert("", rq->written == 1);
9229 + put_wbq(rq);
9230 + spin_lock(&ent->guard);
9231 + }
9232 + }
9233 + spin_unlock(&ent->guard);
9234 +
9235 + entd_set_comm(".");
9236 +
9237 + {
9238 + DEFINE_WAIT(__wait);
9239 +
9240 + do {
9241 + prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9242 + if (kthread_should_stop()) {
9243 + done = 1;
9244 + break;
9245 + }
9246 + if (ent->nr_todo_reqs != 0)
9247 + break;
9248 + schedule();
9249 + } while (0);
9250 + finish_wait(&ent->wait, &__wait);
9251 + }
9252 + }
9253 + BUG_ON(ent->nr_todo_reqs != 0);
9254 + return 0;
9255 +}
9256 +
9257 +/**
9258 + * reiser4_done_entd - stop entd kernel thread
9259 + * @super: super block to stop ent thread for
9260 + *
9261 + * It is called on umount. Sends stop signal to entd and wait until it handles
9262 + * it.
9263 + */
9264 +void reiser4_done_entd(struct super_block *super)
9265 +{
9266 + entd_context *ent;
9267 +
9268 + assert("nikita-3103", super != NULL);
9269 +
9270 + ent = get_entd_context(super);
9271 + assert("zam-1055", ent->tsk != NULL);
9272 + kthread_stop(ent->tsk);
9273 +}
9274 +
9275 +/* called at the beginning of jnode_flush to register flusher thread with ent
9276 + * daemon */
9277 +void reiser4_enter_flush(struct super_block *super)
9278 +{
9279 + entd_context *ent;
9280 +
9281 + assert("zam-1029", super != NULL);
9282 + ent = get_entd_context(super);
9283 +
9284 + assert("zam-1030", ent != NULL);
9285 +
9286 + spin_lock(&ent->guard);
9287 + ent->flushers++;
9288 +#if REISER4_DEBUG
9289 + list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9290 +#endif
9291 + spin_unlock(&ent->guard);
9292 +}
9293 +
9294 +/* called at the end of jnode_flush */
9295 +void reiser4_leave_flush(struct super_block *super)
9296 +{
9297 + entd_context *ent;
9298 + int wake_up_ent;
9299 +
9300 + assert("zam-1027", super != NULL);
9301 + ent = get_entd_context(super);
9302 +
9303 + assert("zam-1028", ent != NULL);
9304 +
9305 + spin_lock(&ent->guard);
9306 + ent->flushers--;
9307 + wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9308 +#if REISER4_DEBUG
9309 + list_del_init(&get_current_context()->flushers_link);
9310 +#endif
9311 + spin_unlock(&ent->guard);
9312 + if (wake_up_ent)
9313 + wake_up(&ent->wait);
9314 +}
9315 +
9316 +#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9317 +
9318 +static void entd_flush(struct super_block *super, struct wbq *rq)
9319 +{
9320 + reiser4_context ctx;
9321 + int tmp;
9322 +
9323 + init_stack_context(&ctx, super);
9324 + ctx.entd = 1;
9325 + ctx.gfp_mask = GFP_NOFS;
9326 +
9327 + rq->wbc->range_start = page_offset(rq->page);
9328 + rq->wbc->range_end = rq->wbc->range_start +
9329 + (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9330 + tmp = rq->wbc->nr_to_write;
9331 + rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9332 +
9333 + if (rq->wbc->nr_to_write > 0) {
9334 + rq->wbc->range_start = 0;
9335 + rq->wbc->range_end = LLONG_MAX;
9336 + generic_sync_sb_inodes(super, rq->wbc);
9337 + }
9338 + rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9339 + reiser4_writeout(super, rq->wbc);
9340 +
9341 + context_set_commit_async(&ctx);
9342 + reiser4_exit_context(&ctx);
9343 +}
9344 +
9345 +/**
9346 + * write_page_by_ent - ask entd thread to flush this page as part of slum
9347 + * @page: page to be written
9348 + * @wbc: writeback control passed to reiser4_writepage
9349 + *
9350 + * Creates a request, puts it on entd list of requests, wakeups entd if
9351 + * necessary, waits until entd completes with the request.
9352 + */
9353 +int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9354 +{
9355 + struct super_block *sb;
9356 + struct inode *inode;
9357 + entd_context *ent;
9358 + struct wbq rq;
9359 +
9360 + assert("", PageLocked(page));
9361 + assert("", page->mapping != NULL);
9362 +
9363 + sb = page->mapping->host->i_sb;
9364 + ent = get_entd_context(sb);
9365 + assert("", ent && ent->done == 0);
9366 +
9367 + /*
9368 + * we are going to unlock page and ask ent thread to write the
9369 + * page. Re-dirty page before unlocking so that if ent thread fails to
9370 + * write it - it will remain dirty
9371 + */
9372 + reiser4_set_page_dirty_internal(page);
9373 +
9374 + /*
9375 + * pin inode in memory, unlock page, entd_flush will iput. We can not
9376 + * iput here becasue we can not allow delete_inode to be called here
9377 + */
9378 + inode = igrab(page->mapping->host);
9379 + unlock_page(page);
9380 + if (inode == NULL)
9381 + /* inode is getting freed */
9382 + return 0;
9383 +
9384 + /* init wbq */
9385 + INIT_LIST_HEAD(&rq.link);
9386 + rq.magic = WBQ_MAGIC;
9387 + rq.wbc = wbc;
9388 + rq.page = page;
9389 + rq.mapping = inode->i_mapping;
9390 + rq.node = NULL;
9391 + rq.written = 0;
9392 + init_completion(&rq.completion);
9393 +
9394 + /* add request to entd's list of writepage requests */
9395 + spin_lock(&ent->guard);
9396 + ent->nr_todo_reqs++;
9397 + list_add_tail(&rq.link, &ent->todo_list);
9398 + if (ent->nr_todo_reqs == 1)
9399 + wake_up(&ent->wait);
9400 +
9401 + spin_unlock(&ent->guard);
9402 +
9403 + /* wait until entd finishes */
9404 + wait_for_completion(&rq.completion);
9405 +
9406 + if (rq.written)
9407 + /* Eventually ENTD has written the page to disk. */
9408 + return 0;
9409 + return 0;
9410 +}
9411 +
9412 +int wbq_available(void)
9413 +{
9414 + struct super_block *sb = reiser4_get_current_sb();
9415 + entd_context *ent = get_entd_context(sb);
9416 + return ent->nr_todo_reqs;
9417 +}
9418 +
9419 +/*
9420 + * Local variables:
9421 + * c-indentation-style: "K&R"
9422 + * mode-name: "LC"
9423 + * c-basic-offset: 8
9424 + * tab-width: 8
9425 + * fill-column: 79
9426 + * End:
9427 + */
9428 diff -urN linux-2.6.20.orig/fs/reiser4/entd.h linux-2.6.20/fs/reiser4/entd.h
9429 --- linux-2.6.20.orig/fs/reiser4/entd.h 1970-01-01 03:00:00.000000000 +0300
9430 +++ linux-2.6.20/fs/reiser4/entd.h 2007-05-06 14:50:43.706978224 +0400
9431 @@ -0,0 +1,90 @@
9432 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9433 +
9434 +/* Ent daemon. */
9435 +
9436 +#ifndef __ENTD_H__
9437 +#define __ENTD_H__
9438 +
9439 +#include "context.h"
9440 +
9441 +#include <linux/fs.h>
9442 +#include <linux/completion.h>
9443 +#include <linux/wait.h>
9444 +#include <linux/spinlock.h>
9445 +#include <linux/sched.h> /* for struct task_struct */
9446 +
9447 +#define WBQ_MAGIC 0x7876dc76
9448 +
9449 +/* write-back request. */
9450 +struct wbq {
9451 + int magic;
9452 + struct list_head link; /* list head of this list is in entd context */
9453 + struct writeback_control *wbc;
9454 + struct page *page;
9455 + struct address_space *mapping;
9456 + struct completion completion;
9457 + jnode *node; /* set if ent thread captured requested page */
9458 + int written; /* set if ent thread wrote requested page */
9459 +};
9460 +
9461 +/* ent-thread context. This is used to synchronize starting/stopping ent
9462 + * threads. */
9463 +typedef struct entd_context {
9464 + /* wait queue that ent thread waits on for more work. It's
9465 + * signaled by write_page_by_ent(). */
9466 + wait_queue_head_t wait;
9467 + /* spinlock protecting other fields */
9468 + spinlock_t guard;
9469 + /* ent thread */
9470 + struct task_struct *tsk;
9471 + /* set to indicate that ent thread should leave. */
9472 + int done;
9473 + /* counter of active flushers */
9474 + int flushers;
9475 + /*
9476 + * when reiser4_writepage asks entd to write a page - it adds struct
9477 + * wbq to this list
9478 + */
9479 + struct list_head todo_list;
9480 + /* number of elements on the above list */
9481 + int nr_todo_reqs;
9482 +
9483 + struct wbq *cur_request;
9484 + /*
9485 + * when entd writes a page it moves write-back request from todo_list
9486 + * to done_list. This list is used at the end of entd iteration to
9487 + * wakeup requestors and iput inodes.
9488 + */
9489 + struct list_head done_list;
9490 + /* number of elements on the above list */
9491 + int nr_done_reqs;
9492 +
9493 +#if REISER4_DEBUG
9494 + /* list of all active flushers */
9495 + struct list_head flushers_list;
9496 +#endif
9497 +} entd_context;
9498 +
9499 +extern int reiser4_init_entd(struct super_block *);
9500 +extern void reiser4_done_entd(struct super_block *);
9501 +
9502 +extern void reiser4_enter_flush(struct super_block *);
9503 +extern void reiser4_leave_flush(struct super_block *);
9504 +
9505 +extern int write_page_by_ent(struct page *, struct writeback_control *);
9506 +extern int wbq_available(void);
9507 +extern void ent_writes_page(struct super_block *, struct page *);
9508 +
9509 +extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9510 +/* __ENTD_H__ */
9511 +#endif
9512 +
9513 +/* Make Linus happy.
9514 + Local variables:
9515 + c-indentation-style: "K&R"
9516 + mode-name: "LC"
9517 + c-basic-offset: 8
9518 + tab-width: 8
9519 + fill-column: 120
9520 + End:
9521 +*/
9522 diff -urN linux-2.6.20.orig/fs/reiser4/eottl.c linux-2.6.20/fs/reiser4/eottl.c
9523 --- linux-2.6.20.orig/fs/reiser4/eottl.c 1970-01-01 03:00:00.000000000 +0300
9524 +++ linux-2.6.20/fs/reiser4/eottl.c 2007-05-06 14:50:43.706978224 +0400
9525 @@ -0,0 +1,509 @@
9526 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9527 +
9528 +#include "forward.h"
9529 +#include "debug.h"
9530 +#include "key.h"
9531 +#include "coord.h"
9532 +#include "plugin/item/item.h"
9533 +#include "plugin/node/node.h"
9534 +#include "znode.h"
9535 +#include "block_alloc.h"
9536 +#include "tree_walk.h"
9537 +#include "tree_mod.h"
9538 +#include "carry.h"
9539 +#include "tree.h"
9540 +#include "super.h"
9541 +
9542 +#include <linux/types.h> /* for __u?? */
9543 +
9544 +/*
9545 + * Extents on the twig level (EOTTL) handling.
9546 + *
9547 + * EOTTL poses some problems to the tree traversal, that are better explained
9548 + * by example.
9549 + *
9550 + * Suppose we have block B1 on the twig level with the following items:
9551 + *
9552 + * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9553 + * offset)
9554 + * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9555 + * 2. internal item I2 with key (10:0:0:0)
9556 + *
9557 + * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
9558 + * then intra-node lookup is done. This lookup finished on the E1, because the
9559 + * key we are looking for is larger than the key of E1 and is smaller than key
9560 + * the of I2.
9561 + *
9562 + * Here search is stuck.
9563 + *
9564 + * After some thought it is clear what is wrong here: extents on the twig level
9565 + * break some basic property of the *search* tree (on the pretext, that they
9566 + * restore property of balanced tree).
9567 + *
9568 + * Said property is the following: if in the internal node of the search tree
9569 + * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9570 + * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9571 + * through the Pointer.
9572 + *
9573 + * This is not true, when Pointer is Extent-Pointer, simply because extent
9574 + * cannot expand indefinitely to the right to include any item with
9575 + *
9576 + * Key1 <= Key <= Key2.
9577 + *
9578 + * For example, our E1 extent is only responsible for the data with keys
9579 + *
9580 + * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9581 + *
9582 + * so, key range
9583 + *
9584 + * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9585 + *
9586 + * is orphaned: there is no way to get there from the tree root.
9587 + *
9588 + * In other words, extent pointers are different than normal child pointers as
9589 + * far as search tree is concerned, and this creates such problems.
9590 + *
9591 + * Possible solution for this problem is to insert our item into node pointed
9592 + * to by I2. There are some problems through:
9593 + *
9594 + * (1) I2 can be in a different node.
9595 + * (2) E1 can be immediately followed by another extent E2.
9596 + *
9597 + * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9598 + * for locks/coords as necessary.
9599 + *
9600 + * (2) is more complex. Solution here is to insert new empty leaf node and
9601 + * insert internal item between E1 and E2 pointing to said leaf node. This is
9602 + * further complicated by possibility that E2 is in a different node, etc.
9603 + *
9604 + * Problems:
9605 + *
9606 + * (1) if there was internal item I2 immediately on the right of an extent E1
9607 + * we and we decided to insert new item S1 into node N2 pointed to by I2, then
9608 + * key of S1 will be less than smallest key in the N2. Normally, search key
9609 + * checks that key we are looking for is in the range of keys covered by the
9610 + * node key is being looked in. To work around of this situation, while
9611 + * preserving useful consistency check new flag CBK_TRUST_DK was added to the
9612 + * cbk falgs bitmask. This flag is automatically set on entrance to the
9613 + * coord_by_key() and is only cleared when we are about to enter situation
9614 + * described above.
9615 + *
9616 + * (2) If extent E1 is immediately followed by another extent E2 and we are
9617 + * searching for the key that is between E1 and E2 we only have to insert new
9618 + * empty leaf node when coord_by_key was called for insertion, rather than just
9619 + * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
9620 + * the cbk falgs bitmask. This flag is automatically set by coord_by_key calls
9621 + * performed by insert_by_key() and friends.
9622 + *
9623 + * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
9624 + * case it requires modification of node content which is only possible under
9625 + * write lock. It may well happen that we only have read lock on the node where
9626 + * new internal pointer is to be inserted (common case: lookup of non-existent
9627 + * stat-data that fells between two extents). If only read lock is held, tree
9628 + * traversal is restarted with lock_level modified so that next time we hit
9629 + * this problem, write lock will be held. Once we have write lock, balancing
9630 + * will be performed.
9631 + */
9632 +
9633 +/**
9634 + * is_next_item_internal - check whether next item is internal
9635 + * @coord: coordinate of extent item in twig node
9636 + * @key: search key
9637 + * @lh: twig node lock handle
9638 + *
9639 + * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
9640 + * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
9641 + * to that node, @coord is set to its first unit. If next item is not internal
9642 + * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
9643 + * is returned if search restart has to be done.
9644 + */
9645 +static int
9646 +is_next_item_internal(coord_t *coord, const reiser4_key *key,
9647 + lock_handle *lh)
9648 +{
9649 + coord_t next;
9650 + lock_handle rn;
9651 + int result;
9652 +
9653 + coord_dup(&next, coord);
9654 + if (coord_next_unit(&next) == 0) {
9655 + /* next unit is in this node */
9656 + if (item_is_internal(&next)) {
9657 + coord_dup(coord, &next);
9658 + return 1;
9659 + }
9660 + assert("vs-3", item_is_extent(&next));
9661 + return 0;
9662 + }
9663 +
9664 + /*
9665 + * next unit either does not exist or is in right neighbor. If it is in
9666 + * right neighbor we have to check right delimiting key because
9667 + * concurrent thread could get their first and insert item with a key
9668 + * smaller than @key
9669 + */
9670 + read_lock_dk(current_tree);
9671 + result = keycmp(key, znode_get_rd_key(coord->node));
9672 + read_unlock_dk(current_tree);
9673 + assert("vs-6", result != EQUAL_TO);
9674 + if (result == GREATER_THAN)
9675 + return 2;
9676 +
9677 + /* lock right neighbor */
9678 + init_lh(&rn);
9679 + result = reiser4_get_right_neighbor(&rn, coord->node,
9680 + znode_is_wlocked(coord->node) ?
9681 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9682 + GN_CAN_USE_UPPER_LEVELS);
9683 + if (result == -E_NO_NEIGHBOR) {
9684 + /* we are on the rightmost edge of the tree */
9685 + done_lh(&rn);
9686 + return 0;
9687 + }
9688 +
9689 + if (result) {
9690 + assert("vs-4", result < 0);
9691 + done_lh(&rn);
9692 + return result;
9693 + }
9694 +
9695 + /*
9696 + * check whether concurrent thread managed to insert item with a key
9697 + * smaller than @key
9698 + */
9699 + read_lock_dk(current_tree);
9700 + result = keycmp(key, znode_get_ld_key(rn.node));
9701 + read_unlock_dk(current_tree);
9702 + assert("vs-6", result != EQUAL_TO);
9703 + if (result == GREATER_THAN) {
9704 + done_lh(&rn);
9705 + return 2;
9706 + }
9707 +
9708 + result = zload(rn.node);
9709 + if (result) {
9710 + assert("vs-5", result < 0);
9711 + done_lh(&rn);
9712 + return result;
9713 + }
9714 +
9715 + coord_init_first_unit(&next, rn.node);
9716 + if (item_is_internal(&next)) {
9717 + /*
9718 + * next unit is in right neighbor and it is an unit of internal
9719 + * item. Unlock coord->node. Move @lh to right neighbor. @coord
9720 + * is set to the first unit of right neighbor.
9721 + */
9722 + coord_dup(coord, &next);
9723 + zrelse(rn.node);
9724 + done_lh(lh);
9725 + move_lh(lh, &rn);
9726 + return 1;
9727 + }
9728 +
9729 + /*
9730 + * next unit is unit of extent item. Return without chaning @lh and
9731 + * @coord.
9732 + */
9733 + assert("vs-6", item_is_extent(&next));
9734 + zrelse(rn.node);
9735 + done_lh(&rn);
9736 + return 0;
9737 +}
9738 +
9739 +/**
9740 + * rd_key - calculate key of an item next to the given one
9741 + * @coord: position in a node
9742 + * @key: storage for result key
9743 + *
9744 + * @coord is set between items or after the last item in a node. Calculate key
9745 + * of item to the right of @coord.
9746 + */
9747 +static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
9748 +{
9749 + coord_t dup;
9750 +
9751 + assert("nikita-2281", coord_is_between_items(coord));
9752 + coord_dup(&dup, coord);
9753 +
9754 + if (coord_set_to_right(&dup) == 0)
9755 + /* next item is in this node. Return its key. */
9756 + unit_key_by_coord(&dup, key);
9757 + else {
9758 + /*
9759 + * next item either does not exist or is in right
9760 + * neighbor. Return znode's right delimiting key.
9761 + */
9762 + read_lock_dk(current_tree);
9763 + *key = *znode_get_rd_key(coord->node);
9764 + read_unlock_dk(current_tree);
9765 + }
9766 + return key;
9767 +}
9768 +
9769 +/**
9770 + * add_empty_leaf - insert empty leaf between two extents
9771 + * @insert_coord: position in twig node between two extents
9772 + * @lh: twig node lock handle
9773 + * @key: left delimiting key of new node
9774 + * @rdkey: right delimiting key of new node
9775 + *
9776 + * Inserts empty leaf node between two extent items. It is necessary when we
9777 + * have to insert an item on leaf level between two extents (items on the twig
9778 + * level).
9779 + */
9780 +static int
9781 +add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
9782 + const reiser4_key *key, const reiser4_key *rdkey)
9783 +{
9784 + int result;
9785 + carry_pool *pool;
9786 + carry_level *todo;
9787 + reiser4_item_data *item;
9788 + carry_insert_data *cdata;
9789 + carry_op *op;
9790 + znode *node;
9791 + reiser4_tree *tree;
9792 +
9793 + assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
9794 + tree = znode_get_tree(insert_coord->node);
9795 + node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
9796 + if (IS_ERR(node))
9797 + return PTR_ERR(node);
9798 +
9799 + /* setup delimiting keys for node being inserted */
9800 + write_lock_dk(tree);
9801 + znode_set_ld_key(node, key);
9802 + znode_set_rd_key(node, rdkey);
9803 + ON_DEBUG(node->creator = current);
9804 + ON_DEBUG(node->first_key = *key);
9805 + write_unlock_dk(tree);
9806 +
9807 + ZF_SET(node, JNODE_ORPHAN);
9808 +
9809 + /*
9810 + * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
9811 + * carry_insert_data
9812 + */
9813 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
9814 + sizeof(*item) + sizeof(*cdata));
9815 + if (IS_ERR(pool))
9816 + return PTR_ERR(pool);
9817 + todo = (carry_level *) (pool + 1);
9818 + init_carry_level(todo, pool);
9819 +
9820 + item = (reiser4_item_data *) (todo + 3);
9821 + cdata = (carry_insert_data *) (item + 1);
9822 +
9823 + op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
9824 + if (!IS_ERR(op)) {
9825 + cdata->coord = insert_coord;
9826 + cdata->key = key;
9827 + cdata->data = item;
9828 + op->u.insert.d = cdata;
9829 + op->u.insert.type = COPT_ITEM_DATA;
9830 + build_child_ptr_data(node, item);
9831 + item->arg = NULL;
9832 + /* have @insert_coord to be set at inserted item after
9833 + insertion is done */
9834 + todo->track_type = CARRY_TRACK_CHANGE;
9835 + todo->tracked = lh;
9836 +
9837 + result = reiser4_carry(todo, NULL);
9838 + if (result == 0) {
9839 + /*
9840 + * pin node in memory. This is necessary for
9841 + * znode_make_dirty() below.
9842 + */
9843 + result = zload(node);
9844 + if (result == 0) {
9845 + lock_handle local_lh;
9846 +
9847 + /*
9848 + * if we inserted new child into tree we have
9849 + * to mark it dirty so that flush will be able
9850 + * to process it.
9851 + */
9852 + init_lh(&local_lh);
9853 + result = longterm_lock_znode(&local_lh, node,
9854 + ZNODE_WRITE_LOCK,
9855 + ZNODE_LOCK_LOPRI);
9856 + if (result == 0) {
9857 + znode_make_dirty(node);
9858 +
9859 + /*
9860 + * when internal item pointing to @node
9861 + * was inserted into twig node
9862 + * create_hook_internal did not connect
9863 + * it properly because its right
9864 + * neighbor was not known. Do it
9865 + * here
9866 + */
9867 + write_lock_tree(tree);
9868 + assert("nikita-3312",
9869 + znode_is_right_connected(node));
9870 + assert("nikita-2984",
9871 + node->right == NULL);
9872 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
9873 + write_unlock_tree(tree);
9874 + result =
9875 + connect_znode(insert_coord, node);
9876 + ON_DEBUG(if (result == 0) check_dkeys(node););
9877 +
9878 + done_lh(lh);
9879 + move_lh(lh, &local_lh);
9880 + assert("vs-1676", node_is_empty(node));
9881 + coord_init_first_unit(insert_coord,
9882 + node);
9883 + } else {
9884 + warning("nikita-3136",
9885 + "Cannot lock child");
9886 + }
9887 + done_lh(&local_lh);
9888 + zrelse(node);
9889 + }
9890 + }
9891 + } else
9892 + result = PTR_ERR(op);
9893 + zput(node);
9894 + done_carry_pool(pool);
9895 + return result;
9896 +}
9897 +
9898 +/**
9899 + * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
9900 + * @h: search handle
9901 + * @outcome: flag saying whether search has to restart or is done
9902 + *
9903 + * Handles search on twig level. If this function completes search itself then
9904 + * it returns 1. If search has to go one level down then 0 is returned. If
9905 + * error happens then LOOKUP_DONE is returned via @outcome and error code is saved
9906 + * in @h->result.
9907 + */
9908 +int handle_eottl(cbk_handle *h, int *outcome)
9909 +{
9910 + int result;
9911 + reiser4_key key;
9912 + coord_t *coord;
9913 +
9914 + coord = h->coord;
9915 +
9916 + if (h->level != TWIG_LEVEL ||
9917 + (coord_is_existing_item(coord) && item_is_internal(coord))) {
9918 + /* Continue to traverse tree downward. */
9919 + return 0;
9920 + }
9921 +
9922 + /*
9923 + * make sure that @h->coord is set to twig node and that it is either
9924 + * set to extent item or after extent item
9925 + */
9926 + assert("vs-356", h->level == TWIG_LEVEL);
9927 + assert("vs-357", ( {
9928 + coord_t lcoord;
9929 + coord_dup(&lcoord, coord);
9930 + check_me("vs-733", coord_set_to_left(&lcoord) == 0);
9931 + item_is_extent(&lcoord);
9932 + }
9933 + ));
9934 +
9935 + if (*outcome == NS_FOUND) {
9936 + /* we have found desired key on twig level in extent item */
9937 + h->result = CBK_COORD_FOUND;
9938 + *outcome = LOOKUP_DONE;
9939 + return 1;
9940 + }
9941 +
9942 + if (!(h->flags & CBK_FOR_INSERT)) {
9943 + /* tree traversal is not for insertion. Just return
9944 + CBK_COORD_NOTFOUND. */
9945 + h->result = CBK_COORD_NOTFOUND;
9946 + *outcome = LOOKUP_DONE;
9947 + return 1;
9948 + }
9949 +
9950 + /* take a look at the item to the right of h -> coord */
9951 + result = is_next_item_internal(coord, h->key, h->active_lh);
9952 + if (unlikely(result < 0)) {
9953 + h->error = "get_right_neighbor failed";
9954 + h->result = result;
9955 + *outcome = LOOKUP_DONE;
9956 + return 1;
9957 + }
9958 + if (result == 0) {
9959 + /*
9960 + * item to the right is also an extent one. Allocate a new node
9961 + * and insert pointer to it after item h -> coord.
9962 + *
9963 + * This is a result of extents being located at the twig
9964 + * level. For explanation, see comment just above
9965 + * is_next_item_internal().
9966 + */
9967 + znode *loaded;
9968 +
9969 + if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
9970 + /*
9971 + * we got node read locked, restart coord_by_key to
9972 + * have write lock on twig level
9973 + */
9974 + h->lock_level = TWIG_LEVEL;
9975 + h->lock_mode = ZNODE_WRITE_LOCK;
9976 + *outcome = LOOKUP_REST;
9977 + return 1;
9978 + }
9979 +
9980 + loaded = coord->node;
9981 + result =
9982 + add_empty_leaf(coord, h->active_lh, h->key,
9983 + rd_key(coord, &key));
9984 + if (result) {
9985 + h->error = "could not add empty leaf";
9986 + h->result = result;
9987 + *outcome = LOOKUP_DONE;
9988 + return 1;
9989 + }
9990 + /* added empty leaf is locked (h->active_lh), its parent node
9991 + is unlocked, h->coord is set as EMPTY */
9992 + assert("vs-13", coord->between == EMPTY_NODE);
9993 + assert("vs-14", znode_is_write_locked(coord->node));
9994 + assert("vs-15",
9995 + WITH_DATA(coord->node, node_is_empty(coord->node)));
9996 + assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
9997 + assert("vs-17", coord->node == h->active_lh->node);
9998 + *outcome = LOOKUP_DONE;
9999 + h->result = CBK_COORD_NOTFOUND;
10000 + return 1;
10001 + } else if (result == 1) {
10002 + /*
10003 + * this is special case mentioned in the comment on
10004 + * tree.h:cbk_flags. We have found internal item immediately on
10005 + * the right of extent, and we are going to insert new item
10006 + * there. Key of item we are going to insert is smaller than
10007 + * leftmost key in the node pointed to by said internal item
10008 + * (otherwise search wouldn't come to the extent in the first
10009 + * place).
10010 + *
10011 + * This is a result of extents being located at the twig
10012 + * level. For explanation, see comment just above
10013 + * is_next_item_internal().
10014 + */
10015 + h->flags &= ~CBK_TRUST_DK;
10016 + } else {
10017 + assert("vs-8", result == 2);
10018 + *outcome = LOOKUP_REST;
10019 + return 1;
10020 + }
10021 + assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10022 + return 0;
10023 +}
10024 +
10025 +/*
10026 + * Local variables:
10027 + * c-indentation-style: "K&R"
10028 + * mode-name: "LC"
10029 + * c-basic-offset: 8
10030 + * tab-width: 8
10031 + * fill-column: 120
10032 + * scroll-step: 1
10033 + * End:
10034 + */
10035 diff -urN linux-2.6.20.orig/fs/reiser4/estimate.c linux-2.6.20/fs/reiser4/estimate.c
10036 --- linux-2.6.20.orig/fs/reiser4/estimate.c 1970-01-01 03:00:00.000000000 +0300
10037 +++ linux-2.6.20/fs/reiser4/estimate.c 2007-05-06 14:50:43.706978224 +0400
10038 @@ -0,0 +1,111 @@
10039 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10040 +
10041 +#include "debug.h"
10042 +#include "dformat.h"
10043 +#include "tree.h"
10044 +#include "carry.h"
10045 +#include "inode.h"
10046 +#include "plugin/cluster.h"
10047 +#include "plugin/item/ctail.h"
10048 +
10049 +/* this returns how many nodes might get dirty and added nodes if @children nodes are dirtied
10050 +
10051 + Amount of internals which will get dirty or get allocated we estimate as 5% of the childs + 1 balancing. 1 balancing
10052 + is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes + the current (or 1
10053 + neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and 1 for a new root. So 5 for
10054 + leaf level, 3 for twig level, 2 on upper + 1 for root.
10055 +
10056 + Do not calculate the current node of the lowest level here - this is overhead only.
10057 +
10058 + children is almost always 1 here. Exception is flow insertion
10059 +*/
10060 +static reiser4_block_nr
10061 +max_balance_overhead(reiser4_block_nr childen, tree_level tree_height)
10062 +{
10063 + reiser4_block_nr ten_percent;
10064 +
10065 + ten_percent = ((103 * childen) >> 10);
10066 +
10067 + /* If we have too many balancings at the time, tree height can raise on more
10068 + then 1. Assume that if tree_height is 5, it can raise on 1 only. */
10069 + return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10070 +}
10071 +
10072 +/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10073 + perform insertion of one item into the tree */
10074 +/* it is only called when tree height changes, or gets initialized */
10075 +reiser4_block_nr calc_estimate_one_insert(tree_level height)
10076 +{
10077 + return 1 + max_balance_overhead(1, height);
10078 +}
10079 +
10080 +reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10081 +{
10082 + return tree->estimate_one_insert;
10083 +}
10084 +
10085 +/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10086 + perform insertion of one unit into an item in the tree */
10087 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10088 +{
10089 + /* estimate insert into item just like item insertion */
10090 + return tree->estimate_one_insert;
10091 +}
10092 +
10093 +reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10094 +{
10095 + /* on item removal reiser4 does not try to pack nodes more compactly, so, only one node may be dirtied on leaf
10096 + level */
10097 + return tree->estimate_one_insert;
10098 +}
10099 +
10100 +/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10101 + both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal
10102 + levels */
10103 +reiser4_block_nr estimate_insert_flow(tree_level height)
10104 +{
10105 + return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10106 + CARRY_FLOW_NEW_NODES_LIMIT,
10107 + height);
10108 +}
10109 +
10110 +/* returns the max number of nodes that can be occupied by a disk cluster */
10111 +static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10112 +{
10113 + int per_cluster;
10114 + per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10115 + return 3 + per_cluster +
10116 + max_balance_overhead(3 + per_cluster,
10117 + REISER4_MAX_ZTREE_HEIGHT);
10118 +}
10119 +
10120 +/* how many nodes might get dirty and added
10121 + during insertion of a disk cluster */
10122 +reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10123 +{
10124 + return estimate_cluster(inode, 1); /* 24 */
10125 +}
10126 +
10127 +/* how many nodes might get dirty and added
10128 + during update of a (prepped or unprepped) disk cluster */
10129 +reiser4_block_nr estimate_update_cluster(struct inode * inode)
10130 +{
10131 + return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10132 +}
10133 +
10134 +/* how many nodes occupied by a disk cluster might get dirty */
10135 +reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10136 +{
10137 + return cluster_nrpages(inode) + 4;
10138 +}
10139 +
10140 +/* Make Linus happy.
10141 + Local variables:
10142 + c-indentation-style: "K&R"
10143 + mode-name: "LC"
10144 + c-basic-offset: 8
10145 + tab-width: 8
10146 + fill-column: 120
10147 + scroll-step: 1
10148 + End:
10149 +*/
10150 diff -urN linux-2.6.20.orig/fs/reiser4/export_ops.c linux-2.6.20/fs/reiser4/export_ops.c
10151 --- linux-2.6.20.orig/fs/reiser4/export_ops.c 1970-01-01 03:00:00.000000000 +0300
10152 +++ linux-2.6.20/fs/reiser4/export_ops.c 2007-05-06 14:50:43.706978224 +0400
10153 @@ -0,0 +1,295 @@
10154 +/* Copyright 2005 by Hans Reiser, licensing governed by
10155 + * reiser4/README */
10156 +
10157 +#include "inode.h"
10158 +#include "plugin/plugin.h"
10159 +
10160 +/*
10161 + * Supported file-handle types
10162 + */
10163 +typedef enum {
10164 + FH_WITH_PARENT = 0x10, /* file handle with parent */
10165 + FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10166 +} reiser4_fhtype;
10167 +
10168 +#define NFSERROR (255)
10169 +
10170 +/* initialize place-holder for object */
10171 +static void object_on_wire_init(reiser4_object_on_wire *o)
10172 +{
10173 + o->plugin = NULL;
10174 +}
10175 +
10176 +/* finish with @o */
10177 +static void object_on_wire_done(reiser4_object_on_wire *o)
10178 +{
10179 + if (o->plugin != NULL)
10180 + o->plugin->wire.done(o);
10181 +}
10182 +
10183 +/*
10184 + * read serialized object identity from @addr and store information about
10185 + * object in @obj. This is dual to encode_inode().
10186 + */
10187 +static char *decode_inode(struct super_block *s, char *addr,
10188 + reiser4_object_on_wire * obj)
10189 +{
10190 + file_plugin *fplug;
10191 +
10192 + /* identifier of object plugin is stored in the first two bytes,
10193 + * followed by... */
10194 + fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10195 + if (fplug != NULL) {
10196 + addr += sizeof(d16);
10197 + obj->plugin = fplug;
10198 + assert("nikita-3520", fplug->wire.read != NULL);
10199 + /* plugin specific encoding of object identity. */
10200 + addr = fplug->wire.read(addr, obj);
10201 + } else
10202 + addr = ERR_PTR(RETERR(-EINVAL));
10203 + return addr;
10204 +}
10205 +
10206 +/**
10207 + * reiser4_decode_fh - decode_fh of export operations
10208 + * @super: super block
10209 + * @fh: nfsd file handle
10210 + * @len: length of file handle
10211 + * @fhtype: type of file handle
10212 + * @acceptable: acceptability testing function
10213 + * @context: argument for @acceptable
10214 + *
10215 + * Returns dentry referring to the same file as @fh.
10216 + */
10217 +static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10218 + int len, int fhtype,
10219 + int (*acceptable) (void *context,
10220 + struct dentry *de),
10221 + void *context)
10222 +{
10223 + reiser4_context *ctx;
10224 + reiser4_object_on_wire object;
10225 + reiser4_object_on_wire parent;
10226 + char *addr;
10227 + int with_parent;
10228 +
10229 + ctx = reiser4_init_context(super);
10230 + if (IS_ERR(ctx))
10231 + return (struct dentry *)ctx;
10232 +
10233 + assert("vs-1482",
10234 + fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10235 +
10236 + with_parent = (fhtype == FH_WITH_PARENT);
10237 +
10238 + addr = (char *)fh;
10239 +
10240 + object_on_wire_init(&object);
10241 + object_on_wire_init(&parent);
10242 +
10243 + addr = decode_inode(super, addr, &object);
10244 + if (!IS_ERR(addr)) {
10245 + if (with_parent)
10246 + addr = decode_inode(super, addr, &parent);
10247 + if (!IS_ERR(addr)) {
10248 + struct dentry *d;
10249 + typeof(super->s_export_op->find_exported_dentry) fn;
10250 +
10251 + fn = super->s_export_op->find_exported_dentry;
10252 + assert("nikita-3521", fn != NULL);
10253 + d = fn(super, &object, with_parent ? &parent : NULL,
10254 + acceptable, context);
10255 + if (d != NULL && !IS_ERR(d))
10256 + /* FIXME check for -ENOMEM */
10257 + reiser4_get_dentry_fsdata(d)->stateless = 1;
10258 + addr = (char *)d;
10259 + }
10260 + }
10261 +
10262 + object_on_wire_done(&object);
10263 + object_on_wire_done(&parent);
10264 +
10265 + reiser4_exit_context(ctx);
10266 + return (void *)addr;
10267 +}
10268 +
10269 +/*
10270 + * Object serialization support.
10271 + *
10272 + * To support knfsd file system provides export_operations that are used to
10273 + * construct and interpret NFS file handles. As a generalization of this,
10274 + * reiser4 object plugins have serialization support: it provides methods to
10275 + * create on-wire representation of identity of reiser4 object, and
10276 + * re-create/locate object given its on-wire identity.
10277 + *
10278 + */
10279 +
10280 +/*
10281 + * return number of bytes that on-wire representation of @inode's identity
10282 + * consumes.
10283 + */
10284 +static int encode_inode_size(struct inode *inode)
10285 +{
10286 + assert("nikita-3514", inode != NULL);
10287 + assert("nikita-3515", inode_file_plugin(inode) != NULL);
10288 + assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10289 +
10290 + return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10291 +}
10292 +
10293 +/*
10294 + * store on-wire representation of @inode's identity at the area beginning at
10295 + * @start.
10296 + */
10297 +static char *encode_inode(struct inode *inode, char *start)
10298 +{
10299 + assert("nikita-3517", inode != NULL);
10300 + assert("nikita-3518", inode_file_plugin(inode) != NULL);
10301 + assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10302 +
10303 + /*
10304 + * first, store two-byte identifier of object plugin, then
10305 + */
10306 + save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10307 + (d16 *) start);
10308 + start += sizeof(d16);
10309 + /*
10310 + * call plugin to serialize object's identity
10311 + */
10312 + return inode_file_plugin(inode)->wire.write(inode, start);
10313 +}
10314 +
10315 +/* this returns number of 32 bit long numbers encoded in @lenp. 255 is
10316 + * returned if file handle can not be stored */
10317 +/**
10318 + * reiser4_encode_fh - encode_fh of export operations
10319 + * @dentry:
10320 + * @fh:
10321 + * @lenp:
10322 + * @need_parent:
10323 + *
10324 + */
10325 +static int
10326 +reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10327 + int need_parent)
10328 +{
10329 + struct inode *inode;
10330 + struct inode *parent;
10331 + char *addr;
10332 + int need;
10333 + int delta;
10334 + int result;
10335 + reiser4_context *ctx;
10336 +
10337 + /*
10338 + * knfsd asks us to serialize object in @dentry, and, optionally its
10339 + * parent (if need_parent != 0).
10340 + *
10341 + * encode_inode() and encode_inode_size() is used to build
10342 + * representation of object and its parent. All hard work is done by
10343 + * object plugins.
10344 + */
10345 + inode = dentry->d_inode;
10346 + parent = dentry->d_parent->d_inode;
10347 +
10348 + addr = (char *)fh;
10349 +
10350 + need = encode_inode_size(inode);
10351 + if (need < 0)
10352 + return NFSERROR;
10353 + if (need_parent) {
10354 + delta = encode_inode_size(parent);
10355 + if (delta < 0)
10356 + return NFSERROR;
10357 + need += delta;
10358 + }
10359 +
10360 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
10361 + if (IS_ERR(ctx))
10362 + return PTR_ERR(ctx);
10363 +
10364 + if (need <= sizeof(__u32) * (*lenp)) {
10365 + addr = encode_inode(inode, addr);
10366 + if (need_parent)
10367 + addr = encode_inode(parent, addr);
10368 +
10369 + /* store in lenp number of 32bit words required for file
10370 + * handle. */
10371 + *lenp = (need + sizeof(__u32) - 1) >> 2;
10372 + result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10373 + } else
10374 + /* not enough space in file handle */
10375 + result = NFSERROR;
10376 + reiser4_exit_context(ctx);
10377 + return result;
10378 +}
10379 +
10380 +/**
10381 + * reiser4_get_dentry_parent - get_parent of export operations
10382 + * @child:
10383 + *
10384 + */
10385 +static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10386 +{
10387 + struct inode *dir;
10388 + dir_plugin *dplug;
10389 +
10390 + assert("nikita-3527", child != NULL);
10391 + /* see comment in reiser4_get_dentry() about following assertion */
10392 + assert("nikita-3528", is_in_reiser4_context());
10393 +
10394 + dir = child->d_inode;
10395 + assert("nikita-3529", dir != NULL);
10396 + dplug = inode_dir_plugin(dir);
10397 + assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10398 + if (dplug != NULL)
10399 + return dplug->get_parent(dir);
10400 + else
10401 + return ERR_PTR(RETERR(-ENOTDIR));
10402 +}
10403 +
10404 +/**
10405 + * reiser4_get_dentry - get_dentry of export operations
10406 + * @super:
10407 + * @data:
10408 + *
10409 + *
10410 + */
10411 +static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10412 +{
10413 + reiser4_object_on_wire *o;
10414 +
10415 + assert("nikita-3522", super != NULL);
10416 + assert("nikita-3523", data != NULL);
10417 + /*
10418 + * this is only supposed to be called by
10419 + *
10420 + * reiser4_decode_fh->find_exported_dentry
10421 + *
10422 + * so, reiser4_context should be here already.
10423 + */
10424 + assert("nikita-3526", is_in_reiser4_context());
10425 +
10426 + o = (reiser4_object_on_wire *)data;
10427 + assert("nikita-3524", o->plugin != NULL);
10428 + assert("nikita-3525", o->plugin->wire.get != NULL);
10429 +
10430 + return o->plugin->wire.get(super, o);
10431 +}
10432 +
10433 +struct export_operations reiser4_export_operations = {
10434 + .encode_fh = reiser4_encode_fh,
10435 + .decode_fh = reiser4_decode_fh,
10436 + .get_parent = reiser4_get_dentry_parent,
10437 + .get_dentry = reiser4_get_dentry
10438 +};
10439 +
10440 +/*
10441 + * Local variables:
10442 + * c-indentation-style: "K&R"
10443 + * mode-name: "LC"
10444 + * c-basic-offset: 8
10445 + * tab-width: 8
10446 + * fill-column: 79
10447 + * End:
10448 + */
10449 diff -urN linux-2.6.20.orig/fs/reiser4/flush.c linux-2.6.20/fs/reiser4/flush.c
10450 --- linux-2.6.20.orig/fs/reiser4/flush.c 1970-01-01 03:00:00.000000000 +0300
10451 +++ linux-2.6.20/fs/reiser4/flush.c 2007-05-06 14:50:43.000000000 +0400
10452 @@ -0,0 +1,3622 @@
10453 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10454 +
10455 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10456 +
10457 +#include "forward.h"
10458 +#include "debug.h"
10459 +#include "dformat.h"
10460 +#include "key.h"
10461 +#include "coord.h"
10462 +#include "plugin/item/item.h"
10463 +#include "plugin/plugin.h"
10464 +#include "plugin/object.h"
10465 +#include "txnmgr.h"
10466 +#include "jnode.h"
10467 +#include "znode.h"
10468 +#include "block_alloc.h"
10469 +#include "tree_walk.h"
10470 +#include "carry.h"
10471 +#include "tree.h"
10472 +#include "vfs_ops.h"
10473 +#include "inode.h"
10474 +#include "page_cache.h"
10475 +#include "wander.h"
10476 +#include "super.h"
10477 +#include "entd.h"
10478 +#include "reiser4.h"
10479 +#include "flush.h"
10480 +#include "writeout.h"
10481 +
10482 +#include <asm/atomic.h>
10483 +#include <linux/fs.h> /* for struct super_block */
10484 +#include <linux/mm.h> /* for struct page */
10485 +#include <linux/bio.h> /* for struct bio */
10486 +#include <linux/pagemap.h>
10487 +#include <linux/blkdev.h>
10488 +
10489 +/* IMPLEMENTATION NOTES */
10490 +
10491 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
10492 + order to the nodes of the tree in which the parent is placed before its children, which
10493 + are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
10494 + describes the node that "came before in forward parent-first order". When we speak of a
10495 + "parent-first follower", it describes the node that "comes next in parent-first
10496 + order" (alternatively the node that "came before in reverse parent-first order").
10497 +
10498 + The following pseudo-code prints the nodes of a tree in forward parent-first order:
10499 +
10500 + void parent_first (node)
10501 + {
10502 + print_node (node);
10503 + if (node->level > leaf) {
10504 + for (i = 0; i < num_children; i += 1) {
10505 + parent_first (node->child[i]);
10506 + }
10507 + }
10508 + }
10509 +*/
10510 +
10511 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
10512 + that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
10513 + can be accomplished with sequential reads, which results in reading nodes in their
10514 + parent-first order. This is a read-optimization aspect of the flush algorithm, and
10515 + there is also a write-optimization aspect, which is that we wish to make large
10516 + sequential writes to the disk by allocating or reallocating blocks so that they can be
10517 + written in sequence. Sometimes the read-optimization and write-optimization goals
10518 + conflict with each other, as we discuss in more detail below.
10519 +*/
10520 +
10521 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
10522 + the relevant jnode->state bits and their relevance to flush:
10523 +
10524 + JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
10525 + must be allocated first. In order to be considered allocated, the jnode must have
10526 + exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
10527 + all dirtied jnodes eventually have one of these bits set during each transaction.
10528 +
10529 + JNODE_CREATED: The node was freshly created in its transaction and has no previous
10530 + block address, so it is unconditionally assigned to be relocated, although this is
10531 + mainly for code-convenience. It is not being 'relocated' from anything, but in
10532 + almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
10533 + remains set even after JNODE_RELOC is set, so the actual relocate can be
10534 + distinguished from the created-and-allocated set easily: relocate-set members
10535 + (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
10536 + have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10537 +
10538 + JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
10539 + decision to maintain the pre-existing location for this node and it will be written
10540 + to the wandered-log.
10541 +
10542 + JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
10543 + not created, see note above). A block with JNODE_RELOC set is eligible for
10544 + early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
10545 + bit is set on a znode, the parent node's internal item is modified and the znode is
10546 + rehashed.
10547 +
10548 + JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
10549 + and calls plugin->f.squeeze() method for its items. By this technology we update disk
10550 + clusters of cryptcompress objects. Also if leftmost point that was found by flush scan
10551 + has this flag (races with write(), rare case) the flush algorithm makes the decision
10552 + to pass it to squalloc() in spite of its flushprepped status for squeezing, not for
10553 + repeated allocation.
10554 +
10555 + JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
10556 + flush queue. This means the jnode is not on any clean or dirty list, instead it is
10557 + moved to one of the flush queue (see flush_queue.h) object private list. This
10558 + prevents multiple concurrent flushes from attempting to start flushing from the
10559 + same node.
10560 +
10561 + (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10562 + squeeze-and-allocate on a node while its children are actively being squeezed and
10563 + allocated. This flag was created to avoid submitting a write request for a node
10564 + while its children are still being allocated and squeezed. Then flush queue was
10565 + re-implemented to allow unlimited number of nodes be queued. This flag support was
10566 + commented out in source code because we decided that there was no reason to submit
10567 + queued nodes before jnode_flush() finishes. However, current code calls fq_write()
10568 + during a slum traversal and may submit "busy nodes" to disk. Probably we can
10569 + re-enable the JNODE_FLUSH_BUSY bit support in future.
10570 +
10571 + With these state bits, we describe a test used frequently in the code below,
10572 + jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
10573 + test for "flushprepped" returns true if any of the following are true:
10574 +
10575 + - The node is not dirty
10576 + - The node has JNODE_RELOC set
10577 + - The node has JNODE_OVRWR set
10578 +
10579 + If either the node is not dirty or it has already been processed by flush (and assigned
10580 + JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
10581 + false then flush has work to do on that node.
10582 +*/
10583 +
10584 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10585 + flushprepped twice (unless an explicit call to flush_unprep is made as described in
10586 + detail below). For example a node is dirtied, allocated, and then early-flushed to
10587 + disk and set clean. Before the transaction commits, the page is dirtied again and, due
10588 + to memory pressure, the node is flushed again. The flush algorithm will not relocate
10589 + the node to a new disk location, it will simply write it to the same, previously
10590 + relocated position again.
10591 +*/
10592 +
10593 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
10594 + start at a leaf node and allocate in parent-first order by iterating to the right. At
10595 + each step of the iteration, we check for the right neighbor. Before advancing to the
10596 + right neighbor, we check if the current position and the right neighbor share the same
10597 + parent. If they do not share the same parent, the parent is allocated before the right
10598 + neighbor.
10599 +
10600 + This process goes recursively up the tree and squeeze nodes level by level as long as
10601 + the right neighbor and the current position have different parents, then it allocates
10602 + the right-neighbors-with-different-parents on the way back down. This process is
10603 + described in more detail in flush_squalloc_changed_ancestor and the recursive function
10604 + squalloc_one_changed_ancestor. But the purpose here is not to discuss the
10605 + specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
10606 + approaches.
10607 +
10608 + The top-down algorithm was implemented earlier (April-May 2002). In the top-down
10609 + approach, we find a starting point by scanning left along each level past dirty nodes,
10610 + then going up and repeating the process until the left node and the parent node are
10611 + clean. We then perform a parent-first traversal from the starting point, which makes
10612 + allocating in parent-first order trivial. After one subtree has been allocated in this
10613 + manner, we move to the right, try moving upward, then repeat the parent-first
10614 + traversal.
10615 +
10616 + Both approaches have problems that need to be addressed. Both are approximately the
10617 + same amount of code, but the bottom-up approach has advantages in the order it acquires
10618 + locks which, at the very least, make it the better approach. At first glance each one
10619 + makes the other one look simpler, so it is important to remember a few of the problems
10620 + with each one.
10621 +
10622 + Main problem with the top-down approach: When you encounter a clean child during the
10623 + parent-first traversal, what do you do? You would like to avoid searching through a
10624 + large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
10625 + obvious solution. One of the advantages of the top-down approach is that during the
10626 + parent-first traversal you check every child of a parent to see if it is dirty. In
10627 + this way, the top-down approach easily handles the main problem of the bottom-up
10628 + approach: unallocated children.
10629 +
10630 + The unallocated children problem is that before writing a node to disk we must make
10631 + sure that all of its children are allocated. Otherwise, the writing the node means
10632 + extra I/O because the node will have to be written again when the child is finally
10633 + allocated.
10634 +
10635 + WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
10636 + should not cause any file system corruption, it only degrades I/O performance because a
10637 + node may be written when it is sure to be written at least one more time in the same
10638 + transaction when the remaining children are allocated. What follows is a description
10639 + of how we will solve the problem.
10640 +*/
10641 +
10642 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
10643 + proceeding in parent first order, allocate some of its left-children, then encounter a
10644 + clean child in the middle of the parent. We do not allocate the clean child, but there
10645 + may remain unallocated (dirty) children to the right of the clean child. If we were to
10646 + stop flushing at this moment and write everything to disk, the parent might still
10647 + contain unallocated children.
10648 +
10649 + We could try to allocate all the descendents of every node that we allocate, but this
10650 + is not necessary. Doing so could result in allocating the entire tree: if the root
10651 + node is allocated then every unallocated node would have to be allocated before
10652 + flushing. Actually, we do not have to write a node just because we allocate it. It is
10653 + possible to allocate but not write a node during flush, when it still has unallocated
10654 + children. However, this approach is probably not optimal for the following reason.
10655 +
10656 + The flush algorithm is designed to allocate nodes in parent-first order in an attempt
10657 + to optimize reads that occur in the same order. Thus we are read-optimizing for a
10658 + left-to-right scan through all the leaves in the system, and we are hoping to
10659 + write-optimize at the same time because those nodes will be written together in batch.
10660 + What happens, however, if we assign a block number to a node in its read-optimized
10661 + order but then avoid writing it because it has unallocated children? In that
10662 + situation, we lose out on the write-optimization aspect because a node will have to be
10663 + written again to the its location on the device, later, which likely means seeking back
10664 + to that location.
10665 +
10666 + So there are tradeoffs. We can choose either:
10667 +
10668 + A. Allocate all unallocated children to preserve both write-optimization and
10669 + read-optimization, but this is not always desirable because it may mean having to
10670 + allocate and flush very many nodes at once.
10671 +
10672 + B. Defer writing nodes with unallocated children, keep their read-optimized locations,
10673 + but sacrifice write-optimization because those nodes will be written again.
10674 +
10675 + C. Defer writing nodes with unallocated children, but do not keep their read-optimized
10676 + locations. Instead, choose to write-optimize them later, when they are written. To
10677 + facilitate this, we "undo" the read-optimized allocation that was given to the node so
10678 + that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
10679 + case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
10680 + call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10681 + if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
10682 + location, and set the JNODE_CREATED bit, effectively setting the node back to an
10683 + unallocated state.
10684 +
10685 + We will take the following approach in v4.0: for twig nodes we will always finish
10686 + allocating unallocated children (A). For nodes with (level > TWIG) we will defer
10687 + writing and choose write-optimization (C).
10688 +
10689 + To summarize, there are several parts to a solution that avoids the problem with
10690 + unallocated children:
10691 +
10692 + FIXME-ZAM: Still no one approach is implemented to eliminate the "UNALLOCATED CHILDREN"
10693 + problem because there was an experiment which was done showed that we have 1-2 nodes
10694 + with unallocated children for thousands of written nodes. The experiment was simple
10695 + like coping / deletion of linux kernel sources. However the problem can arise in more
10696 + complex tests. I think we have jnode_io_hook to insert a check for unallocated
10697 + children and see what kind of problem we have.
10698 +
10699 + 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
10700 + squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
10701 + implement: should be simple -- amounts to adding a while loop to jnode_flush, see
10702 + comments in that function.
10703 +
10704 + 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
10705 + have unallocated children. If the twig level has unallocated children it is an
10706 + assertion failure. If a higher-level node has unallocated children, then it should be
10707 + explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
10708 + should be simple.
10709 +
10710 + 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
10711 + CPU cycles than we would like, and it is possible (but medium complexity) to optimize
10712 + this somewhat in the case where large sub-trees are flushed. The following observation
10713 + helps: if both the left- and right-neighbor of a node are processed by the flush
10714 + algorithm then the node itself is guaranteed to have all of its children allocated.
10715 + However, the cost of this check may not be so expensive after all: it is not needed for
10716 + leaves and flush can guarantee this property for twigs. That leaves only (level >
10717 + TWIG) nodes that have to be checked, so this optimization only helps if at least three
10718 + (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
10719 + there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
10720 + then the number of blocks being written will be very large, so the savings may be
10721 + insignificant. That said, the idea is to maintain both the left and right edges of
10722 + nodes that are processed in flush. When flush_empty_queue() is called, a relatively
10723 + simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
10724 + edge, the slow check is necessary, but if it is in the interior then it can be assumed
10725 + to have all of its children allocated. FIXME: medium complexity to implement, but
10726 + simple to verify given that we must have a slow check anyway.
10727 +
10728 + 4. (Optional) This part is optional, not for v4.0--flush should work independently of
10729 + whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
10730 + left-scan operation to take unallocated children into account. Normally, the left-scan
10731 + operation goes left as long as adjacent nodes are dirty up until some large maximum
10732 + value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
10733 + may stop at a position where there are unallocated children to the left with the same
10734 + parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
10735 + FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
10736 + with a rapid scan. The rapid scan skips all the interior children of a node--if the
10737 + leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
10738 + twig to the left). If the left neighbor of the leftmost child is also dirty, then
10739 + continue the scan at the left twig and repeat. This option will cause flush to
10740 + allocate more twigs in a single pass, but it also has the potential to write many more
10741 + nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
10742 + was partially implemented, code removed August 12, 2002 by JMACD.
10743 +*/
10744 +
10745 +/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
10746 + starting point for flush is a leaf node, but actually the flush code cares very little
10747 + about whether or not this is true. It is possible that all the leaf nodes are flushed
10748 + and dirty parent nodes still remain, in which case jnode_flush() is called on a
10749 + non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
10750 + leaf, even when it is not. This is a simple approach, and there may be a more optimal
10751 + policy but until a problem with this approach is discovered, simplest is probably best.
10752 +
10753 + NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
10754 + the leaves. This is done as a matter of simplicity and there is only one (shaky)
10755 + justification. When an atom commits, it flushes all leaf level nodes first, followed
10756 + by twigs, and so on. With flushing done in this order, if flush is eventually called
10757 + on a non-leaf node it means that (somehow) we reached a point where all leaves are
10758 + clean and only internal nodes need to be flushed. If that it the case, then it means
10759 + there were no leaves that were the parent-first preceder/follower of the parent. This
10760 + is expected to be a rare case, which is why we do nothing special about it. However,
10761 + memory pressure may pass an internal node to flush when there are still dirty leaf
10762 + nodes that need to be flushed, which could prove our original assumptions
10763 + "inoperative". If this needs to be fixed, then scan_left/right should have
10764 + special checks for the non-leaf levels. For example, instead of passing from a node to
10765 + the left neighbor, it should pass from the node to the left neighbor's rightmost
10766 + descendent (if dirty).
10767 +
10768 +*/
10769 +
10770 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
10771 + it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
10772 + logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
10773 + device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
10774 + device becomes sorted such that tree order and block number order fully correlate.
10775 +
10776 + Resizing is done by shifting everything either all the way to the left or all the way
10777 + to the right, and then reporting the last block.
10778 +*/
10779 +
10780 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
10781 + descibes the policy from the highest level:
10782 +
10783 + The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
10784 + leaf level during flush-scan (right, left), then we unconditionally decide to relocate
10785 + leaf nodes.
10786 +
10787 + Otherwise, there are two contexts in which we make a decision to relocate:
10788 +
10789 + 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
10790 + During the initial stages of flush, after scan-right completes, we want to ask the
10791 + question: should we relocate this leaf node and thus dirty the parent node. Then if
10792 + the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
10793 + the question at the next level up, and so on. In these cases we are moving in the
10794 + reverse-parent first direction.
10795 +
10796 + There is another case which is considered the reverse direction, which comes at the end
10797 + of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
10798 + reach a point where there is a clean twig to the right with a dirty leftmost child. In
10799 + this case, we may wish to relocate the child by testing if it should be relocated
10800 + relative to its parent.
10801 +
10802 + 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
10803 + allocate_znode. What distinguishes the forward parent-first case from the
10804 + reverse-parent first case is that the preceder has already been allocated in the
10805 + forward case, whereas in the reverse case we don't know what the preceder is until we
10806 + finish "going in reverse". That simplifies the forward case considerably, and there we
10807 + actually use the block allocator to determine whether, e.g., a block closer to the
10808 + preceder is available.
10809 +*/
10810 +
10811 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
10812 + finish scan-left and find a starting point, if the parent's left neighbor is dirty then
10813 + squeeze the parent's left neighbor and the parent. This may change the
10814 + flush-starting-node's parent. Repeat until the child's parent is stable. If the child
10815 + is a leftmost child, repeat this left-edge squeezing operation at the next level up.
10816 + Note that we cannot allocate extents during this or they will be out of parent-first
10817 + order. There is also some difficult coordinate maintenence issues. We can't do a tree
10818 + search to find coordinates again (because we hold locks), we have to determine them
10819 + from the two nodes being squeezed. Looks difficult, but has potential to increase
10820 + space utilization. */
10821 +
10822 +/* Flush-scan helper functions. */
10823 +static void scan_init(flush_scan * scan);
10824 +static void scan_done(flush_scan * scan);
10825 +
10826 +/* Flush-scan algorithm. */
10827 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
10828 + unsigned limit);
10829 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
10830 +static int scan_common(flush_scan * scan, flush_scan * other);
10831 +static int scan_formatted(flush_scan * scan);
10832 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
10833 +static int scan_by_coord(flush_scan * scan);
10834 +
10835 +/* Initial flush-point ancestor allocation. */
10836 +static int alloc_pos_and_ancestors(flush_pos_t * pos);
10837 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
10838 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
10839 +
10840 +/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
10841 +static int squalloc(flush_pos_t * pos);
10842 +
10843 +/* Flush squeeze implementation. */
10844 +static int squeeze_right_non_twig(znode * left, znode * right);
10845 +static int shift_one_internal_unit(znode * left, znode * right);
10846 +
10847 +/* Flush reverse parent-first relocation routines. */
10848 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
10849 + const reiser4_block_nr * nblk);
10850 +static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
10851 + flush_pos_t * pos);
10852 +static int reverse_relocate_check_dirty_parent(jnode * node,
10853 + const coord_t * parent_coord,
10854 + flush_pos_t * pos);
10855 +
10856 +/* Flush allocate write-queueing functions: */
10857 +static int allocate_znode(znode * node, const coord_t * parent_coord,
10858 + flush_pos_t * pos);
10859 +static int allocate_znode_update(znode * node, const coord_t * parent_coord,
10860 + flush_pos_t * pos);
10861 +static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
10862 +
10863 +/* Flush helper functions: */
10864 +static int jnode_lock_parent_coord(jnode * node,
10865 + coord_t * coord,
10866 + lock_handle * parent_lh,
10867 + load_count * parent_zh,
10868 + znode_lock_mode mode, int try);
10869 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
10870 + znode_lock_mode mode, int check_dirty);
10871 +static int znode_same_parents(znode * a, znode * b);
10872 +
10873 +static int znode_check_flushprepped(znode * node)
10874 +{
10875 + return jnode_check_flushprepped(ZJNODE(node));
10876 +}
10877 +
10878 +/* Flush position functions */
10879 +static void pos_init(flush_pos_t * pos);
10880 +static int pos_valid(flush_pos_t * pos);
10881 +static void pos_done(flush_pos_t * pos);
10882 +static int pos_stop(flush_pos_t * pos);
10883 +
10884 +/* check that @org is first jnode extent unit, if extent is unallocated,
10885 + * because all jnodes of unallocated extent are dirty and of the same atom. */
10886 +#define checkchild(scan) \
10887 +assert("nikita-3435", \
10888 + ergo(scan->direction == LEFT_SIDE && \
10889 + (scan->parent_coord.node->level == TWIG_LEVEL) && \
10890 + jnode_is_unformatted(scan->node) && \
10891 + extent_is_unallocated(&scan->parent_coord), \
10892 + extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
10893 +
10894 +/* This flush_cnt variable is used to track the number of concurrent flush operations,
10895 + useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
10896 + no static initializer function...) */
10897 +ON_DEBUG(atomic_t flush_cnt;
10898 + )
10899 +
10900 +/* check fs backing device for write congestion */
10901 +static int check_write_congestion(void)
10902 +{
10903 + struct super_block *sb;
10904 + struct backing_dev_info *bdi;
10905 +
10906 + sb = reiser4_get_current_sb();
10907 + bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
10908 + return bdi_write_congested(bdi);
10909 +}
10910 +
10911 +/* conditionally write flush queue */
10912 +static int write_prepped_nodes(flush_pos_t * pos)
10913 +{
10914 + int ret;
10915 +
10916 + assert("zam-831", pos);
10917 + assert("zam-832", pos->fq);
10918 +
10919 + if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
10920 + return 0;
10921 +
10922 + if (check_write_congestion())
10923 + return 0;
10924 +
10925 + ret = reiser4_write_fq(pos->fq, pos->nr_written,
10926 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
10927 + return ret;
10928 +}
10929 +
10930 +/* Proper release all flush pos. resources then move flush position to new
10931 + locked node */
10932 +static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
10933 + load_count * new_load, const coord_t * new_coord)
10934 +{
10935 + assert("zam-857", new_lock->node == new_load->node);
10936 +
10937 + if (new_coord) {
10938 + assert("zam-858", new_coord->node == new_lock->node);
10939 + coord_dup(&pos->coord, new_coord);
10940 + } else {
10941 + coord_init_first_unit(&pos->coord, new_lock->node);
10942 + }
10943 +
10944 + if (pos->child) {
10945 + jput(pos->child);
10946 + pos->child = NULL;
10947 + }
10948 +
10949 + move_load_count(&pos->load, new_load);
10950 + done_lh(&pos->lock);
10951 + move_lh(&pos->lock, new_lock);
10952 +}
10953 +
10954 +/* delete empty node which link from the parent still exists. */
10955 +static int delete_empty_node(znode * node)
10956 +{
10957 + reiser4_key smallest_removed;
10958 +
10959 + assert("zam-1019", node != NULL);
10960 + assert("zam-1020", node_is_empty(node));
10961 + assert("zam-1023", znode_is_wlocked(node));
10962 +
10963 + return reiser4_delete_node(node, &smallest_removed, NULL, 1);
10964 +}
10965 +
10966 +/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
10967 +static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
10968 +{
10969 + int ret;
10970 + load_count load;
10971 + lock_handle lock;
10972 +
10973 + init_lh(&lock);
10974 + init_load_count(&load);
10975 +
10976 + if (jnode_is_znode(org)) {
10977 + ret = longterm_lock_znode(&lock, JZNODE(org),
10978 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
10979 + if (ret)
10980 + return ret;
10981 +
10982 + ret = incr_load_count_znode(&load, JZNODE(org));
10983 + if (ret)
10984 + return ret;
10985 +
10986 + pos->state =
10987 + (jnode_get_level(org) ==
10988 + LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
10989 + move_flush_pos(pos, &lock, &load, NULL);
10990 + } else {
10991 + coord_t parent_coord;
10992 + ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
10993 + &load, ZNODE_WRITE_LOCK, 0);
10994 + if (ret)
10995 + goto done;
10996 + if (!item_is_extent(&parent_coord)) {
10997 + /* file was converted to tail, org became HB, we found internal
10998 + item */
10999 + ret = -EAGAIN;
11000 + goto done;
11001 + }
11002 +
11003 + pos->state = POS_ON_EPOINT;
11004 + move_flush_pos(pos, &lock, &load, &parent_coord);
11005 + pos->child = jref(org);
11006 + if (extent_is_unallocated(&parent_coord)
11007 + && extent_unit_index(&parent_coord) != index_jnode(org)) {
11008 + /* @org is not first child of its parent unit. This may happen
11009 + because longerm lock of its parent node was released between
11010 + scan_left and scan_right. For now work around this having flush to repeat */
11011 + ret = -EAGAIN;
11012 + }
11013 + }
11014 +
11015 + done:
11016 + done_load_count(&load);
11017 + done_lh(&lock);
11018 + return ret;
11019 +}
11020 +
11021 +/* TODO LIST (no particular order): */
11022 +/* I have labelled most of the legitimate FIXME comments in this file with letters to
11023 + indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11024 + specific names mentioned instead that need to be inspected/resolved. */
11025 +/* B. There is an issue described in reverse_relocate_test having to do with an
11026 + imprecise is_preceder? check having to do with partially-dirty extents. The code that
11027 + sets preceder hints and computes the preceder is basically untested. Careful testing
11028 + needs to be done that preceder calculations are done correctly, since if it doesn't
11029 + affect correctness we will not catch this stuff during regular testing. */
11030 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11031 + considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11032 + but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11033 + Many of the calls that may produce one of these return values (i.e.,
11034 + longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11035 + values themselves and, for instance, stop flushing instead of resulting in a restart.
11036 + If any of these results are true error conditions then flush will go into a busy-loop,
11037 + as we noticed during testing when a corrupt tree caused find_child_ptr to return
11038 + ENOENT. It needs careful thought and testing of corner conditions.
11039 +*/
11040 +/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11041 + block is assigned a block number then early-flushed to disk. It is dirtied again and
11042 + flush is called again. Concurrently, that block is deleted, and the de-allocation of
11043 + its block number does not need to be deferred, since it is not part of the preserve set
11044 + (i.e., it didn't exist before the transaction). I think there may be a race condition
11045 + where flush writes the dirty, created block after the non-deferred deallocated block
11046 + number is re-allocated, making it possible to write deleted data on top of non-deleted
11047 + data. Its just a theory, but it needs to be thought out. */
11048 +/* F. bio_alloc() failure is not handled gracefully. */
11049 +/* G. Unallocated children. */
11050 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11051 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11052 +
11053 +/* JNODE_FLUSH: MAIN ENTRY POINT */
11054 +/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11055 + neighborhood is named "slum"). Jnode_flush() is called if reiser4 has to write dirty
11056 + blocks to disk, it happens when Linux VM decides to reduce number of dirty pages or as
11057 + a part of transaction commit.
11058 +
11059 + Our objective here is to prep and flush the slum the jnode belongs to. We want to
11060 + squish the slum together, and allocate the nodes in it as we squish because allocation
11061 + of children affects squishing of parents.
11062 +
11063 + The "argument" @node tells flush where to start. From there, flush finds the left edge
11064 + of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11065 + "better place" to start squalloc first we perform a flush_scan.
11066 +
11067 + Flush-scanning may be performed in both left and right directions, but for different
11068 + purposes. When scanning to the left, we are searching for a node that precedes a
11069 + sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11070 + During flush-scanning, we also take the opportunity to count the number of consecutive
11071 + leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11072 + make a decision to reallocate leaf nodes (thus favoring write-optimization).
11073 +
11074 + Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11075 + also be dirty nodes to the right of the argument. If the scan-left operation does not
11076 + count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11077 + operation to see whether there is, in fact, enough nodes to meet the relocate
11078 + threshold. Each right- and left-scan operation uses a single flush_scan object.
11079 +
11080 + After left-scan and possibly right-scan, we prepare a flush_position object with the
11081 + starting flush point or parent coordinate, which was determined using scan-left.
11082 +
11083 + Next we call the main flush routine, squalloc, which iterates along the
11084 + leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11085 +
11086 + After squalloc returns we take extra steps to ensure that all the children
11087 + of the final twig node are allocated--this involves repeating squalloc
11088 + until we finish at a twig with no unallocated children.
11089 +
11090 + Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11091 + any above-twig nodes during flush_empty_queue that still have unallocated children, we
11092 + flush_unprep them.
11093 +
11094 + Flush treats several "failure" cases as non-failures, essentially causing them to start
11095 + over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11096 + probably be handled properly rather than restarting, but there are a bunch of cases to
11097 + audit.
11098 +*/
11099 +
11100 +static int
11101 +jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11102 + flush_queue_t * fq, int flags)
11103 +{
11104 + long ret = 0;
11105 + flush_scan *right_scan;
11106 + flush_scan *left_scan;
11107 + flush_pos_t *flush_pos;
11108 + int todo;
11109 + struct super_block *sb;
11110 + reiser4_super_info_data *sbinfo;
11111 + jnode *leftmost_in_slum = NULL;
11112 +
11113 + assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11114 + assert("nikita-3022", reiser4_schedulable());
11115 +
11116 + assert("nikita-3185",
11117 + get_current_super_private()->delete_mutex_owner != current);
11118 +
11119 + /* allocate right_scan, left_scan and flush_pos */
11120 + right_scan =
11121 + kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11122 + reiser4_ctx_gfp_mask_get());
11123 + if (right_scan == NULL)
11124 + return RETERR(-ENOMEM);
11125 + left_scan = right_scan + 1;
11126 + flush_pos = (flush_pos_t *) (left_scan + 1);
11127 +
11128 + sb = reiser4_get_current_sb();
11129 + sbinfo = get_super_private(sb);
11130 +
11131 + /* Flush-concurrency debug code */
11132 +#if REISER4_DEBUG
11133 + atomic_inc(&flush_cnt);
11134 +#endif
11135 +
11136 + reiser4_enter_flush(sb);
11137 +
11138 + /* Initialize a flush position. */
11139 + pos_init(flush_pos);
11140 +
11141 + flush_pos->nr_written = nr_written;
11142 + flush_pos->fq = fq;
11143 + flush_pos->flags = flags;
11144 + flush_pos->nr_to_write = nr_to_write;
11145 +
11146 + scan_init(right_scan);
11147 + scan_init(left_scan);
11148 +
11149 + /* First scan left and remember the leftmost scan position. If the leftmost
11150 + position is unformatted we remember its parent_coord. We scan until counting
11151 + FLUSH_SCAN_MAXNODES.
11152 +
11153 + If starting @node is unformatted, at the beginning of left scan its
11154 + parent (twig level node, containing extent item) will be long term
11155 + locked and lock handle will be stored in the
11156 + @right_scan->parent_lock. This lock is used to start the rightward
11157 + scan without redoing the tree traversal (necessary to find parent)
11158 + and, hence, is kept during leftward scan. As a result, we have to
11159 + use try-lock when taking long term locks during the leftward scan.
11160 + */
11161 + ret = scan_left(left_scan, right_scan,
11162 + node, sbinfo->flush.scan_maxnodes);
11163 + if (ret != 0)
11164 + goto failed;
11165 +
11166 + leftmost_in_slum = jref(left_scan->node);
11167 + scan_done(left_scan);
11168 +
11169 + /* Then possibly go right to decide if we will use a policy of relocating leaves.
11170 + This is only done if we did not scan past (and count) enough nodes during the
11171 + leftward scan. If we do scan right, we only care to go far enough to establish
11172 + that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The
11173 + scan limit is the difference between left_scan.count and the threshold. */
11174 +
11175 + todo = sbinfo->flush.relocate_threshold - left_scan->count;
11176 + /* scan right is inherently deadlock prone, because we are
11177 + * (potentially) holding a lock on the twig node at this moment.
11178 + * FIXME: this is incorrect comment: lock is not held */
11179 + if (todo > 0) {
11180 + ret = scan_right(right_scan, node, (unsigned)todo);
11181 + if (ret != 0)
11182 + goto failed;
11183 + }
11184 +
11185 + /* Only the right-scan count is needed, release any rightward locks right away. */
11186 + scan_done(right_scan);
11187 +
11188 + /* ... and the answer is: we should relocate leaf nodes if at least
11189 + FLUSH_RELOCATE_THRESHOLD nodes were found. */
11190 + flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11191 + (left_scan->count + right_scan->count >=
11192 + sbinfo->flush.relocate_threshold);
11193 +
11194 + /* Funny business here. We set the 'point' in the flush_position at prior to
11195 + starting squalloc regardless of whether the first point is
11196 + formatted or unformatted. Without this there would be an invariant, in the
11197 + rest of the code, that if the flush_position is unformatted then
11198 + flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11199 + and if the flush_position is formatted then flush_position->point is non-NULL
11200 + and no parent info is set.
11201 +
11202 + This seems lazy, but it makes the initial calls to reverse_relocate_test
11203 + (which ask "is it the pos->point the leftmost child of its parent") much easier
11204 + because we know the first child already. Nothing is broken by this, but the
11205 + reasoning is subtle. Holding an extra reference on a jnode during flush can
11206 + cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11207 + removed from sibling lists until they have zero reference count. Flush would
11208 + never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
11209 + deleted to the right. So if nothing is broken, why fix it?
11210 +
11211 + NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11212 + point and in any moment, because of the concurrent file system
11213 + activity (for example, truncate). */
11214 +
11215 + /* Check jnode state after flush_scan completed. Having a lock on this
11216 + node or its parent (in case of unformatted) helps us in case of
11217 + concurrent flushing. */
11218 + if (jnode_check_flushprepped(leftmost_in_slum)
11219 + && !jnode_convertible(leftmost_in_slum)) {
11220 + ret = 0;
11221 + goto failed;
11222 + }
11223 +
11224 + /* Now setup flush_pos using scan_left's endpoint. */
11225 + ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11226 + if (ret)
11227 + goto failed;
11228 +
11229 + if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11230 + && node_is_empty(flush_pos->coord.node)) {
11231 + znode *empty = flush_pos->coord.node;
11232 +
11233 + assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11234 + ret = delete_empty_node(empty);
11235 + goto failed;
11236 + }
11237 +
11238 + if (jnode_check_flushprepped(leftmost_in_slum)
11239 + && !jnode_convertible(leftmost_in_slum)) {
11240 + ret = 0;
11241 + goto failed;
11242 + }
11243 +
11244 + /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */
11245 + ret = alloc_pos_and_ancestors(flush_pos);
11246 + if (ret)
11247 + goto failed;
11248 +
11249 + /* Do the main rightward-bottom-up squeeze and allocate loop. */
11250 + ret = squalloc(flush_pos);
11251 + pos_stop(flush_pos);
11252 + if (ret)
11253 + goto failed;
11254 +
11255 + /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11256 + First, the pos_stop() and pos_valid() routines should be modified
11257 + so that pos_stop() sets a flush_position->stop flag to 1 without
11258 + releasing the current position immediately--instead release it in
11259 + pos_done(). This is a better implementation than the current one anyway.
11260 +
11261 + It is not clear that all fields of the flush_position should not be released,
11262 + but at the very least the parent_lock, parent_coord, and parent_load should
11263 + remain held because they are hold the last twig when pos_stop() is
11264 + called.
11265 +
11266 + When we reach this point in the code, if the parent_coord is set to after the
11267 + last item then we know that flush reached the end of a twig (and according to
11268 + the new flush queueing design, we will return now). If parent_coord is not
11269 + past the last item, we should check if the current twig has any unallocated
11270 + children to the right (we are not concerned with unallocated children to the
11271 + left--in that case the twig itself should not have been allocated). If the
11272 + twig has unallocated children to the right, set the parent_coord to that
11273 + position and then repeat the call to squalloc.
11274 +
11275 + Testing for unallocated children may be defined in two ways: if any internal
11276 + item has a fake block number, it is unallocated; if any extent item is
11277 + unallocated then all of its children are unallocated. But there is a more
11278 + aggressive approach: if there are any dirty children of the twig to the right
11279 + of the current position, we may wish to relocate those nodes now. Checking for
11280 + potential relocation is more expensive as it requires knowing whether there are
11281 + any dirty children that are not unallocated. The extent_needs_allocation
11282 + should be used after setting the correct preceder.
11283 +
11284 + When we reach the end of a twig at this point in the code, if the flush can
11285 + continue (when the queue is ready) it will need some information on the future
11286 + starting point. That should be stored away in the flush_handle using a seal, I
11287 + believe. Holding a jref() on the future starting point may break other code
11288 + that deletes that node.
11289 + */
11290 +
11291 + /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11292 + above the twig level. If the VM calls flush above the twig level, do nothing
11293 + and return (but figure out why this happens). The txnmgr should be modified to
11294 + only flush its leaf-level dirty list. This will do all the necessary squeeze
11295 + and allocate steps but leave unallocated branches and possibly unallocated
11296 + twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11297 + level, the remaining unallocated nodes should be given write-optimized
11298 + locations. (Possibly, the remaining unallocated twigs should be allocated just
11299 + before their leftmost child.)
11300 + */
11301 +
11302 + /* Any failure reaches this point. */
11303 + failed:
11304 +
11305 + switch (ret) {
11306 + case -E_REPEAT:
11307 + case -EINVAL:
11308 + case -E_DEADLOCK:
11309 + case -E_NO_NEIGHBOR:
11310 + case -ENOENT:
11311 + /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11312 + in each case. They already are handled in many cases. */
11313 + /* Something bad happened, but difficult to avoid... Try again! */
11314 + ret = 0;
11315 + }
11316 +
11317 + if (leftmost_in_slum)
11318 + jput(leftmost_in_slum);
11319 +
11320 + pos_done(flush_pos);
11321 + scan_done(left_scan);
11322 + scan_done(right_scan);
11323 + kfree(right_scan);
11324 +
11325 + ON_DEBUG(atomic_dec(&flush_cnt));
11326 +
11327 + reiser4_leave_flush(sb);
11328 +
11329 + return ret;
11330 +}
11331 +
11332 +/* The reiser4 flush subsystem can be turned into "rapid flush mode" means that
11333 + * flusher should submit all prepped nodes immediately without keeping them in
11334 + * flush queues for long time. The reason for rapid flush mode is to free
11335 + * memory as fast as possible. */
11336 +
11337 +#if REISER4_USE_RAPID_FLUSH
11338 +
11339 +/**
11340 + * submit all prepped nodes if rapid flush mode is set,
11341 + * turn rapid flush mode off.
11342 + */
11343 +
11344 +static int rapid_flush(flush_pos_t * pos)
11345 +{
11346 + if (!wbq_available())
11347 + return 0;
11348 +
11349 + return write_prepped_nodes(pos);
11350 +}
11351 +
11352 +#else
11353 +
11354 +#define rapid_flush(pos) (0)
11355 +
11356 +#endif /* REISER4_USE_RAPID_FLUSH */
11357 +
11358 +static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11359 + flush_queue_t *fq, int *nr_queued,
11360 + int flags)
11361 +{
11362 + jnode * node;
11363 +
11364 + if (start != NULL) {
11365 + spin_lock_jnode(start);
11366 + if (!jnode_is_flushprepped(start)) {
11367 + assert("zam-1056", start->atom == atom);
11368 + node = start;
11369 + goto enter;
11370 + }
11371 + spin_unlock_jnode(start);
11372 + }
11373 + /*
11374 + * In this loop we process all already prepped (RELOC or OVRWR) and dirtied again
11375 + * nodes. The atom spin lock is not released until all dirty nodes processed or
11376 + * not prepped node found in the atom dirty lists.
11377 + */
11378 + while ((node = find_first_dirty_jnode(atom, flags))) {
11379 + spin_lock_jnode(node);
11380 + enter:
11381 + assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11382 + assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11383 +
11384 + if (JF_ISSET(node, JNODE_WRITEBACK)) {
11385 + /* move node to the end of atom's writeback list */
11386 + list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11387 +
11388 + /*
11389 + * jnode is not necessarily on dirty list: if it was dirtied when
11390 + * it was on flush queue - it does not get moved to dirty list
11391 + */
11392 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11393 + WB_LIST, 1));
11394 +
11395 + } else if (jnode_is_znode(node)
11396 + && znode_above_root(JZNODE(node))) {
11397 + /*
11398 + * A special case for znode-above-root. The above-root (fake)
11399 + * znode is captured and dirtied when the tree height changes or
11400 + * when the root node is relocated. This causes atoms to fuse so
11401 + * that changes at the root are serialized. However, this node is
11402 + * never flushed. This special case used to be in lock.c to
11403 + * prevent the above-root node from ever being captured, but now
11404 + * that it is captured we simply prevent it from flushing. The
11405 + * log-writer code relies on this to properly log superblock
11406 + * modifications of the tree height.
11407 + */
11408 + jnode_make_wander_nolock(node);
11409 + } else if (JF_ISSET(node, JNODE_RELOC)) {
11410 + queue_jnode(fq, node);
11411 + ++(*nr_queued);
11412 + } else
11413 + break;
11414 +
11415 + spin_unlock_jnode(node);
11416 + }
11417 + return node;
11418 +}
11419 +
11420 +/* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are more nodes
11421 + * to flush, return 0 if atom's dirty lists empty and keep current atom locked, return
11422 + * other errors as they are. */
11423 +int
11424 +flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11425 + txn_atom ** atom, jnode *start)
11426 +{
11427 + reiser4_super_info_data *sinfo = get_current_super_private();
11428 + flush_queue_t *fq = NULL;
11429 + jnode *node;
11430 + int nr_queued;
11431 + int ret;
11432 +
11433 + assert("zam-889", atom != NULL && *atom != NULL);
11434 + assert_spin_locked(&((*atom)->alock));
11435 + assert("zam-892", get_current_context()->trans->atom == *atom);
11436 +
11437 + nr_to_write = LONG_MAX;
11438 + while (1) {
11439 + ret = reiser4_fq_by_atom(*atom, &fq);
11440 + if (ret != -E_REPEAT)
11441 + break;
11442 + *atom = get_current_atom_locked();
11443 + }
11444 + if (ret)
11445 + return ret;
11446 +
11447 + assert_spin_locked(&((*atom)->alock));
11448 +
11449 + /* parallel flushers limit */
11450 + if (sinfo->tmgr.atom_max_flushers != 0) {
11451 + while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11452 + /* An reiser4_atom_send_event() call is inside
11453 + reiser4_fq_put_nolock() which is called when flush is
11454 + finished and nr_flushers is decremented. */
11455 + reiser4_atom_wait_event(*atom);
11456 + *atom = get_current_atom_locked();
11457 + }
11458 + }
11459 +
11460 + /* count ourself as a flusher */
11461 + (*atom)->nr_flushers++;
11462 +
11463 + writeout_mode_enable();
11464 +
11465 + nr_queued = 0;
11466 + node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11467 +
11468 + if (node == NULL) {
11469 + if (nr_queued == 0) {
11470 + (*atom)->nr_flushers--;
11471 + reiser4_fq_put_nolock(fq);
11472 + reiser4_atom_send_event(*atom);
11473 + /* current atom remains locked */
11474 + writeout_mode_disable();
11475 + return 0;
11476 + }
11477 + spin_unlock_atom(*atom);
11478 + } else {
11479 + jref(node);
11480 + BUG_ON((*atom)->super != node->tree->super);
11481 + spin_unlock_atom(*atom);
11482 + spin_unlock_jnode(node);
11483 + BUG_ON(nr_to_write == 0);
11484 + ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11485 + jput(node);
11486 + }
11487 +
11488 + ret =
11489 + reiser4_write_fq(fq, nr_submitted,
11490 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11491 +
11492 + *atom = get_current_atom_locked();
11493 + (*atom)->nr_flushers--;
11494 + reiser4_fq_put_nolock(fq);
11495 + reiser4_atom_send_event(*atom);
11496 + spin_unlock_atom(*atom);
11497 +
11498 + writeout_mode_disable();
11499 +
11500 + if (ret == 0)
11501 + ret = -E_REPEAT;
11502 +
11503 + return ret;
11504 +}
11505 +
11506 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11507 +
11508 +/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
11509 + reverse parent-first relocate context. Here all we know is the preceder and the block
11510 + number. Since we are going in reverse, the preceder may still be relocated as well, so
11511 + we can't ask the block allocator "is there a closer block available to relocate?" here.
11512 + In the _forward_ parent-first relocate context (not here) we actually call the block
11513 + allocator to try and find a closer location. */
11514 +static int
11515 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11516 + const reiser4_block_nr * nblk)
11517 +{
11518 + reiser4_block_nr dist;
11519 +
11520 + assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11521 + assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
11522 + assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
11523 +
11524 + /* Distance is the absolute value. */
11525 + dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11526 +
11527 + /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
11528 + block, do not relocate. */
11529 + if (dist <= get_current_super_private()->flush.relocate_distance) {
11530 + return 0;
11531 + }
11532 +
11533 + return 1;
11534 +}
11535 +
11536 +/* This function is a predicate that tests for relocation. Always called in the
11537 + reverse-parent-first context, when we are asking whether the current node should be
11538 + relocated in order to expand the flush by dirtying the parent level (and thus
11539 + proceeding to flush that level). When traversing in the forward parent-first direction
11540 + (not here), relocation decisions are handled in two places: allocate_znode() and
11541 + extent_needs_allocation(). */
11542 +static int
11543 +reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11544 + flush_pos_t * pos)
11545 +{
11546 + reiser4_block_nr pblk = 0;
11547 + reiser4_block_nr nblk = 0;
11548 +
11549 + assert("jmacd-8989", !jnode_is_root(node));
11550 +
11551 + /*
11552 + * This function is called only from the
11553 + * reverse_relocate_check_dirty_parent() and only if the parent
11554 + * node is clean. This implies that the parent has the real (i.e., not
11555 + * fake) block number, and, so does the child, because otherwise the
11556 + * parent would be dirty.
11557 + */
11558 +
11559 + /* New nodes are treated as if they are being relocated. */
11560 + if (JF_ISSET (node, JNODE_CREATED) ||
11561 + (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
11562 + return 1;
11563 + }
11564 +
11565 + /* Find the preceder. FIXME(B): When the child is an unformatted, previously
11566 + existing node, the coord may be leftmost even though the child is not the
11567 + parent-first preceder of the parent. If the first dirty node appears somewhere
11568 + in the middle of the first extent unit, this preceder calculation is wrong.
11569 + Needs more logic in here. */
11570 + if (coord_is_leftmost_unit(parent_coord)) {
11571 + pblk = *znode_get_block(parent_coord->node);
11572 + } else {
11573 + pblk = pos->preceder.blk;
11574 + }
11575 + check_preceder(pblk);
11576 +
11577 + /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
11578 + if (pblk == 0) {
11579 + return 1;
11580 + }
11581 +
11582 + nblk = *jnode_get_block(node);
11583 +
11584 + if (reiser4_blocknr_is_fake(&nblk))
11585 + /* child is unallocated, mark parent dirty */
11586 + return 1;
11587 +
11588 + return reverse_relocate_if_close_enough(&pblk, &nblk);
11589 +}
11590 +
11591 +/* This function calls reverse_relocate_test to make a reverse-parent-first
11592 + relocation decision and then, if yes, it marks the parent dirty. */
11593 +static int
11594 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
11595 + flush_pos_t * pos)
11596 +{
11597 + int ret;
11598 +
11599 + if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11600 +
11601 + ret = reverse_relocate_test(node, parent_coord, pos);
11602 + if (ret < 0) {
11603 + return ret;
11604 + }
11605 +
11606 + /* FIXME-ZAM
11607 + if parent is already relocated - we do not want to grab space, right? */
11608 + if (ret == 1) {
11609 + int grabbed;
11610 +
11611 + grabbed = get_current_context()->grabbed_blocks;
11612 + if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11613 + 0)
11614 + reiser4_panic("umka-1250",
11615 + "No space left during flush.");
11616 +
11617 + assert("jmacd-18923",
11618 + znode_is_write_locked(parent_coord->node));
11619 + znode_make_dirty(parent_coord->node);
11620 + grabbed2free_mark(grabbed);
11621 + }
11622 + }
11623 +
11624 + return 0;
11625 +}
11626 +
11627 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
11628 + PARENT-FIRST LOOP BEGINS) */
11629 +
11630 +/* Get the leftmost child for given coord. */
11631 +static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
11632 +{
11633 + int ret;
11634 +
11635 + ret = item_utmost_child(coord, LEFT_SIDE, child);
11636 +
11637 + if (ret)
11638 + return ret;
11639 +
11640 + if (IS_ERR(*child))
11641 + return PTR_ERR(*child);
11642 +
11643 + return 0;
11644 +}
11645 +
11646 +/* This step occurs after the left- and right-scans are completed, before starting the
11647 + forward parent-first traversal. Here we attempt to allocate ancestors of the starting
11648 + flush point, which means continuing in the reverse parent-first direction to the
11649 + parent, grandparent, and so on (as long as the child is a leftmost child). This
11650 + routine calls a recursive process, alloc_one_ancestor, which does the real work,
11651 + except there is special-case handling here for the first ancestor, which may be a twig.
11652 + At each level (here and alloc_one_ancestor), we check for relocation and then, if
11653 + the child is a leftmost child, repeat at the next level. On the way back down (the
11654 + recursion), we allocate the ancestors in parent-first order. */
11655 +static int alloc_pos_and_ancestors(flush_pos_t * pos)
11656 +{
11657 + int ret = 0;
11658 + lock_handle plock;
11659 + load_count pload;
11660 + coord_t pcoord;
11661 +
11662 + if (znode_check_flushprepped(pos->lock.node))
11663 + return 0;
11664 +
11665 + coord_init_invalid(&pcoord, NULL);
11666 + init_lh(&plock);
11667 + init_load_count(&pload);
11668 +
11669 + if (pos->state == POS_ON_EPOINT) {
11670 + /* a special case for pos on twig level, where we already have
11671 + a lock on parent node. */
11672 + /* The parent may not be dirty, in which case we should decide
11673 + whether to relocate the child now. If decision is made to
11674 + relocate the child, the parent is marked dirty. */
11675 + ret =
11676 + reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11677 + pos);
11678 + if (ret)
11679 + goto exit;
11680 +
11681 + /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11682 + is leftmost) and the leaf/child, so recursion is not needed.
11683 + Levels above the twig will be allocated for
11684 + write-optimization before the transaction commits. */
11685 +
11686 + /* Do the recursive step, allocating zero or more of our
11687 + * ancestors. */
11688 + ret = alloc_one_ancestor(&pos->coord, pos);
11689 +
11690 + } else {
11691 + if (!znode_is_root(pos->lock.node)) {
11692 + /* all formatted nodes except tree root */
11693 + ret =
11694 + reiser4_get_parent(&plock, pos->lock.node,
11695 + ZNODE_WRITE_LOCK);
11696 + if (ret)
11697 + goto exit;
11698 +
11699 + ret = incr_load_count_znode(&pload, plock.node);
11700 + if (ret)
11701 + goto exit;
11702 +
11703 + ret =
11704 + find_child_ptr(plock.node, pos->lock.node, &pcoord);
11705 + if (ret)
11706 + goto exit;
11707 +
11708 + ret =
11709 + reverse_relocate_check_dirty_parent(ZJNODE
11710 + (pos->lock.
11711 + node), &pcoord,
11712 + pos);
11713 + if (ret)
11714 + goto exit;
11715 +
11716 + ret = alloc_one_ancestor(&pcoord, pos);
11717 + if (ret)
11718 + goto exit;
11719 + }
11720 +
11721 + ret = allocate_znode(pos->lock.node, &pcoord, pos);
11722 + }
11723 + exit:
11724 + done_load_count(&pload);
11725 + done_lh(&plock);
11726 + return ret;
11727 +}
11728 +
11729 +/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
11730 + call to set_preceder, which is the next function described, this checks if the
11731 + child is a leftmost child and returns if it is not. If the child is a leftmost child
11732 + it checks for relocation, possibly dirtying the parent. Then it performs the recursive
11733 + step. */
11734 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
11735 +{
11736 + int ret = 0;
11737 + lock_handle alock;
11738 + load_count aload;
11739 + coord_t acoord;
11740 +
11741 + /* As we ascend at the left-edge of the region to flush, take this opportunity at
11742 + the twig level to find our parent-first preceder unless we have already set
11743 + it. */
11744 + if (pos->preceder.blk == 0) {
11745 + ret = set_preceder(coord, pos);
11746 + if (ret != 0)
11747 + return ret;
11748 + }
11749 +
11750 + /* If the ancestor is clean or already allocated, or if the child is not a
11751 + leftmost child, stop going up, even leaving coord->node not flushprepped. */
11752 + if (znode_check_flushprepped(coord->node)
11753 + || !coord_is_leftmost_unit(coord))
11754 + return 0;
11755 +
11756 + init_lh(&alock);
11757 + init_load_count(&aload);
11758 + coord_init_invalid(&acoord, NULL);
11759 +
11760 + /* Only ascend to the next level if it is a leftmost child, but write-lock the
11761 + parent in case we will relocate the child. */
11762 + if (!znode_is_root(coord->node)) {
11763 +
11764 + ret =
11765 + jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
11766 + &alock, &aload, ZNODE_WRITE_LOCK,
11767 + 0);
11768 + if (ret != 0) {
11769 + /* FIXME(C): check EINVAL, E_DEADLOCK */
11770 + goto exit;
11771 + }
11772 +
11773 + ret =
11774 + reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
11775 + &acoord, pos);
11776 + if (ret != 0) {
11777 + goto exit;
11778 + }
11779 +
11780 + /* Recursive call. */
11781 + if (!znode_check_flushprepped(acoord.node)) {
11782 + ret = alloc_one_ancestor(&acoord, pos);
11783 + if (ret)
11784 + goto exit;
11785 + }
11786 + }
11787 +
11788 + /* Note: we call allocate with the parent write-locked (except at the root) in
11789 + case we relocate the child, in which case it will modify the parent during this
11790 + call. */
11791 + ret = allocate_znode(coord->node, &acoord, pos);
11792 +
11793 + exit:
11794 + done_load_count(&aload);
11795 + done_lh(&alock);
11796 + return ret;
11797 +}
11798 +
11799 +/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
11800 + a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
11801 + should this node be relocated (in reverse parent-first context)? We repeat this
11802 + process as long as the child is the leftmost child, eventually reaching an ancestor of
11803 + the flush point that is not a leftmost child. The preceder of that ancestors, which is
11804 + not a leftmost child, is actually on the leaf level. The preceder of that block is the
11805 + left-neighbor of the flush point. The preceder of that block is the rightmost child of
11806 + the twig on the left. So, when alloc_pos_and_ancestors passes upward through the twig
11807 + level, it stops momentarily to remember the block of the rightmost child of the twig on
11808 + the left and sets it to the flush_position's preceder_hint.
11809 +
11810 + There is one other place where we may set the flush_position's preceder hint, which is
11811 + during scan-left.
11812 +*/
11813 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
11814 +{
11815 + int ret;
11816 + coord_t coord;
11817 + lock_handle left_lock;
11818 + load_count left_load;
11819 +
11820 + coord_dup(&coord, coord_in);
11821 +
11822 + init_lh(&left_lock);
11823 + init_load_count(&left_load);
11824 +
11825 + /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
11826 + coord_is_leftmost_unit is not the right test if the unformatted child is in the
11827 + middle of the first extent unit. */
11828 + if (!coord_is_leftmost_unit(&coord)) {
11829 + coord_prev_unit(&coord);
11830 + } else {
11831 + ret =
11832 + reiser4_get_left_neighbor(&left_lock, coord.node,
11833 + ZNODE_READ_LOCK, GN_SAME_ATOM);
11834 + if (ret) {
11835 + /* If we fail for any reason it doesn't matter because the
11836 + preceder is only a hint. We are low-priority at this point, so
11837 + this must be the case. */
11838 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
11839 + ret == -ENOENT || ret == -EINVAL
11840 + || ret == -E_DEADLOCK) {
11841 + ret = 0;
11842 + }
11843 + goto exit;
11844 + }
11845 +
11846 + ret = incr_load_count_znode(&left_load, left_lock.node);
11847 + if (ret)
11848 + goto exit;
11849 +
11850 + coord_init_last_unit(&coord, left_lock.node);
11851 + }
11852 +
11853 + ret =
11854 + item_utmost_child_real_block(&coord, RIGHT_SIDE,
11855 + &pos->preceder.blk);
11856 + exit:
11857 + check_preceder(pos->preceder.blk);
11858 + done_load_count(&left_load);
11859 + done_lh(&left_lock);
11860 + return ret;
11861 +}
11862 +
11863 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
11864 +
11865 +/* This procedure implements the outer loop of the flush algorithm. To put this in
11866 + context, here is the general list of steps taken by the flush routine as a whole:
11867 +
11868 + 1. Scan-left
11869 + 2. Scan-right (maybe)
11870 + 3. Allocate initial flush position and its ancestors
11871 + 4. <handle extents>
11872 + 5. <squeeze and next position and its ancestors to-the-right,
11873 + then update position to-the-right>
11874 + 6. <repeat from #4 until flush is stopped>
11875 +
11876 + This procedure implements the loop in steps 4 through 6 in the above listing.
11877 +
11878 + Step 4: if the current flush position is an extent item (position on the twig level),
11879 + it allocates the extent (allocate_extent_item_in_place) then shifts to the next
11880 + coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
11881 + If the next coordinate is an internal item, we descend back to the leaf level,
11882 + otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate"
11883 + brings us past the end of the twig level, then we call
11884 + reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
11885 + step #5 which moves to the right.
11886 +
11887 + Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
11888 + tree to allocate any ancestors of the next-right flush position that are not also
11889 + ancestors of the current position. Those ancestors (in top-down order) are the next in
11890 + parent-first order. We squeeze adjacent nodes on the way up until the right node and
11891 + current node share the same parent, then allocate on the way back down. Finally, this
11892 + step sets the flush position to the next-right node. Then repeat steps 4 and 5.
11893 +*/
11894 +
11895 +/* SQUEEZE CODE */
11896 +
11897 +/* squalloc_right_twig helper function, cut a range of extent items from
11898 + cut node to->node from the beginning up to coord @to. */
11899 +static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
11900 + znode * left)
11901 +{
11902 + coord_t from;
11903 + reiser4_key from_key;
11904 +
11905 + coord_init_first_unit(&from, to->node);
11906 + item_key_by_coord(&from, &from_key);
11907 +
11908 + return cut_node_content(&from, to, &from_key, to_key, NULL);
11909 +}
11910 +
11911 +/* Copy as much of the leading extents from @right to @left, allocating
11912 + unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
11913 + SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
11914 + internal item it calls shift_one_internal_unit and may then return
11915 + SUBTREE_MOVED. */
11916 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
11917 +{
11918 + int ret = SUBTREE_MOVED;
11919 + coord_t coord; /* used to iterate over items */
11920 + reiser4_key stop_key;
11921 +
11922 + assert("jmacd-2008", !node_is_empty(right));
11923 + coord_init_first_unit(&coord, right);
11924 +
11925 + /* FIXME: can be optimized to cut once */
11926 + while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
11927 + ON_DEBUG(void *vp);
11928 +
11929 + assert("vs-1468", coord_is_leftmost_unit(&coord));
11930 + ON_DEBUG(vp = shift_check_prepare(left, coord.node));
11931 +
11932 + /* stop_key is used to find what was copied and what to cut */
11933 + stop_key = *reiser4_min_key();
11934 + ret = squalloc_extent(left, &coord, pos, &stop_key);
11935 + if (ret != SQUEEZE_CONTINUE) {
11936 + ON_DEBUG(kfree(vp));
11937 + break;
11938 + }
11939 + assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
11940 +
11941 + /* Helper function to do the cutting. */
11942 + set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
11943 + check_me("vs-1466",
11944 + squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
11945 +
11946 + ON_DEBUG(shift_check(vp, left, coord.node));
11947 + }
11948 +
11949 + if (node_is_empty(coord.node))
11950 + ret = SQUEEZE_SOURCE_EMPTY;
11951 +
11952 + if (ret == SQUEEZE_TARGET_FULL) {
11953 + goto out;
11954 + }
11955 +
11956 + if (node_is_empty(right)) {
11957 + /* The whole right node was copied into @left. */
11958 + assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
11959 + goto out;
11960 + }
11961 +
11962 + coord_init_first_unit(&coord, right);
11963 +
11964 + if (!item_is_internal(&coord)) {
11965 + /* we do not want to squeeze anything else to left neighbor because "slum"
11966 + is over */
11967 + ret = SQUEEZE_TARGET_FULL;
11968 + goto out;
11969 + }
11970 + assert("jmacd-433", item_is_internal(&coord));
11971 +
11972 + /* Shift an internal unit. The child must be allocated before shifting any more
11973 + extents, so we stop here. */
11974 + ret = shift_one_internal_unit(left, right);
11975 +
11976 + out:
11977 + assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
11978 + || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
11979 +
11980 + if (ret == SQUEEZE_TARGET_FULL) {
11981 + /* We submit prepped nodes here and expect that this @left twig
11982 + * will not be modified again during this jnode_flush() call. */
11983 + int ret1;
11984 +
11985 + /* NOTE: seems like io is done under long term locks. */
11986 + ret1 = write_prepped_nodes(pos);
11987 + if (ret1 < 0)
11988 + return ret1;
11989 + }
11990 +
11991 + return ret;
11992 +}
11993 +
11994 +#if REISER4_DEBUG
11995 +static void item_convert_invariant(flush_pos_t * pos)
11996 +{
11997 + assert("edward-1225", coord_is_existing_item(&pos->coord));
11998 + if (chaining_data_present(pos)) {
11999 + item_plugin *iplug = item_convert_plug(pos);
12000 +
12001 + assert("edward-1000",
12002 + iplug == item_plugin_by_coord(&pos->coord));
12003 + assert("edward-1001", iplug->f.convert != NULL);
12004 + } else
12005 + assert("edward-1226", pos->child == NULL);
12006 +}
12007 +#else
12008 +
12009 +#define item_convert_invariant(pos) noop
12010 +
12011 +#endif
12012 +
12013 +/* Scan node items starting from the first one and apply for each
12014 + item its flush ->convert() method (if any). This method may
12015 + resize/kill the item so the tree will be changed.
12016 +*/
12017 +static int convert_node(flush_pos_t * pos, znode * node)
12018 +{
12019 + int ret = 0;
12020 + item_plugin *iplug;
12021 +
12022 + assert("edward-304", pos != NULL);
12023 + assert("edward-305", pos->child == NULL);
12024 + assert("edward-475", znode_convertible(node));
12025 + assert("edward-669", znode_is_wlocked(node));
12026 + assert("edward-1210", !node_is_empty(node));
12027 +
12028 + if (znode_get_level(node) != LEAF_LEVEL)
12029 + /* unsupported */
12030 + goto exit;
12031 +
12032 + coord_init_first_unit(&pos->coord, node);
12033 +
12034 + while (1) {
12035 + ret = 0;
12036 + coord_set_to_left(&pos->coord);
12037 + item_convert_invariant(pos);
12038 +
12039 + iplug = item_plugin_by_coord(&pos->coord);
12040 + assert("edward-844", iplug != NULL);
12041 +
12042 + if (iplug->f.convert) {
12043 + ret = iplug->f.convert(pos);
12044 + if (ret)
12045 + goto exit;
12046 + }
12047 + assert("edward-307", pos->child == NULL);
12048 +
12049 + if (coord_next_item(&pos->coord)) {
12050 + /* node is over */
12051 +
12052 + if (!chaining_data_present(pos))
12053 + /* finished this node */
12054 + break;
12055 + if (should_chain_next_node(pos)) {
12056 + /* go to next node */
12057 + move_chaining_data(pos, 0 /* to next node */ );
12058 + break;
12059 + }
12060 + /* repeat this node */
12061 + move_chaining_data(pos, 1 /* this node */ );
12062 + continue;
12063 + }
12064 + /* Node is not over.
12065 + Check if there is attached convert data.
12066 + If so roll one item position back and repeat
12067 + on this node
12068 + */
12069 + if (chaining_data_present(pos)) {
12070 +
12071 + if (iplug != item_plugin_by_coord(&pos->coord))
12072 + set_item_convert_count(pos, 0);
12073 +
12074 + ret = coord_prev_item(&pos->coord);
12075 + assert("edward-1003", !ret);
12076 +
12077 + move_chaining_data(pos, 1 /* this node */ );
12078 + }
12079 + }
12080 + JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12081 + znode_make_dirty(node);
12082 + exit:
12083 + assert("edward-1004", !ret);
12084 + return ret;
12085 +}
12086 +
12087 +/* Squeeze and allocate the right neighbor. This is called after @left and
12088 + its current children have been squeezed and allocated already. This
12089 + procedure's job is to squeeze and items from @right to @left.
12090 +
12091 + If at the leaf level, use the shift_everything_left memcpy-optimized
12092 + version of shifting (squeeze_right_leaf).
12093 +
12094 + If at the twig level, extents are allocated as they are shifted from @right
12095 + to @left (squalloc_right_twig).
12096 +
12097 + At any other level, shift one internal item and return to the caller
12098 + (squalloc_parent_first) so that the shifted-subtree can be processed in
12099 + parent-first order.
12100 +
12101 + When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
12102 + returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12103 + returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12104 + is returned.
12105 +*/
12106 +
12107 +static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12108 + znode * right)
12109 +{
12110 + int ret;
12111 +
12112 + /* FIXME it is possible to see empty hasn't-heard-banshee node in a
12113 + * tree owing to error (for example, ENOSPC) in write */
12114 + /* assert("jmacd-9321", !node_is_empty(left)); */
12115 + assert("jmacd-9322", !node_is_empty(right));
12116 + assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12117 +
12118 + switch (znode_get_level(left)) {
12119 + case TWIG_LEVEL:
12120 + /* Shift with extent allocating until either an internal item
12121 + is encountered or everything is shifted or no free space
12122 + left in @left */
12123 + ret = squeeze_right_twig(left, right, pos);
12124 + break;
12125 +
12126 + default:
12127 + /* All other levels can use shift_everything until we implement per-item
12128 + flush plugins. */
12129 + ret = squeeze_right_non_twig(left, right);
12130 + break;
12131 + }
12132 +
12133 + assert("jmacd-2011", (ret < 0 ||
12134 + ret == SQUEEZE_SOURCE_EMPTY
12135 + || ret == SQUEEZE_TARGET_FULL
12136 + || ret == SUBTREE_MOVED));
12137 + return ret;
12138 +}
12139 +
12140 +static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12141 + znode * right)
12142 +{
12143 + int ret;
12144 +
12145 + ret = squeeze_right_twig(pos->lock.node, right, pos);
12146 + if (ret < 0)
12147 + return ret;
12148 + if (ret > 0) {
12149 + coord_init_after_last_item(&pos->coord, pos->lock.node);
12150 + return ret;
12151 + }
12152 +
12153 + coord_init_last_unit(&pos->coord, pos->lock.node);
12154 + return 0;
12155 +}
12156 +
12157 +/* forward declaration */
12158 +static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12159 +
12160 +/* do a fast check for "same parents" condition before calling
12161 + * squalloc_upper_levels() */
12162 +static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12163 + znode * left,
12164 + znode * right)
12165 +{
12166 + if (znode_same_parents(left, right))
12167 + return 0;
12168 +
12169 + return squalloc_upper_levels(pos, left, right);
12170 +}
12171 +
12172 +/* Check whether the parent of given @right node needs to be processes
12173 + ((re)allocated) prior to processing of the child. If @left and @right do not
12174 + share at least the parent of the @right is after the @left but before the
12175 + @right in parent-first order, we have to (re)allocate it before the @right
12176 + gets (re)allocated. */
12177 +static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12178 +{
12179 + int ret;
12180 +
12181 + lock_handle left_parent_lock;
12182 + lock_handle right_parent_lock;
12183 +
12184 + load_count left_parent_load;
12185 + load_count right_parent_load;
12186 +
12187 + init_lh(&left_parent_lock);
12188 + init_lh(&right_parent_lock);
12189 +
12190 + init_load_count(&left_parent_load);
12191 + init_load_count(&right_parent_load);
12192 +
12193 + ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12194 + if (ret)
12195 + goto out;
12196 +
12197 + ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12198 + if (ret)
12199 + goto out;
12200 +
12201 + /* Check for same parents */
12202 + if (left_parent_lock.node == right_parent_lock.node)
12203 + goto out;
12204 +
12205 + if (znode_check_flushprepped(right_parent_lock.node)) {
12206 + /* Keep parent-first order. In the order, the right parent node stands
12207 + before the @right node. If it is already allocated, we set the
12208 + preceder (next block search start point) to its block number, @right
12209 + node should be allocated after it.
12210 +
12211 + However, preceder is set only if the right parent is on twig level.
12212 + The explanation is the following: new branch nodes are allocated over
12213 + already allocated children while the tree grows, it is difficult to
12214 + keep tree ordered, we assume that only leaves and twings are correctly
12215 + allocated. So, only twigs are used as a preceder for allocating of the
12216 + rest of the slum. */
12217 + if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12218 + pos->preceder.blk =
12219 + *znode_get_block(right_parent_lock.node);
12220 + check_preceder(pos->preceder.blk);
12221 + }
12222 + goto out;
12223 + }
12224 +
12225 + ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12226 + if (ret)
12227 + goto out;
12228 +
12229 + ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12230 + if (ret)
12231 + goto out;
12232 +
12233 + ret =
12234 + squeeze_right_neighbor(pos, left_parent_lock.node,
12235 + right_parent_lock.node);
12236 + /* We stop if error. We stop if some items/units were shifted (ret == 0)
12237 + * and thus @right changed its parent. It means we have not process
12238 + * right_parent node prior to processing of @right. Positive return
12239 + * values say that shifting items was not happen because of "empty
12240 + * source" or "target full" conditions. */
12241 + if (ret <= 0)
12242 + goto out;
12243 +
12244 + /* parent(@left) and parent(@right) may have different parents also. We
12245 + * do a recursive call for checking that. */
12246 + ret =
12247 + check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12248 + right_parent_lock.node);
12249 + if (ret)
12250 + goto out;
12251 +
12252 + /* allocate znode when going down */
12253 + ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12254 +
12255 + out:
12256 + done_load_count(&left_parent_load);
12257 + done_load_count(&right_parent_load);
12258 +
12259 + done_lh(&left_parent_lock);
12260 + done_lh(&right_parent_lock);
12261 +
12262 + return ret;
12263 +}
12264 +
12265 +/* Check the leftmost child "flushprepped" status, also returns true if child
12266 + * node was not found in cache. */
12267 +static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12268 +{
12269 + int ret;
12270 + int prepped;
12271 +
12272 + jnode *child;
12273 +
12274 + ret = get_leftmost_child_of_unit(coord, &child);
12275 +
12276 + if (ret)
12277 + return ret;
12278 +
12279 + if (child) {
12280 + prepped = jnode_check_flushprepped(child);
12281 + jput(child);
12282 + } else {
12283 + /* We consider not existing child as a node which slum
12284 + processing should not continue to. Not cached node is clean,
12285 + so it is flushprepped. */
12286 + prepped = 1;
12287 + }
12288 +
12289 + return prepped;
12290 +}
12291 +
12292 +/* (re)allocate znode with automated getting parent node */
12293 +static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12294 +{
12295 + int ret;
12296 + lock_handle parent_lock;
12297 + load_count parent_load;
12298 + coord_t pcoord;
12299 +
12300 + assert("zam-851", znode_is_write_locked(node));
12301 +
12302 + init_lh(&parent_lock);
12303 + init_load_count(&parent_load);
12304 +
12305 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12306 + if (ret)
12307 + goto out;
12308 +
12309 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
12310 + if (ret)
12311 + goto out;
12312 +
12313 + ret = find_child_ptr(parent_lock.node, node, &pcoord);
12314 + if (ret)
12315 + goto out;
12316 +
12317 + ret = allocate_znode(node, &pcoord, pos);
12318 +
12319 + out:
12320 + done_load_count(&parent_load);
12321 + done_lh(&parent_lock);
12322 + return ret;
12323 +}
12324 +
12325 +/* Process nodes on leaf level until unformatted node or rightmost node in the
12326 + * slum reached. */
12327 +static int handle_pos_on_formatted(flush_pos_t * pos)
12328 +{
12329 + int ret;
12330 + lock_handle right_lock;
12331 + load_count right_load;
12332 +
12333 + init_lh(&right_lock);
12334 + init_load_count(&right_load);
12335 +
12336 + if (should_convert_node(pos, pos->lock.node)) {
12337 + ret = convert_node(pos, pos->lock.node);
12338 + if (ret)
12339 + return ret;
12340 + }
12341 +
12342 + while (1) {
12343 + ret =
12344 + neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12345 + ZNODE_WRITE_LOCK,
12346 + !should_convert_next_node(pos,
12347 + right_lock.
12348 + node));
12349 + if (ret)
12350 + break;
12351 +
12352 + /* we don't prep(allocate) nodes for flushing twice. This can be suboptimal, or it
12353 + * can be optimal. For now we choose to live with the risk that it will
12354 + * be suboptimal because it would be quite complex to code it to be
12355 + * smarter. */
12356 + if (znode_check_flushprepped(right_lock.node)
12357 + && !znode_convertible(right_lock.node)) {
12358 + assert("edward-1005",
12359 + !should_convert_next_node(pos, right_lock.node));
12360 + pos_stop(pos);
12361 + break;
12362 + }
12363 +
12364 + ret = incr_load_count_znode(&right_load, right_lock.node);
12365 + if (ret)
12366 + break;
12367 +
12368 + if (should_convert_node(pos, right_lock.node)) {
12369 + ret = convert_node(pos, right_lock.node);
12370 + if (ret)
12371 + break;
12372 + if (node_is_empty(right_lock.node)) {
12373 + /* node became empty after converting, repeat */
12374 + done_load_count(&right_load);
12375 + done_lh(&right_lock);
12376 + continue;
12377 + }
12378 + }
12379 +
12380 + /* squeeze _before_ going upward. */
12381 + ret =
12382 + squeeze_right_neighbor(pos, pos->lock.node,
12383 + right_lock.node);
12384 + if (ret < 0)
12385 + break;
12386 +
12387 + if (znode_check_flushprepped(right_lock.node)) {
12388 + if (should_convert_next_node(pos, right_lock.node)) {
12389 + /* in spite of flushprepped status of the node,
12390 + its right slum neighbor should be converted */
12391 + assert("edward-953", convert_data(pos));
12392 + assert("edward-954", item_convert_data(pos));
12393 +
12394 + if (node_is_empty(right_lock.node)) {
12395 + done_load_count(&right_load);
12396 + done_lh(&right_lock);
12397 + } else
12398 + move_flush_pos(pos, &right_lock,
12399 + &right_load, NULL);
12400 + continue;
12401 + }
12402 + pos_stop(pos);
12403 + break;
12404 + }
12405 +
12406 + if (node_is_empty(right_lock.node)) {
12407 + /* repeat if right node was squeezed completely */
12408 + done_load_count(&right_load);
12409 + done_lh(&right_lock);
12410 + continue;
12411 + }
12412 +
12413 + /* parent(right_lock.node) has to be processed before
12414 + * (right_lock.node) due to "parent-first" allocation order. */
12415 + ret =
12416 + check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12417 + right_lock.node);
12418 + if (ret)
12419 + break;
12420 + /* (re)allocate _after_ going upward */
12421 + ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12422 + if (ret)
12423 + break;
12424 +
12425 + if (should_terminate_squalloc(pos)) {
12426 + set_item_convert_count(pos, 0);
12427 + break;
12428 + }
12429 +
12430 + /* advance the flush position to the right neighbor */
12431 + move_flush_pos(pos, &right_lock, &right_load, NULL);
12432 +
12433 + ret = rapid_flush(pos);
12434 + if (ret)
12435 + break;
12436 + }
12437 +
12438 + assert("edward-1006", !convert_data(pos) || !item_convert_data(pos));
12439 +
12440 + done_load_count(&right_load);
12441 + done_lh(&right_lock);
12442 +
12443 + /* This function indicates via pos whether to stop or go to twig or continue on current
12444 + * level. */
12445 + return ret;
12446 +
12447 +}
12448 +
12449 +/* Process nodes on leaf level until unformatted node or rightmost node in the
12450 + * slum reached. */
12451 +static int handle_pos_on_leaf(flush_pos_t * pos)
12452 +{
12453 + int ret;
12454 +
12455 + assert("zam-845", pos->state == POS_ON_LEAF);
12456 +
12457 + ret = handle_pos_on_formatted(pos);
12458 +
12459 + if (ret == -E_NO_NEIGHBOR) {
12460 + /* cannot get right neighbor, go process extents. */
12461 + pos->state = POS_TO_TWIG;
12462 + return 0;
12463 + }
12464 +
12465 + return ret;
12466 +}
12467 +
12468 +/* Process slum on level > 1 */
12469 +static int handle_pos_on_internal(flush_pos_t * pos)
12470 +{
12471 + assert("zam-850", pos->state == POS_ON_INTERNAL);
12472 + return handle_pos_on_formatted(pos);
12473 +}
12474 +
12475 +/* check whether squalloc should stop before processing given extent */
12476 +static int squalloc_extent_should_stop(flush_pos_t * pos)
12477 +{
12478 + assert("zam-869", item_is_extent(&pos->coord));
12479 +
12480 + /* pos->child is a jnode handle_pos_on_extent() should start with in
12481 + * stead of the first child of the first extent unit. */
12482 + if (pos->child) {
12483 + int prepped;
12484 +
12485 + assert("vs-1383", jnode_is_unformatted(pos->child));
12486 + prepped = jnode_check_flushprepped(pos->child);
12487 + pos->pos_in_unit =
12488 + jnode_get_index(pos->child) -
12489 + extent_unit_index(&pos->coord);
12490 + assert("vs-1470",
12491 + pos->pos_in_unit < extent_unit_width(&pos->coord));
12492 + assert("nikita-3434",
12493 + ergo(extent_is_unallocated(&pos->coord),
12494 + pos->pos_in_unit == 0));
12495 + jput(pos->child);
12496 + pos->child = NULL;
12497 +
12498 + return prepped;
12499 + }
12500 +
12501 + pos->pos_in_unit = 0;
12502 + if (extent_is_unallocated(&pos->coord))
12503 + return 0;
12504 +
12505 + return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12506 +}
12507 +
12508 +/* Handle the case when regular reiser4 tree (znodes connected one to its
12509 + * neighbors by sibling pointers) is interrupted on leaf level by one or more
12510 + * unformatted nodes. By having a lock on twig level and use extent code
12511 + * routines to process unformatted nodes we swim around an irregular part of
12512 + * reiser4 tree. */
12513 +static int handle_pos_on_twig(flush_pos_t * pos)
12514 +{
12515 + int ret;
12516 +
12517 + assert("zam-844", pos->state == POS_ON_EPOINT);
12518 + assert("zam-843", item_is_extent(&pos->coord));
12519 +
12520 + /* We decide should we continue slum processing with current extent
12521 + unit: if leftmost child of current extent unit is flushprepped
12522 + (i.e. clean or already processed by flush) we stop squalloc(). There
12523 + is a fast check for unallocated extents which we assume contain all
12524 + not flushprepped nodes. */
12525 + /* FIXME: Here we implement simple check, we are only looking on the
12526 + leftmost child. */
12527 + ret = squalloc_extent_should_stop(pos);
12528 + if (ret != 0) {
12529 + pos_stop(pos);
12530 + return ret;
12531 + }
12532 +
12533 + while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12534 + && item_is_extent(&pos->coord)) {
12535 + ret = reiser4_alloc_extent(pos);
12536 + if (ret) {
12537 + break;
12538 + }
12539 + coord_next_unit(&pos->coord);
12540 + }
12541 +
12542 + if (coord_is_after_rightmost(&pos->coord)) {
12543 + pos->state = POS_END_OF_TWIG;
12544 + return 0;
12545 + }
12546 + if (item_is_internal(&pos->coord)) {
12547 + pos->state = POS_TO_LEAF;
12548 + return 0;
12549 + }
12550 +
12551 + assert("zam-860", item_is_extent(&pos->coord));
12552 +
12553 + /* "slum" is over */
12554 + pos->state = POS_INVALID;
12555 + return 0;
12556 +}
12557 +
12558 +/* When we about to return flush position from twig to leaf level we can process
12559 + * the right twig node or move position to the leaf. This processes right twig
12560 + * if it is possible and jump to leaf level if not. */
12561 +static int handle_pos_end_of_twig(flush_pos_t * pos)
12562 +{
12563 + int ret;
12564 + lock_handle right_lock;
12565 + load_count right_load;
12566 + coord_t at_right;
12567 + jnode *child = NULL;
12568 +
12569 + assert("zam-848", pos->state == POS_END_OF_TWIG);
12570 + assert("zam-849", coord_is_after_rightmost(&pos->coord));
12571 +
12572 + init_lh(&right_lock);
12573 + init_load_count(&right_load);
12574 +
12575 + /* We get a lock on the right twig node even it is not dirty because
12576 + * slum continues or discontinues on leaf level not on next twig. This
12577 + * lock on the right twig is needed for getting its leftmost child. */
12578 + ret =
12579 + reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12580 + ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12581 + if (ret)
12582 + goto out;
12583 +
12584 + ret = incr_load_count_znode(&right_load, right_lock.node);
12585 + if (ret)
12586 + goto out;
12587 +
12588 + /* right twig could be not dirty */
12589 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12590 + /* If right twig node is dirty we always attempt to squeeze it
12591 + * content to the left... */
12592 + became_dirty:
12593 + ret =
12594 + squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12595 + if (ret <= 0) {
12596 + /* pos->coord is on internal item, go to leaf level, or
12597 + * we have an error which will be caught in squalloc() */
12598 + pos->state = POS_TO_LEAF;
12599 + goto out;
12600 + }
12601 +
12602 + /* If right twig was squeezed completely we wave to re-lock
12603 + * right twig. now it is done through the top-level squalloc
12604 + * routine. */
12605 + if (node_is_empty(right_lock.node))
12606 + goto out;
12607 +
12608 + /* ... and prep it if it is not yet prepped */
12609 + if (!znode_check_flushprepped(right_lock.node)) {
12610 + /* As usual, process parent before ... */
12611 + ret =
12612 + check_parents_and_squalloc_upper_levels(pos,
12613 + pos->lock.
12614 + node,
12615 + right_lock.
12616 + node);
12617 + if (ret)
12618 + goto out;
12619 +
12620 + /* ... processing the child */
12621 + ret =
12622 + lock_parent_and_allocate_znode(right_lock.node,
12623 + pos);
12624 + if (ret)
12625 + goto out;
12626 + }
12627 + } else {
12628 + coord_init_first_unit(&at_right, right_lock.node);
12629 +
12630 + /* check first child of next twig, should we continue there ? */
12631 + ret = get_leftmost_child_of_unit(&at_right, &child);
12632 + if (ret || child == NULL || jnode_check_flushprepped(child)) {
12633 + pos_stop(pos);
12634 + goto out;
12635 + }
12636 +
12637 + /* check clean twig for possible relocation */
12638 + if (!znode_check_flushprepped(right_lock.node)) {
12639 + ret =
12640 + reverse_relocate_check_dirty_parent(child,
12641 + &at_right, pos);
12642 + if (ret)
12643 + goto out;
12644 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12645 + goto became_dirty;
12646 + }
12647 + }
12648 +
12649 + assert("zam-875", znode_check_flushprepped(right_lock.node));
12650 +
12651 + /* Update the preceder by a block number of just processed right twig
12652 + * node. The code above could miss the preceder updating because
12653 + * allocate_znode() could not be called for this node. */
12654 + pos->preceder.blk = *znode_get_block(right_lock.node);
12655 + check_preceder(pos->preceder.blk);
12656 +
12657 + coord_init_first_unit(&at_right, right_lock.node);
12658 + assert("zam-868", coord_is_existing_unit(&at_right));
12659 +
12660 + pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12661 + move_flush_pos(pos, &right_lock, &right_load, &at_right);
12662 +
12663 + out:
12664 + done_load_count(&right_load);
12665 + done_lh(&right_lock);
12666 +
12667 + if (child)
12668 + jput(child);
12669 +
12670 + return ret;
12671 +}
12672 +
12673 +/* Move the pos->lock to leaf node pointed by pos->coord, check should we
12674 + * continue there. */
12675 +static int handle_pos_to_leaf(flush_pos_t * pos)
12676 +{
12677 + int ret;
12678 + lock_handle child_lock;
12679 + load_count child_load;
12680 + jnode *child;
12681 +
12682 + assert("zam-846", pos->state == POS_TO_LEAF);
12683 + assert("zam-847", item_is_internal(&pos->coord));
12684 +
12685 + init_lh(&child_lock);
12686 + init_load_count(&child_load);
12687 +
12688 + ret = get_leftmost_child_of_unit(&pos->coord, &child);
12689 + if (ret)
12690 + return ret;
12691 + if (child == NULL) {
12692 + pos_stop(pos);
12693 + return 0;
12694 + }
12695 +
12696 + if (jnode_check_flushprepped(child)) {
12697 + pos->state = POS_INVALID;
12698 + goto out;
12699 + }
12700 +
12701 + ret =
12702 + longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
12703 + ZNODE_LOCK_LOPRI);
12704 + if (ret)
12705 + goto out;
12706 +
12707 + ret = incr_load_count_znode(&child_load, JZNODE(child));
12708 + if (ret)
12709 + goto out;
12710 +
12711 + ret = allocate_znode(JZNODE(child), &pos->coord, pos);
12712 + if (ret)
12713 + goto out;
12714 +
12715 + /* move flush position to leaf level */
12716 + pos->state = POS_ON_LEAF;
12717 + move_flush_pos(pos, &child_lock, &child_load, NULL);
12718 +
12719 + if (node_is_empty(JZNODE(child))) {
12720 + ret = delete_empty_node(JZNODE(child));
12721 + pos->state = POS_INVALID;
12722 + }
12723 + out:
12724 + done_load_count(&child_load);
12725 + done_lh(&child_lock);
12726 + jput(child);
12727 +
12728 + return ret;
12729 +}
12730 +
12731 +/* move pos from leaf to twig, and move lock from leaf to twig. */
12732 +/* Move pos->lock to upper (twig) level */
12733 +static int handle_pos_to_twig(flush_pos_t * pos)
12734 +{
12735 + int ret;
12736 +
12737 + lock_handle parent_lock;
12738 + load_count parent_load;
12739 + coord_t pcoord;
12740 +
12741 + assert("zam-852", pos->state == POS_TO_TWIG);
12742 +
12743 + init_lh(&parent_lock);
12744 + init_load_count(&parent_load);
12745 +
12746 + ret =
12747 + reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
12748 + if (ret)
12749 + goto out;
12750 +
12751 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
12752 + if (ret)
12753 + goto out;
12754 +
12755 + ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
12756 + if (ret)
12757 + goto out;
12758 +
12759 + assert("zam-870", item_is_internal(&pcoord));
12760 + coord_next_item(&pcoord);
12761 +
12762 + if (coord_is_after_rightmost(&pcoord))
12763 + pos->state = POS_END_OF_TWIG;
12764 + else if (item_is_extent(&pcoord))
12765 + pos->state = POS_ON_EPOINT;
12766 + else {
12767 + /* Here we understand that getting -E_NO_NEIGHBOR in
12768 + * handle_pos_on_leaf() was because of just a reaching edge of
12769 + * slum */
12770 + pos_stop(pos);
12771 + goto out;
12772 + }
12773 +
12774 + move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
12775 +
12776 + out:
12777 + done_load_count(&parent_load);
12778 + done_lh(&parent_lock);
12779 +
12780 + return ret;
12781 +}
12782 +
12783 +typedef int (*pos_state_handle_t) (flush_pos_t *);
12784 +static pos_state_handle_t flush_pos_handlers[] = {
12785 + /* process formatted nodes on leaf level, keep lock on a leaf node */
12786 + [POS_ON_LEAF] = handle_pos_on_leaf,
12787 + /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
12788 + * being processed */
12789 + [POS_ON_EPOINT] = handle_pos_on_twig,
12790 + /* move a lock from leaf node to its parent for further processing of unformatted nodes */
12791 + [POS_TO_TWIG] = handle_pos_to_twig,
12792 + /* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
12793 + * pos->coord points to the leaf node we jump to */
12794 + [POS_TO_LEAF] = handle_pos_to_leaf,
12795 + /* after processing last extent in the twig node, attempting to shift items from the twigs
12796 + * right neighbor and process them while shifting */
12797 + [POS_END_OF_TWIG] = handle_pos_end_of_twig,
12798 + /* process formatted nodes on internal level, keep lock on an internal node */
12799 + [POS_ON_INTERNAL] = handle_pos_on_internal
12800 +};
12801 +
12802 +/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
12803 + * encrypt) nodes and their ancestors in "parent-first" order */
12804 +static int squalloc(flush_pos_t * pos)
12805 +{
12806 + int ret = 0;
12807 +
12808 + /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
12809 + * greater CPU efficiency? Measure and see.... -Hans */
12810 + while (pos_valid(pos)) {
12811 + ret = flush_pos_handlers[pos->state] (pos);
12812 + if (ret < 0)
12813 + break;
12814 +
12815 + ret = rapid_flush(pos);
12816 + if (ret)
12817 + break;
12818 + }
12819 +
12820 + /* any positive value or -E_NO_NEIGHBOR are legal return codes for handle_pos*
12821 + routines, -E_NO_NEIGHBOR means that slum edge was reached */
12822 + if (ret > 0 || ret == -E_NO_NEIGHBOR)
12823 + ret = 0;
12824 +
12825 + return ret;
12826 +}
12827 +
12828 +static void update_ldkey(znode * node)
12829 +{
12830 + reiser4_key ldkey;
12831 +
12832 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
12833 + if (node_is_empty(node))
12834 + return;
12835 +
12836 + znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
12837 +}
12838 +
12839 +/* this is to be called after calling of shift node's method to shift data from @right to
12840 + @left. It sets left delimiting keys of @left and @right to keys of first items of @left
12841 + and @right correspondingly and sets right delimiting key of @left to first key of @right */
12842 +static void update_znode_dkeys(znode * left, znode * right)
12843 +{
12844 + assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
12845 + assert("vs-1629", (znode_is_write_locked(left) &&
12846 + znode_is_write_locked(right)));
12847 +
12848 + /* we need to update left delimiting of left if it was empty before shift */
12849 + update_ldkey(left);
12850 + update_ldkey(right);
12851 + if (node_is_empty(right))
12852 + znode_set_rd_key(left, znode_get_rd_key(right));
12853 + else
12854 + znode_set_rd_key(left, znode_get_ld_key(right));
12855 +}
12856 +
12857 +/* try to shift everything from @right to @left. If everything was shifted -
12858 + @right is removed from the tree. Result is the number of bytes shifted. */
12859 +static int
12860 +shift_everything_left(znode * right, znode * left, carry_level * todo)
12861 +{
12862 + coord_t from;
12863 + node_plugin *nplug;
12864 + carry_plugin_info info;
12865 +
12866 + coord_init_after_last_item(&from, right);
12867 +
12868 + nplug = node_plugin_by_node(right);
12869 + info.doing = NULL;
12870 + info.todo = todo;
12871 + return nplug->shift(&from, left, SHIFT_LEFT,
12872 + 1 /* delete @right if it becomes empty */ ,
12873 + 1
12874 + /* move coord @from to node @left if everything will be shifted */
12875 + ,
12876 + &info);
12877 +}
12878 +
12879 +/* Shift as much as possible from @right to @left using the memcpy-optimized
12880 + shift_everything_left. @left and @right are formatted neighboring nodes on
12881 + leaf level. */
12882 +static int squeeze_right_non_twig(znode * left, znode * right)
12883 +{
12884 + int ret;
12885 + carry_pool *pool;
12886 + carry_level *todo;
12887 +
12888 + assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
12889 +
12890 + if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
12891 + !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
12892 + return SQUEEZE_TARGET_FULL;
12893 +
12894 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
12895 + if (IS_ERR(pool))
12896 + return PTR_ERR(pool);
12897 + todo = (carry_level *) (pool + 1);
12898 + init_carry_level(todo, pool);
12899 +
12900 + ret = shift_everything_left(right, left, todo);
12901 + if (ret > 0) {
12902 + /* something was shifted */
12903 + reiser4_tree *tree;
12904 + __u64 grabbed;
12905 +
12906 + znode_make_dirty(left);
12907 + znode_make_dirty(right);
12908 +
12909 + /* update delimiting keys of nodes which participated in
12910 + shift. FIXME: it would be better to have this in shift
12911 + node's operation. But it can not be done there. Nobody
12912 + remembers why, though */
12913 + tree = znode_get_tree(left);
12914 + write_lock_dk(tree);
12915 + update_znode_dkeys(left, right);
12916 + write_unlock_dk(tree);
12917 +
12918 + /* Carry is called to update delimiting key and, maybe, to remove empty
12919 + node. */
12920 + grabbed = get_current_context()->grabbed_blocks;
12921 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
12922 + assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
12923 + ret = reiser4_carry(todo, NULL /* previous level */ );
12924 + grabbed2free_mark(grabbed);
12925 + } else {
12926 + /* Shifting impossible, we return appropriate result code */
12927 + ret =
12928 + node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
12929 + SQUEEZE_TARGET_FULL;
12930 + }
12931 +
12932 + done_carry_pool(pool);
12933 +
12934 + return ret;
12935 +}
12936 +
12937 +#if REISER4_DEBUG
12938 +static int sibling_link_is_ok(const znode *left, const znode *right)
12939 +{
12940 + int result;
12941 +
12942 + read_lock_tree(znode_get_tree(left));
12943 + result = (left->right == right && left == right->left);
12944 + read_unlock_tree(znode_get_tree(left));
12945 + return result;
12946 +}
12947 +#endif
12948 +
12949 +/* Shift first unit of first item if it is an internal one. Return
12950 + SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
12951 + SUBTREE_MOVED. */
12952 +static int shift_one_internal_unit(znode * left, znode * right)
12953 +{
12954 + int ret;
12955 + carry_pool *pool;
12956 + carry_level *todo;
12957 + coord_t *coord;
12958 + carry_plugin_info *info;
12959 + int size, moved;
12960 +
12961 + assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
12962 + assert("nikita-2435", znode_is_write_locked(left));
12963 + assert("nikita-2436", znode_is_write_locked(right));
12964 + assert("nikita-2434", sibling_link_is_ok(left, right));
12965 +
12966 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
12967 + sizeof(*coord) + sizeof(*info)
12968 +#if REISER4_DEBUG
12969 + + sizeof(*coord) + 2 * sizeof(reiser4_key)
12970 +#endif
12971 + );
12972 + if (IS_ERR(pool))
12973 + return PTR_ERR(pool);
12974 + todo = (carry_level *) (pool + 1);
12975 + init_carry_level(todo, pool);
12976 +
12977 + coord = (coord_t *) (todo + 3);
12978 + coord_init_first_unit(coord, right);
12979 + info = (carry_plugin_info *) (coord + 1);
12980 +
12981 +#if REISER4_DEBUG
12982 + if (!node_is_empty(left)) {
12983 + coord_t *last;
12984 + reiser4_key *right_key;
12985 + reiser4_key *left_key;
12986 +
12987 + last = (coord_t *) (info + 1);
12988 + right_key = (reiser4_key *) (last + 1);
12989 + left_key = right_key + 1;
12990 + coord_init_last_unit(last, left);
12991 +
12992 + assert("nikita-2463",
12993 + keyle(item_key_by_coord(last, left_key),
12994 + item_key_by_coord(coord, right_key)));
12995 + }
12996 +#endif
12997 +
12998 + assert("jmacd-2007", item_is_internal(coord));
12999 +
13000 + size = item_length_by_coord(coord);
13001 + info->todo = todo;
13002 + info->doing = NULL;
13003 +
13004 + ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13005 + 1
13006 + /* delete @right if it becomes empty */
13007 + ,
13008 + 0
13009 + /* do not move coord @coord to node @left */
13010 + ,
13011 + info);
13012 +
13013 + /* If shift returns positive, then we shifted the item. */
13014 + assert("vs-423", ret <= 0 || size == ret);
13015 + moved = (ret > 0);
13016 +
13017 + if (moved) {
13018 + /* something was moved */
13019 + reiser4_tree *tree;
13020 + int grabbed;
13021 +
13022 + znode_make_dirty(left);
13023 + znode_make_dirty(right);
13024 + tree = znode_get_tree(left);
13025 + write_lock_dk(tree);
13026 + update_znode_dkeys(left, right);
13027 + write_unlock_dk(tree);
13028 +
13029 + /* reserve space for delimiting keys after shifting */
13030 + grabbed = get_current_context()->grabbed_blocks;
13031 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13032 + assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13033 +
13034 + ret = reiser4_carry(todo, NULL /* previous level */ );
13035 + grabbed2free_mark(grabbed);
13036 + }
13037 +
13038 + done_carry_pool(pool);
13039 +
13040 + if (ret != 0) {
13041 + /* Shift or carry operation failed. */
13042 + assert("jmacd-7325", ret < 0);
13043 + return ret;
13044 + }
13045 +
13046 + return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13047 +}
13048 +
13049 +/* Make the final relocate/wander decision during forward parent-first squalloc for a
13050 + znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13051 +static int
13052 +allocate_znode_loaded(znode * node,
13053 + const coord_t * parent_coord, flush_pos_t * pos)
13054 +{
13055 + int ret;
13056 + reiser4_super_info_data *sbinfo = get_current_super_private();
13057 + /* FIXME(D): We have the node write-locked and should have checked for !
13058 + allocated() somewhere before reaching this point, but there can be a race, so
13059 + this assertion is bogus. */
13060 + assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13061 + assert("jmacd-7988", znode_is_write_locked(node));
13062 + assert("jmacd-7989", coord_is_invalid(parent_coord)
13063 + || znode_is_write_locked(parent_coord->node));
13064 +
13065 + if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13066 + znode_is_root(node) ||
13067 + /* We have enough nodes to relocate no matter what. */
13068 + (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13069 + /* No need to decide with new nodes, they are treated the same as
13070 + relocate. If the root node is dirty, relocate. */
13071 + if (pos->preceder.blk == 0) {
13072 + /* preceder is unknown and we have decided to relocate node --
13073 + using of default value for search start is better than search
13074 + from block #0. */
13075 + get_blocknr_hint_default(&pos->preceder.blk);
13076 + check_preceder(pos->preceder.blk);
13077 + }
13078 +
13079 + goto best_reloc;
13080 +
13081 + } else if (pos->preceder.blk == 0) {
13082 + /* If we don't know the preceder, leave it where it is. */
13083 + jnode_make_wander(ZJNODE(node));
13084 + } else {
13085 + /* Make a decision based on block distance. */
13086 + reiser4_block_nr dist;
13087 + reiser4_block_nr nblk = *znode_get_block(node);
13088 +
13089 + assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13090 + assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13091 + assert("jmacd-6174", pos->preceder.blk != 0);
13092 +
13093 + if (pos->preceder.blk == nblk - 1) {
13094 + /* Ideal. */
13095 + jnode_make_wander(ZJNODE(node));
13096 + } else {
13097 +
13098 + dist =
13099 + (nblk <
13100 + pos->preceder.blk) ? (pos->preceder.blk -
13101 + nblk) : (nblk -
13102 + pos->preceder.blk);
13103 +
13104 + /* See if we can find a closer block (forward direction only). */
13105 + pos->preceder.max_dist =
13106 + min((reiser4_block_nr) sbinfo->flush.
13107 + relocate_distance, dist);
13108 + pos->preceder.level = znode_get_level(node);
13109 +
13110 + ret = allocate_znode_update(node, parent_coord, pos);
13111 +
13112 + pos->preceder.max_dist = 0;
13113 +
13114 + if (ret && (ret != -ENOSPC))
13115 + return ret;
13116 +
13117 + if (ret == 0) {
13118 + /* Got a better allocation. */
13119 + znode_make_reloc(node, pos->fq);
13120 + } else if (dist < sbinfo->flush.relocate_distance) {
13121 + /* The present allocation is good enough. */
13122 + jnode_make_wander(ZJNODE(node));
13123 + } else {
13124 + /* Otherwise, try to relocate to the best position. */
13125 + best_reloc:
13126 + ret =
13127 + allocate_znode_update(node, parent_coord,
13128 + pos);
13129 + if (ret != 0)
13130 + return ret;
13131 +
13132 + /* set JNODE_RELOC bit _after_ node gets allocated */
13133 + znode_make_reloc(node, pos->fq);
13134 + }
13135 + }
13136 + }
13137 +
13138 + /* This is the new preceder. */
13139 + pos->preceder.blk = *znode_get_block(node);
13140 + check_preceder(pos->preceder.blk);
13141 + pos->alloc_cnt += 1;
13142 +
13143 + assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13144 +
13145 + return 0;
13146 +}
13147 +
13148 +static int
13149 +allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13150 +{
13151 + /*
13152 + * perform znode allocation with znode pinned in memory to avoid races
13153 + * with asynchronous emergency flush (which plays with
13154 + * JNODE_FLUSH_RESERVED bit).
13155 + */
13156 + return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13157 +}
13158 +
13159 +/* A subroutine of allocate_znode, this is called first to see if there is a close
13160 + position to relocate to. It may return ENOSPC if there is no close position. If there
13161 + is no close position it may not relocate. This takes care of updating the parent node
13162 + with the relocated block address. */
13163 +static int
13164 +allocate_znode_update(znode * node, const coord_t * parent_coord,
13165 + flush_pos_t * pos)
13166 +{
13167 + int ret;
13168 + reiser4_block_nr blk;
13169 + lock_handle uber_lock;
13170 + int flush_reserved_used = 0;
13171 + int grabbed;
13172 + reiser4_context *ctx;
13173 + reiser4_super_info_data *sbinfo;
13174 +
13175 + init_lh(&uber_lock);
13176 +
13177 + ctx = get_current_context();
13178 + sbinfo = get_super_private(ctx->super);
13179 +
13180 + grabbed = ctx->grabbed_blocks;
13181 +
13182 + /* discard e-flush allocation */
13183 + ret = zload(node);
13184 + if (ret)
13185 + return ret;
13186 +
13187 + if (ZF_ISSET(node, JNODE_CREATED)) {
13188 + assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13189 + pos->preceder.block_stage = BLOCK_UNALLOCATED;
13190 + } else {
13191 + pos->preceder.block_stage = BLOCK_GRABBED;
13192 +
13193 + /* The disk space for relocating the @node is already reserved in "flush reserved"
13194 + * counter if @node is leaf, otherwise we grab space using BA_RESERVED (means grab
13195 + * space from whole disk not from only 95%). */
13196 + if (znode_get_level(node) == LEAF_LEVEL) {
13197 + /*
13198 + * earlier (during do_jnode_make_dirty()) we decided
13199 + * that @node can possibly go into overwrite set and
13200 + * reserved block for its wandering location.
13201 + */
13202 + txn_atom *atom = get_current_atom_locked();
13203 + assert("nikita-3449",
13204 + ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13205 + flush_reserved2grabbed(atom, (__u64) 1);
13206 + spin_unlock_atom(atom);
13207 + /*
13208 + * we are trying to move node into relocate
13209 + * set. Allocation of relocated position "uses"
13210 + * reserved block.
13211 + */
13212 + ZF_CLR(node, JNODE_FLUSH_RESERVED);
13213 + flush_reserved_used = 1;
13214 + } else {
13215 + ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13216 + if (ret != 0)
13217 + goto exit;
13218 + }
13219 + }
13220 +
13221 + /* We may do not use 5% of reserved disk space here and flush will not pack tightly. */
13222 + ret = reiser4_alloc_block(&pos->preceder, &blk,
13223 + BA_FORMATTED | BA_PERMANENT);
13224 + if (ret)
13225 + goto exit;
13226 +
13227 + if (!ZF_ISSET(node, JNODE_CREATED) &&
13228 + (ret =
13229 + reiser4_dealloc_block(znode_get_block(node), 0,
13230 + BA_DEFER | BA_FORMATTED)))
13231 + goto exit;
13232 +
13233 + if (likely(!znode_is_root(node))) {
13234 + item_plugin *iplug;
13235 +
13236 + iplug = item_plugin_by_coord(parent_coord);
13237 + assert("nikita-2954", iplug->f.update != NULL);
13238 + iplug->f.update(parent_coord, &blk);
13239 +
13240 + znode_make_dirty(parent_coord->node);
13241 +
13242 + } else {
13243 + reiser4_tree *tree = znode_get_tree(node);
13244 + znode *uber;
13245 +
13246 + /* We take a longterm lock on the fake node in order to change
13247 + the root block number. This may cause atom fusion. */
13248 + ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13249 + &uber_lock);
13250 + /* The fake node cannot be deleted, and we must have priority
13251 + here, and may not be confused with ENOSPC. */
13252 + assert("jmacd-74412",
13253 + ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13254 +
13255 + if (ret)
13256 + goto exit;
13257 +
13258 + uber = uber_lock.node;
13259 +
13260 + write_lock_tree(tree);
13261 + tree->root_block = blk;
13262 + write_unlock_tree(tree);
13263 +
13264 + znode_make_dirty(uber);
13265 + }
13266 +
13267 + ret = znode_rehash(node, &blk);
13268 + exit:
13269 + if (ret) {
13270 + /* Get flush reserved block back if something fails, because
13271 + * callers assume that on error block wasn't relocated and its
13272 + * flush reserved block wasn't used. */
13273 + if (flush_reserved_used) {
13274 + /*
13275 + * ok, we failed to move node into relocate
13276 + * set. Restore status quo.
13277 + */
13278 + grabbed2flush_reserved((__u64) 1);
13279 + ZF_SET(node, JNODE_FLUSH_RESERVED);
13280 + }
13281 + }
13282 + zrelse(node);
13283 + done_lh(&uber_lock);
13284 + grabbed2free_mark(grabbed);
13285 + return ret;
13286 +}
13287 +
13288 +/* JNODE INTERFACE */
13289 +
13290 +/* Lock a node (if formatted) and then get its parent locked, set the child's
13291 + coordinate in the parent. If the child is the root node, the above_root
13292 + znode is returned but the coord is not set. This function may cause atom
13293 + fusion, but it is only used for read locks (at this point) and therefore
13294 + fusion only occurs when the parent is already dirty. */
13295 +/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13296 + pointer in jnodes. */
13297 +static int
13298 +jnode_lock_parent_coord(jnode * node,
13299 + coord_t * coord,
13300 + lock_handle * parent_lh,
13301 + load_count * parent_zh,
13302 + znode_lock_mode parent_mode, int try)
13303 +{
13304 + int ret;
13305 +
13306 + assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13307 + assert("edward-54", jnode_is_unformatted(node)
13308 + || znode_is_any_locked(JZNODE(node)));
13309 +
13310 + if (!jnode_is_znode(node)) {
13311 + reiser4_key key;
13312 + tree_level stop_level = TWIG_LEVEL;
13313 + lookup_bias bias = FIND_EXACT;
13314 +
13315 + assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13316 +
13317 + /* The case when node is not znode, but can have parent coord
13318 + (unformatted node, node which represents cluster page,
13319 + etc..). Generate a key for the appropriate entry, search
13320 + in the tree using coord_by_key, which handles locking for
13321 + us. */
13322 +
13323 + /*
13324 + * nothing is locked at this moment, so, nothing prevents
13325 + * concurrent truncate from removing jnode from inode. To
13326 + * prevent this spin-lock jnode. jnode can be truncated just
13327 + * after call to the jnode_build_key(), but this is ok,
13328 + * because coord_by_key() will just fail to find appropriate
13329 + * extent.
13330 + */
13331 + spin_lock_jnode(node);
13332 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13333 + jnode_build_key(node, &key);
13334 + ret = 0;
13335 + } else
13336 + ret = RETERR(-ENOENT);
13337 + spin_unlock_jnode(node);
13338 +
13339 + if (ret != 0)
13340 + return ret;
13341 +
13342 + if (jnode_is_cluster_page(node))
13343 + stop_level = LEAF_LEVEL;
13344 +
13345 + assert("jmacd-1812", coord != NULL);
13346 +
13347 + ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13348 + parent_mode, bias, stop_level, stop_level,
13349 + CBK_UNIQUE, NULL /*ra_info */ );
13350 + switch (ret) {
13351 + case CBK_COORD_NOTFOUND:
13352 + assert("edward-1038",
13353 + ergo(jnode_is_cluster_page(node),
13354 + JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13355 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13356 + warning("nikita-3177", "Parent not found");
13357 + return ret;
13358 + case CBK_COORD_FOUND:
13359 + if (coord->between != AT_UNIT) {
13360 + /* FIXME: comment needed */
13361 + done_lh(parent_lh);
13362 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13363 + warning("nikita-3178",
13364 + "Found but not happy: %i",
13365 + coord->between);
13366 + }
13367 + return RETERR(-ENOENT);
13368 + }
13369 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13370 + if (ret != 0)
13371 + return ret;
13372 + /* if (jnode_is_cluster_page(node)) {
13373 + races with write() are possible
13374 + check_child_cluster (parent_lh->node);
13375 + }
13376 + */
13377 + break;
13378 + default:
13379 + return ret;
13380 + }
13381 +
13382 + } else {
13383 + int flags;
13384 + znode *z;
13385 +
13386 + z = JZNODE(node);
13387 + /* Formatted node case: */
13388 + assert("jmacd-2061", !znode_is_root(z));
13389 +
13390 + flags = GN_ALLOW_NOT_CONNECTED;
13391 + if (try)
13392 + flags |= GN_TRY_LOCK;
13393 +
13394 + ret =
13395 + reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13396 + if (ret != 0)
13397 + /* -E_REPEAT is ok here, it is handled by the caller. */
13398 + return ret;
13399 +
13400 + /* Make the child's position "hint" up-to-date. (Unless above
13401 + root, which caller must check.) */
13402 + if (coord != NULL) {
13403 +
13404 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13405 + if (ret != 0) {
13406 + warning("jmacd-976812386",
13407 + "incr_load_count_znode failed: %d",
13408 + ret);
13409 + return ret;
13410 + }
13411 +
13412 + ret = find_child_ptr(parent_lh->node, z, coord);
13413 + if (ret != 0) {
13414 + warning("jmacd-976812",
13415 + "find_child_ptr failed: %d", ret);
13416 + return ret;
13417 + }
13418 + }
13419 + }
13420 +
13421 + return 0;
13422 +}
13423 +
13424 +/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13425 + If there is no next neighbor or the neighbor is not in memory or if there is a
13426 + neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13427 + In some cases the slum may include nodes which are not dirty, if so @check_dirty should be 0 */
13428 +static int neighbor_in_slum(znode * node, /* starting point */
13429 + lock_handle * lock, /* lock on starting point */
13430 + sideof side, /* left or right direction we seek the next node in */
13431 + znode_lock_mode mode, /* kind of lock we want */
13432 + int check_dirty)
13433 +{ /* true if the neighbor should be dirty */
13434 + int ret;
13435 +
13436 + assert("jmacd-6334", znode_is_connected(node));
13437 +
13438 + ret =
13439 + reiser4_get_neighbor(lock, node, mode,
13440 + GN_SAME_ATOM | (side ==
13441 + LEFT_SIDE ? GN_GO_LEFT : 0));
13442 +
13443 + if (ret) {
13444 + /* May return -ENOENT or -E_NO_NEIGHBOR. */
13445 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13446 + if (ret == -ENOENT) {
13447 + ret = RETERR(-E_NO_NEIGHBOR);
13448 + }
13449 +
13450 + return ret;
13451 + }
13452 + if (!check_dirty)
13453 + return 0;
13454 + /* Check dirty bit of locked znode, no races here */
13455 + if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13456 + return 0;
13457 +
13458 + done_lh(lock);
13459 + return RETERR(-E_NO_NEIGHBOR);
13460 +}
13461 +
13462 +/* Return true if two znodes have the same parent. This is called with both nodes
13463 + write-locked (for squeezing) so no tree lock is needed. */
13464 +static int znode_same_parents(znode * a, znode * b)
13465 +{
13466 + int result;
13467 +
13468 + assert("jmacd-7011", znode_is_write_locked(a));
13469 + assert("jmacd-7012", znode_is_write_locked(b));
13470 +
13471 + /* We lock the whole tree for this check.... I really don't like whole tree
13472 + * locks... -Hans */
13473 + read_lock_tree(znode_get_tree(a));
13474 + result = (znode_parent(a) == znode_parent(b));
13475 + read_unlock_tree(znode_get_tree(a));
13476 + return result;
13477 +}
13478 +
13479 +/* FLUSH SCAN */
13480 +
13481 +/* Initialize the flush_scan data structure. */
13482 +static void scan_init(flush_scan * scan)
13483 +{
13484 + memset(scan, 0, sizeof(*scan));
13485 + init_lh(&scan->node_lock);
13486 + init_lh(&scan->parent_lock);
13487 + init_load_count(&scan->parent_load);
13488 + init_load_count(&scan->node_load);
13489 + coord_init_invalid(&scan->parent_coord, NULL);
13490 +}
13491 +
13492 +/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
13493 +static void scan_done(flush_scan * scan)
13494 +{
13495 + done_load_count(&scan->node_load);
13496 + if (scan->node != NULL) {
13497 + jput(scan->node);
13498 + scan->node = NULL;
13499 + }
13500 + done_load_count(&scan->parent_load);
13501 + done_lh(&scan->parent_lock);
13502 + done_lh(&scan->node_lock);
13503 +}
13504 +
13505 +/* Returns true if flush scanning is finished. */
13506 +int reiser4_scan_finished(flush_scan * scan)
13507 +{
13508 + return scan->stop || (scan->direction == RIGHT_SIDE &&
13509 + scan->count >= scan->max_count);
13510 +}
13511 +
13512 +/* Return true if the scan should continue to the @tonode. True if the node meets the
13513 + same_slum_check condition. If not, deref the "left" node and stop the scan. */
13514 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
13515 +{
13516 + int go = same_slum_check(scan->node, tonode, 1, 0);
13517 +
13518 + if (!go) {
13519 + scan->stop = 1;
13520 + jput(tonode);
13521 + }
13522 +
13523 + return go;
13524 +}
13525 +
13526 +/* Set the current scan->node, refcount it, increment count by the @add_count (number to
13527 + count, e.g., skipped unallocated nodes), deref previous current, and copy the current
13528 + parent coordinate. */
13529 +int
13530 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13531 + const coord_t * parent)
13532 +{
13533 + /* Release the old references, take the new reference. */
13534 + done_load_count(&scan->node_load);
13535 +
13536 + if (scan->node != NULL) {
13537 + jput(scan->node);
13538 + }
13539 + scan->node = node;
13540 + scan->count += add_count;
13541 +
13542 + /* This next stmt is somewhat inefficient. The reiser4_scan_extent() code could
13543 + delay this update step until it finishes and update the parent_coord only once.
13544 + It did that before, but there was a bug and this was the easiest way to make it
13545 + correct. */
13546 + if (parent != NULL) {
13547 + coord_dup(&scan->parent_coord, parent);
13548 + }
13549 +
13550 + /* Failure may happen at the incr_load_count call, but the caller can assume the reference
13551 + is safely taken. */
13552 + return incr_load_count_jnode(&scan->node_load, node);
13553 +}
13554 +
13555 +/* Return true if scanning in the leftward direction. */
13556 +int reiser4_scanning_left(flush_scan * scan)
13557 +{
13558 + return scan->direction == LEFT_SIDE;
13559 +}
13560 +
13561 +/* Performs leftward scanning starting from either kind of node. Counts the starting
13562 + node. The right-scan object is passed in for the left-scan in order to copy the parent
13563 + of an unformatted starting position. This way we avoid searching for the unformatted
13564 + node's parent when scanning in each direction. If we search for the parent once it is
13565 + set in both scan objects. The limit parameter tells flush-scan when to stop.
13566 +
13567 + Rapid scanning is used only during scan_left, where we are interested in finding the
13568 + 'leftpoint' where we begin flushing. We are interested in stopping at the left child
13569 + of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
13570 + problem is finding a way to flush only those nodes without unallocated children, and it
13571 + is difficult to solve in the bottom-up flushing algorithm we are currently using. The
13572 + problem can be solved by scanning left at every level as we go upward, but this would
13573 + basically bring us back to using a top-down allocation strategy, which we already tried
13574 + (see BK history from May 2002), and has a different set of problems. The top-down
13575 + strategy makes avoiding unallocated children easier, but makes it difficult to
13576 + properly flush dirty children with clean parents that would otherwise stop the
13577 + top-down flush, only later to dirty the parent once the children are flushed. So we
13578 + solve the problem in the bottom-up algorithm with a special case for twigs and leaves
13579 + only.
13580 +
13581 + The first step in solving the problem is this rapid leftward scan. After we determine
13582 + that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
13583 + are no longer interested in the exact count, we are only interested in finding the
13584 + best place to start the flush. We could choose one of two possibilities:
13585 +
13586 + 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
13587 + This requires checking one leaf per rapid-scan twig
13588 +
13589 + 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
13590 + to the left. This requires checking possibly all of the in-memory children of each
13591 + twig during the rapid scan.
13592 +
13593 + For now we implement the first policy.
13594 +*/
13595 +static int
13596 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13597 +{
13598 + int ret = 0;
13599 +
13600 + scan->max_count = limit;
13601 + scan->direction = LEFT_SIDE;
13602 +
13603 + ret = scan_set_current(scan, jref(node), 1, NULL);
13604 + if (ret != 0) {
13605 + return ret;
13606 + }
13607 +
13608 + ret = scan_common(scan, right);
13609 + if (ret != 0) {
13610 + return ret;
13611 + }
13612 +
13613 + /* Before rapid scanning, we need a lock on scan->node so that we can get its
13614 + parent, only if formatted. */
13615 + if (jnode_is_znode(scan->node)) {
13616 + ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13617 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13618 + }
13619 +
13620 + /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
13621 + return ret;
13622 +}
13623 +
13624 +/* Performs rightward scanning... Does not count the starting node. The limit parameter
13625 + is described in scan_left. If the starting node is unformatted then the
13626 + parent_coord was already set during scan_left. The rapid_after parameter is not used
13627 + during right-scanning.
13628 +
13629 + scan_right is only called if the scan_left operation does not count at least
13630 + FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to
13631 + the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
13632 + scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
13633 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13634 +{
13635 + int ret;
13636 +
13637 + scan->max_count = limit;
13638 + scan->direction = RIGHT_SIDE;
13639 +
13640 + ret = scan_set_current(scan, jref(node), 0, NULL);
13641 + if (ret != 0) {
13642 + return ret;
13643 + }
13644 +
13645 + return scan_common(scan, NULL);
13646 +}
13647 +
13648 +/* Common code to perform left or right scanning. */
13649 +static int scan_common(flush_scan * scan, flush_scan * other)
13650 +{
13651 + int ret;
13652 +
13653 + assert("nikita-2376", scan->node != NULL);
13654 + assert("edward-54", jnode_is_unformatted(scan->node)
13655 + || jnode_is_znode(scan->node));
13656 +
13657 + /* Special case for starting at an unformatted node. Optimization: we only want
13658 + to search for the parent (which requires a tree traversal) once. Obviously, we
13659 + shouldn't have to call it once for the left scan and once for the right scan.
13660 + For this reason, if we search for the parent during scan-left we then duplicate
13661 + the coord/lock/load into the scan-right object. */
13662 + if (jnode_is_unformatted(scan->node)) {
13663 + ret = scan_unformatted(scan, other);
13664 + if (ret != 0)
13665 + return ret;
13666 + }
13667 + /* This loop expects to start at a formatted position and performs chaining of
13668 + formatted regions */
13669 + while (!reiser4_scan_finished(scan)) {
13670 +
13671 + ret = scan_formatted(scan);
13672 + if (ret != 0) {
13673 + return ret;
13674 + }
13675 + }
13676 +
13677 + return 0;
13678 +}
13679 +
13680 +static int scan_unformatted(flush_scan * scan, flush_scan * other)
13681 +{
13682 + int ret = 0;
13683 + int try = 0;
13684 +
13685 + if (!coord_is_invalid(&scan->parent_coord))
13686 + goto scan;
13687 +
13688 + /* set parent coord from */
13689 + if (!jnode_is_unformatted(scan->node)) {
13690 + /* formatted position */
13691 +
13692 + lock_handle lock;
13693 + assert("edward-301", jnode_is_znode(scan->node));
13694 + init_lh(&lock);
13695 +
13696 + /*
13697 + * when flush starts from unformatted node, first thing it
13698 + * does is tree traversal to find formatted parent of starting
13699 + * node. This parent is then kept locked across scans to the
13700 + * left and to the right. This means that during scan to the
13701 + * left we cannot take left-ward lock, because this is
13702 + * dead-lock prone. So, if we are scanning to the left and
13703 + * there is already lock held by this thread,
13704 + * jnode_lock_parent_coord() should use try-lock.
13705 + */
13706 + try = reiser4_scanning_left(scan)
13707 + && !lock_stack_isclean(get_current_lock_stack());
13708 + /* Need the node locked to get the parent lock, We have to
13709 + take write lock since there is at least one call path
13710 + where this znode is already write-locked by us. */
13711 + ret =
13712 + longterm_lock_znode(&lock, JZNODE(scan->node),
13713 + ZNODE_WRITE_LOCK,
13714 + reiser4_scanning_left(scan) ?
13715 + ZNODE_LOCK_LOPRI :
13716 + ZNODE_LOCK_HIPRI);
13717 + if (ret != 0)
13718 + /* EINVAL or E_DEADLOCK here mean... try again! At this point we've
13719 + scanned too far and can't back out, just start over. */
13720 + return ret;
13721 +
13722 + ret = jnode_lock_parent_coord(scan->node,
13723 + &scan->parent_coord,
13724 + &scan->parent_lock,
13725 + &scan->parent_load,
13726 + ZNODE_WRITE_LOCK, try);
13727 +
13728 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13729 + done_lh(&lock);
13730 + if (ret == -E_REPEAT) {
13731 + scan->stop = 1;
13732 + return 0;
13733 + }
13734 + if (ret)
13735 + return ret;
13736 +
13737 + } else {
13738 + /* unformatted position */
13739 +
13740 + ret =
13741 + jnode_lock_parent_coord(scan->node, &scan->parent_coord,
13742 + &scan->parent_lock,
13743 + &scan->parent_load,
13744 + ZNODE_WRITE_LOCK, try);
13745 +
13746 + if (IS_CBKERR(ret))
13747 + return ret;
13748 +
13749 + if (ret == CBK_COORD_NOTFOUND)
13750 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13751 + return ret;
13752 +
13753 + /* parent was found */
13754 + assert("jmacd-8661", other != NULL);
13755 + /* Duplicate the reference into the other flush_scan. */
13756 + coord_dup(&other->parent_coord, &scan->parent_coord);
13757 + copy_lh(&other->parent_lock, &scan->parent_lock);
13758 + copy_load_count(&other->parent_load, &scan->parent_load);
13759 + }
13760 + scan:
13761 + return scan_by_coord(scan);
13762 +}
13763 +
13764 +/* Performs left- or rightward scanning starting from a formatted node. Follow left
13765 + pointers under tree lock as long as:
13766 +
13767 + - node->left/right is non-NULL
13768 + - node->left/right is connected, dirty
13769 + - node->left/right belongs to the same atom
13770 + - scan has not reached maximum count
13771 +*/
13772 +static int scan_formatted(flush_scan * scan)
13773 +{
13774 + int ret;
13775 + znode *neighbor = NULL;
13776 +
13777 + assert("jmacd-1401", !reiser4_scan_finished(scan));
13778 +
13779 + do {
13780 + znode *node = JZNODE(scan->node);
13781 +
13782 + /* Node should be connected, but if not stop the scan. */
13783 + if (!znode_is_connected(node)) {
13784 + scan->stop = 1;
13785 + break;
13786 + }
13787 +
13788 + /* Lock the tree, check-for and reference the next sibling. */
13789 + read_lock_tree(znode_get_tree(node));
13790 +
13791 + /* It may be that a node is inserted or removed between a node and its
13792 + left sibling while the tree lock is released, but the flush-scan count
13793 + does not need to be precise. Thus, we release the tree lock as soon as
13794 + we get the neighboring node. */
13795 + neighbor =
13796 + reiser4_scanning_left(scan) ? node->left : node->right;
13797 + if (neighbor != NULL) {
13798 + zref(neighbor);
13799 + }
13800 +
13801 + read_unlock_tree(znode_get_tree(node));
13802 +
13803 + /* If neighbor is NULL at the leaf level, need to check for an unformatted
13804 + sibling using the parent--break in any case. */
13805 + if (neighbor == NULL) {
13806 + break;
13807 + }
13808 +
13809 + /* Check the condition for going left, break if it is not met. This also
13810 + releases (jputs) the neighbor if false. */
13811 + if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
13812 + break;
13813 + }
13814 +
13815 + /* Advance the flush_scan state to the left, repeat. */
13816 + ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
13817 + if (ret != 0) {
13818 + return ret;
13819 + }
13820 +
13821 + } while (!reiser4_scan_finished(scan));
13822 +
13823 + /* If neighbor is NULL then we reached the end of a formatted region, or else the
13824 + sibling is out of memory, now check for an extent to the left (as long as
13825 + LEAF_LEVEL). */
13826 + if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
13827 + || reiser4_scan_finished(scan)) {
13828 + scan->stop = 1;
13829 + return 0;
13830 + }
13831 + /* Otherwise, calls scan_by_coord for the right(left)most item of the
13832 + left(right) neighbor on the parent level, then possibly continue. */
13833 +
13834 + coord_init_invalid(&scan->parent_coord, NULL);
13835 + return scan_unformatted(scan, NULL);
13836 +}
13837 +
13838 +/* NOTE-EDWARD:
13839 + This scans adjacent items of the same type and calls scan flush plugin for each one.
13840 + Performs left(right)ward scanning starting from a (possibly) unformatted node. If we start
13841 + from unformatted node, then we continue only if the next neighbor is also unformatted.
13842 + When called from scan_formatted, we skip first iteration (to make sure that
13843 + right(left)most item of the left(right) neighbor on the parent level is of the same
13844 + type and set appropriate coord). */
13845 +static int scan_by_coord(flush_scan * scan)
13846 +{
13847 + int ret = 0;
13848 + int scan_this_coord;
13849 + lock_handle next_lock;
13850 + load_count next_load;
13851 + coord_t next_coord;
13852 + jnode *child;
13853 + item_plugin *iplug;
13854 +
13855 + init_lh(&next_lock);
13856 + init_load_count(&next_load);
13857 + scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
13858 +
13859 + /* set initial item id */
13860 + iplug = item_plugin_by_coord(&scan->parent_coord);
13861 +
13862 + for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
13863 + if (scan_this_coord) {
13864 + /* Here we expect that unit is scannable. it would not be so due
13865 + * to race with extent->tail conversion. */
13866 + if (iplug->f.scan == NULL) {
13867 + scan->stop = 1;
13868 + ret = -E_REPEAT;
13869 + /* skip the check at the end. */
13870 + goto race;
13871 + }
13872 +
13873 + ret = iplug->f.scan(scan);
13874 + if (ret != 0)
13875 + goto exit;
13876 +
13877 + if (reiser4_scan_finished(scan)) {
13878 + checkchild(scan);
13879 + break;
13880 + }
13881 + } else {
13882 + /* the same race against truncate as above is possible
13883 + * here, it seems */
13884 +
13885 + /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
13886 + the first coordinate. */
13887 + assert("jmacd-1231",
13888 + item_is_internal(&scan->parent_coord));
13889 + }
13890 +
13891 + if (iplug->f.utmost_child == NULL
13892 + || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
13893 + /* stop this coord and continue on parent level */
13894 + ret =
13895 + scan_set_current(scan,
13896 + ZJNODE(zref
13897 + (scan->parent_coord.node)),
13898 + 1, NULL);
13899 + if (ret != 0)
13900 + goto exit;
13901 + break;
13902 + }
13903 +
13904 + /* Either way, the invariant is that scan->parent_coord is set to the
13905 + parent of scan->node. Now get the next unit. */
13906 + coord_dup(&next_coord, &scan->parent_coord);
13907 + coord_sideof_unit(&next_coord, scan->direction);
13908 +
13909 + /* If off-the-end of the twig, try the next twig. */
13910 + if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
13911 + /* We take the write lock because we may start flushing from this
13912 + * coordinate. */
13913 + ret =
13914 + neighbor_in_slum(next_coord.node, &next_lock,
13915 + scan->direction, ZNODE_WRITE_LOCK,
13916 + 1 /* check dirty */ );
13917 + if (ret == -E_NO_NEIGHBOR) {
13918 + scan->stop = 1;
13919 + ret = 0;
13920 + break;
13921 + }
13922 +
13923 + if (ret != 0) {
13924 + goto exit;
13925 + }
13926 +
13927 + ret = incr_load_count_znode(&next_load, next_lock.node);
13928 + if (ret != 0) {
13929 + goto exit;
13930 + }
13931 +
13932 + coord_init_sideof_unit(&next_coord, next_lock.node,
13933 + sideof_reverse(scan->direction));
13934 + }
13935 +
13936 + iplug = item_plugin_by_coord(&next_coord);
13937 +
13938 + /* Get the next child. */
13939 + ret =
13940 + iplug->f.utmost_child(&next_coord,
13941 + sideof_reverse(scan->direction),
13942 + &child);
13943 + if (ret != 0)
13944 + goto exit;
13945 + /* If the next child is not in memory, or, item_utmost_child
13946 + failed (due to race with unlink, most probably), stop
13947 + here. */
13948 + if (child == NULL || IS_ERR(child)) {
13949 + scan->stop = 1;
13950 + checkchild(scan);
13951 + break;
13952 + }
13953 +
13954 + assert("nikita-2374", jnode_is_unformatted(child)
13955 + || jnode_is_znode(child));
13956 +
13957 + /* See if it is dirty, part of the same atom. */
13958 + if (!reiser4_scan_goto(scan, child)) {
13959 + checkchild(scan);
13960 + break;
13961 + }
13962 +
13963 + /* If so, make this child current. */
13964 + ret = scan_set_current(scan, child, 1, &next_coord);
13965 + if (ret != 0)
13966 + goto exit;
13967 +
13968 + /* Now continue. If formatted we release the parent lock and return, then
13969 + proceed. */
13970 + if (jnode_is_znode(child))
13971 + break;
13972 +
13973 + /* Otherwise, repeat the above loop with next_coord. */
13974 + if (next_load.node != NULL) {
13975 + done_lh(&scan->parent_lock);
13976 + move_lh(&scan->parent_lock, &next_lock);
13977 + move_load_count(&scan->parent_load, &next_load);
13978 + }
13979 + }
13980 +
13981 + assert("jmacd-6233",
13982 + reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
13983 + exit:
13984 + checkchild(scan);
13985 + race: /* skip the above check */
13986 + if (jnode_is_znode(scan->node)) {
13987 + done_lh(&scan->parent_lock);
13988 + done_load_count(&scan->parent_load);
13989 + }
13990 +
13991 + done_load_count(&next_load);
13992 + done_lh(&next_lock);
13993 + return ret;
13994 +}
13995 +
13996 +/* FLUSH POS HELPERS */
13997 +
13998 +/* Initialize the fields of a flush_position. */
13999 +static void pos_init(flush_pos_t * pos)
14000 +{
14001 + memset(pos, 0, sizeof *pos);
14002 +
14003 + pos->state = POS_INVALID;
14004 + coord_init_invalid(&pos->coord, NULL);
14005 + init_lh(&pos->lock);
14006 + init_load_count(&pos->load);
14007 +
14008 + reiser4_blocknr_hint_init(&pos->preceder);
14009 +}
14010 +
14011 +/* The flush loop inside squalloc periodically checks pos_valid to
14012 + determine when "enough flushing" has been performed. This will return true until one
14013 + of the following conditions is met:
14014 +
14015 + 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14016 + parameter, meaning we have flushed as many blocks as the kernel requested. When
14017 + flushing to commit, this parameter is NULL.
14018 +
14019 + 2. pos_stop() is called because squalloc discovers that the "next" node in the
14020 + flush order is either non-existent, not dirty, or not in the same atom.
14021 +*/
14022 +
14023 +static int pos_valid(flush_pos_t * pos)
14024 +{
14025 + return pos->state != POS_INVALID;
14026 +}
14027 +
14028 +/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14029 +static void pos_done(flush_pos_t * pos)
14030 +{
14031 + pos_stop(pos);
14032 + reiser4_blocknr_hint_done(&pos->preceder);
14033 + if (convert_data(pos))
14034 + free_convert_data(pos);
14035 +}
14036 +
14037 +/* Reset the point and parent. Called during flush subroutines to terminate the
14038 + squalloc loop. */
14039 +static int pos_stop(flush_pos_t * pos)
14040 +{
14041 + pos->state = POS_INVALID;
14042 + done_lh(&pos->lock);
14043 + done_load_count(&pos->load);
14044 + coord_init_invalid(&pos->coord, NULL);
14045 +
14046 + if (pos->child) {
14047 + jput(pos->child);
14048 + pos->child = NULL;
14049 + }
14050 +
14051 + return 0;
14052 +}
14053 +
14054 +/* Return the flush_position's block allocator hint. */
14055 +reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
14056 +{
14057 + return &pos->preceder;
14058 +}
14059 +
14060 +flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
14061 +{
14062 + return pos->fq;
14063 +}
14064 +
14065 +/* Make Linus happy.
14066 + Local variables:
14067 + c-indentation-style: "K&R"
14068 + mode-name: "LC"
14069 + c-basic-offset: 8
14070 + tab-width: 8
14071 + fill-column: 90
14072 + LocalWords: preceder
14073 + End:
14074 +*/
14075 diff -urN linux-2.6.20.orig/fs/reiser4/flush.h linux-2.6.20/fs/reiser4/flush.h
14076 --- linux-2.6.20.orig/fs/reiser4/flush.h 1970-01-01 03:00:00.000000000 +0300
14077 +++ linux-2.6.20/fs/reiser4/flush.h 2007-05-06 14:50:43.718981974 +0400
14078 @@ -0,0 +1,274 @@
14079 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14080 +
14081 +/* DECLARATIONS: */
14082 +
14083 +#if !defined(__REISER4_FLUSH_H__)
14084 +#define __REISER4_FLUSH_H__
14085 +
14086 +#include "plugin/cluster.h"
14087 +
14088 +/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14089 + single level of the tree. A flush-scan is used for counting the number of adjacent
14090 + nodes to flush, which is used to determine whether we should relocate, and it is also
14091 + used to find a starting point for flush. A flush-scan object can scan in both right
14092 + and left directions via the scan_left() and scan_right() interfaces. The
14093 + right- and left-variations are similar but perform different functions. When scanning
14094 + left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14095 + When scanning right we are simply counting the number of adjacent, dirty nodes. */
14096 +struct flush_scan {
14097 +
14098 + /* The current number of nodes scanned on this level. */
14099 + unsigned count;
14100 +
14101 + /* There may be a maximum number of nodes for a scan on any single level. When
14102 + going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
14103 + unsigned max_count;
14104 +
14105 + /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14106 + sideof direction;
14107 +
14108 + /* Initially @stop is set to false then set true once some condition stops the
14109 + search (e.g., we found a clean node before reaching max_count or we found a
14110 + node belonging to another atom). */
14111 + int stop;
14112 +
14113 + /* The current scan position. If @node is non-NULL then its reference count has
14114 + been incremented to reflect this reference. */
14115 + jnode *node;
14116 +
14117 + /* A handle for zload/zrelse of current scan position node. */
14118 + load_count node_load;
14119 +
14120 + /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14121 + node is locked using this lock handle. The endpoint needs to be locked for
14122 + transfer to the flush_position object after scanning finishes. */
14123 + lock_handle node_lock;
14124 +
14125 + /* When the position is unformatted, its parent, coordinate, and parent
14126 + zload/zrelse handle. */
14127 + lock_handle parent_lock;
14128 + coord_t parent_coord;
14129 + load_count parent_load;
14130 +
14131 + /* The block allocator preceder hint. Sometimes flush_scan determines what the
14132 + preceder is and if so it sets it here, after which it is copied into the
14133 + flush_position. Otherwise, the preceder is computed later. */
14134 + reiser4_block_nr preceder_blk;
14135 +};
14136 +
14137 +typedef struct convert_item_info {
14138 + dc_item_stat d_cur; /* disk cluster state of the current item */
14139 + dc_item_stat d_next; /* disk cluster state of the next slum item */
14140 + struct inode *inode;
14141 + flow_t flow;
14142 +} convert_item_info_t;
14143 +
14144 +typedef struct convert_info {
14145 + int count; /* for squalloc terminating */
14146 + reiser4_cluster_t clust; /* transform cluster */
14147 + item_plugin *iplug; /* current item plugin */
14148 + convert_item_info_t *itm; /* current item info */
14149 +} convert_info_t;
14150 +
14151 +typedef enum flush_position_state {
14152 + POS_INVALID, /* Invalid or stopped pos, do not continue slum
14153 + * processing */
14154 + POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14155 + * leaf level */
14156 + POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14157 + * to traverse unformatted nodes */
14158 + POS_TO_LEAF, /* pos is being moved to leaf level */
14159 + POS_TO_TWIG, /* pos is being moved to twig level */
14160 + POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after
14161 + * rightmost unit of the current twig */
14162 + POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14163 +} flushpos_state_t;
14164 +
14165 +/* An encapsulation of the current flush point and all the parameters that are passed
14166 + through the entire squeeze-and-allocate stage of the flush routine. A single
14167 + flush_position object is constructed after left- and right-scanning finishes. */
14168 +struct flush_position {
14169 + flushpos_state_t state;
14170 +
14171 + coord_t coord; /* coord to traverse unformatted nodes */
14172 + lock_handle lock; /* current lock we hold */
14173 + load_count load; /* load status for current locked formatted node */
14174 +
14175 + jnode *child; /* for passing a reference to unformatted child
14176 + * across pos state changes */
14177 +
14178 + reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14179 + int leaf_relocate; /* True if enough leaf-level nodes were
14180 + * found to suggest a relocate policy. */
14181 + int alloc_cnt; /* The number of nodes allocated during squeeze and allocate. */
14182 + int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14183 + flush_queue_t *fq;
14184 + long *nr_written; /* number of nodes submitted to disk */
14185 + int flags; /* a copy of jnode_flush flags argument */
14186 +
14187 + znode *prev_twig; /* previous parent pointer value, used to catch
14188 + * processing of new twig node */
14189 + convert_info_t *sq; /* convert info */
14190 +
14191 + unsigned long pos_in_unit; /* for extents only. Position
14192 + within an extent unit of first
14193 + jnode of slum */
14194 + long nr_to_write; /* number of unformatted nodes to handle on flush */
14195 +};
14196 +
14197 +static inline int item_convert_count(flush_pos_t * pos)
14198 +{
14199 + return pos->sq->count;
14200 +}
14201 +static inline void inc_item_convert_count(flush_pos_t * pos)
14202 +{
14203 + pos->sq->count++;
14204 +}
14205 +static inline void set_item_convert_count(flush_pos_t * pos, int count)
14206 +{
14207 + pos->sq->count = count;
14208 +}
14209 +static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14210 +{
14211 + return pos->sq->iplug;
14212 +}
14213 +
14214 +static inline convert_info_t *convert_data(flush_pos_t * pos)
14215 +{
14216 + return pos->sq;
14217 +}
14218 +
14219 +static inline convert_item_info_t *item_convert_data(flush_pos_t * pos)
14220 +{
14221 + assert("edward-955", convert_data(pos));
14222 + return pos->sq->itm;
14223 +}
14224 +
14225 +static inline tfm_cluster_t *tfm_cluster_sq(flush_pos_t * pos)
14226 +{
14227 + return &pos->sq->clust.tc;
14228 +}
14229 +
14230 +static inline tfm_stream_t *tfm_stream_sq(flush_pos_t * pos, tfm_stream_id id)
14231 +{
14232 + assert("edward-854", pos->sq != NULL);
14233 + return tfm_stream(tfm_cluster_sq(pos), id);
14234 +}
14235 +
14236 +static inline int chaining_data_present(flush_pos_t * pos)
14237 +{
14238 + return convert_data(pos) && item_convert_data(pos);
14239 +}
14240 +
14241 +/* Returns true if next node contains next item of the disk cluster
14242 + so item convert data should be moved to the right slum neighbor.
14243 +*/
14244 +static inline int should_chain_next_node(flush_pos_t * pos)
14245 +{
14246 + int result = 0;
14247 +
14248 + assert("edward-1007", chaining_data_present(pos));
14249 +
14250 + switch (item_convert_data(pos)->d_next) {
14251 + case DC_CHAINED_ITEM:
14252 + result = 1;
14253 + break;
14254 + case DC_AFTER_CLUSTER:
14255 + break;
14256 + default:
14257 + impossible("edward-1009", "bad state of next slum item");
14258 + }
14259 + return result;
14260 +}
14261 +
14262 +/* update item state in a disk cluster to assign conversion mode */
14263 +static inline void
14264 +move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14265 +{
14266 +
14267 + assert("edward-1010", chaining_data_present(pos));
14268 +
14269 + if (this_node == 0) {
14270 + /* next item is on the right neighbor */
14271 + assert("edward-1011",
14272 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14273 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14274 + assert("edward-1012",
14275 + item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14276 +
14277 + item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14278 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14279 + } else {
14280 + /* next item is on the same node */
14281 + assert("edward-1013",
14282 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14283 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14284 + assert("edward-1227",
14285 + item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14286 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
14287 +
14288 + item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14289 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14290 + }
14291 +}
14292 +
14293 +static inline int should_convert_node(flush_pos_t * pos, znode * node)
14294 +{
14295 + return znode_convertible(node);
14296 +}
14297 +
14298 +/* true if there is attached convert item info */
14299 +static inline int should_convert_next_node(flush_pos_t * pos, znode * node)
14300 +{
14301 + return convert_data(pos) && item_convert_data(pos);
14302 +}
14303 +
14304 +#define SQUALLOC_THRESHOLD 256
14305 +
14306 +static inline int should_terminate_squalloc(flush_pos_t * pos)
14307 +{
14308 + return convert_data(pos) &&
14309 + !item_convert_data(pos) &&
14310 + item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14311 +}
14312 +
14313 +void free_convert_data(flush_pos_t * pos);
14314 +/* used in extent.c */
14315 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14316 + const coord_t * parent);
14317 +int reiser4_scan_finished(flush_scan * scan);
14318 +int reiser4_scanning_left(flush_scan * scan);
14319 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14320 +txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14321 +int reiser4_alloc_extent(flush_pos_t *flush_pos);
14322 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14323 + reiser4_key *stop_key);
14324 +extern int reiser4_init_fqs(void);
14325 +extern void reiser4_done_fqs(void);
14326 +
14327 +#if REISER4_DEBUG
14328 +
14329 +extern void reiser4_check_fq(const txn_atom *atom);
14330 +extern atomic_t flush_cnt;
14331 +
14332 +#define check_preceder(blk) \
14333 +assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14334 +extern void check_pos(flush_pos_t * pos);
14335 +#else
14336 +#define check_preceder(b) noop
14337 +#define check_pos(pos) noop
14338 +#endif
14339 +
14340 +/* __REISER4_FLUSH_H__ */
14341 +#endif
14342 +
14343 +/* Make Linus happy.
14344 + Local variables:
14345 + c-indentation-style: "K&R"
14346 + mode-name: "LC"
14347 + c-basic-offset: 8
14348 + tab-width: 8
14349 + fill-column: 90
14350 + LocalWords: preceder
14351 + End:
14352 +*/
14353 diff -urN linux-2.6.20.orig/fs/reiser4/flush_queue.c linux-2.6.20/fs/reiser4/flush_queue.c
14354 --- linux-2.6.20.orig/fs/reiser4/flush_queue.c 1970-01-01 03:00:00.000000000 +0300
14355 +++ linux-2.6.20/fs/reiser4/flush_queue.c 2007-05-06 14:50:43.718981974 +0400
14356 @@ -0,0 +1,680 @@
14357 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14358 +
14359 +#include "debug.h"
14360 +#include "super.h"
14361 +#include "txnmgr.h"
14362 +#include "jnode.h"
14363 +#include "znode.h"
14364 +#include "page_cache.h"
14365 +#include "wander.h"
14366 +#include "vfs_ops.h"
14367 +#include "writeout.h"
14368 +#include "flush.h"
14369 +
14370 +#include <linux/bio.h>
14371 +#include <linux/mm.h>
14372 +#include <linux/pagemap.h>
14373 +#include <linux/blkdev.h>
14374 +#include <linux/writeback.h>
14375 +
14376 +/* A flush queue object is an accumulator for keeping jnodes prepared
14377 + by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14378 + kept on the flush queue until memory pressure or atom commit asks
14379 + flush queues to write some or all from their jnodes. */
14380 +
14381 +/*
14382 + LOCKING:
14383 +
14384 + fq->guard spin lock protects fq->atom pointer and nothing else. fq->prepped
14385 + list protected by atom spin lock. fq->prepped list uses the following
14386 + locking:
14387 +
14388 + two ways to protect fq->prepped list for read-only list traversal:
14389 +
14390 + 1. atom spin-lock atom.
14391 + 2. fq is IN_USE, atom->nr_running_queues increased.
14392 +
14393 + and one for list modification:
14394 +
14395 + 1. atom is spin-locked and one condition is true: fq is IN_USE or
14396 + atom->nr_running_queues == 0.
14397 +
14398 + The deadlock-safe order for flush queues and atoms is: first lock atom, then
14399 + lock flush queue, then lock jnode.
14400 +*/
14401 +
14402 +#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14403 +#define fq_ready(fq) (!fq_in_use(fq))
14404 +
14405 +#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14406 +#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14407 +
14408 +/* get lock on atom from locked flush queue object */
14409 +static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14410 +{
14411 + /* This code is similar to jnode_get_atom(), look at it for the
14412 + * explanation. */
14413 + txn_atom *atom;
14414 +
14415 + assert_spin_locked(&(fq->guard));
14416 +
14417 + while (1) {
14418 + atom = fq->atom;
14419 + if (atom == NULL)
14420 + break;
14421 +
14422 + if (spin_trylock_atom(atom))
14423 + break;
14424 +
14425 + atomic_inc(&atom->refcount);
14426 + spin_unlock(&(fq->guard));
14427 + spin_lock_atom(atom);
14428 + spin_lock(&(fq->guard));
14429 +
14430 + if (fq->atom == atom) {
14431 + atomic_dec(&atom->refcount);
14432 + break;
14433 + }
14434 +
14435 + spin_unlock(&(fq->guard));
14436 + atom_dec_and_unlock(atom);
14437 + spin_lock(&(fq->guard));
14438 + }
14439 +
14440 + return atom;
14441 +}
14442 +
14443 +txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14444 +{
14445 + txn_atom *atom;
14446 +
14447 + spin_lock(&(fq->guard));
14448 + atom = atom_locked_by_fq_nolock(fq);
14449 + spin_unlock(&(fq->guard));
14450 + return atom;
14451 +}
14452 +
14453 +static void init_fq(flush_queue_t * fq)
14454 +{
14455 + memset(fq, 0, sizeof *fq);
14456 +
14457 + atomic_set(&fq->nr_submitted, 0);
14458 +
14459 + INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14460 +
14461 + init_waitqueue_head(&fq->wait);
14462 + spin_lock_init(&fq->guard);
14463 +}
14464 +
14465 +/* slab for flush queues */
14466 +static struct kmem_cache *fq_slab;
14467 +
14468 +/**
14469 + * reiser4_init_fqs - create flush queue cache
14470 + *
14471 + * Initializes slab cache of flush queues. It is part of reiser4 module
14472 + * initialization.
14473 + */
14474 +int reiser4_init_fqs(void)
14475 +{
14476 + fq_slab = kmem_cache_create("fq",
14477 + sizeof(flush_queue_t),
14478 + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
14479 + if (fq_slab == NULL)
14480 + return RETERR(-ENOMEM);
14481 + return 0;
14482 +}
14483 +
14484 +/**
14485 + * reiser4_done_fqs - delete flush queue cache
14486 + *
14487 + * This is called on reiser4 module unloading or system shutdown.
14488 + */
14489 +void reiser4_done_fqs(void)
14490 +{
14491 + destroy_reiser4_cache(&fq_slab);
14492 +}
14493 +
14494 +/* create new flush queue object */
14495 +static flush_queue_t *create_fq(gfp_t gfp)
14496 +{
14497 + flush_queue_t *fq;
14498 +
14499 + fq = kmem_cache_alloc(fq_slab, gfp);
14500 + if (fq)
14501 + init_fq(fq);
14502 +
14503 + return fq;
14504 +}
14505 +
14506 +/* adjust atom's and flush queue's counters of queued nodes */
14507 +static void count_enqueued_node(flush_queue_t * fq)
14508 +{
14509 + ON_DEBUG(fq->atom->num_queued++);
14510 +}
14511 +
14512 +static void count_dequeued_node(flush_queue_t * fq)
14513 +{
14514 + assert("zam-993", fq->atom->num_queued > 0);
14515 + ON_DEBUG(fq->atom->num_queued--);
14516 +}
14517 +
14518 +/* attach flush queue object to the atom */
14519 +static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14520 +{
14521 + assert_spin_locked(&(atom->alock));
14522 + list_add(&fq->alink, &atom->flush_queues);
14523 + fq->atom = atom;
14524 + ON_DEBUG(atom->nr_flush_queues++);
14525 +}
14526 +
14527 +static void detach_fq(flush_queue_t * fq)
14528 +{
14529 + assert_spin_locked(&(fq->atom->alock));
14530 +
14531 + spin_lock(&(fq->guard));
14532 + list_del_init(&fq->alink);
14533 + assert("vs-1456", fq->atom->nr_flush_queues > 0);
14534 + ON_DEBUG(fq->atom->nr_flush_queues--);
14535 + fq->atom = NULL;
14536 + spin_unlock(&(fq->guard));
14537 +}
14538 +
14539 +/* destroy flush queue object */
14540 +static void done_fq(flush_queue_t * fq)
14541 +{
14542 + assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14543 + assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14544 +
14545 + kmem_cache_free(fq_slab, fq);
14546 +}
14547 +
14548 +/* */
14549 +static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
14550 +{
14551 + JF_SET(node, JNODE_FLUSH_QUEUED);
14552 + count_enqueued_node(fq);
14553 +}
14554 +
14555 +/* Putting jnode into the flush queue. Both atom and jnode should be
14556 + spin-locked. */
14557 +void queue_jnode(flush_queue_t * fq, jnode * node)
14558 +{
14559 + assert_spin_locked(&(node->guard));
14560 + assert("zam-713", node->atom != NULL);
14561 + assert_spin_locked(&(node->atom->alock));
14562 + assert("zam-716", fq->atom != NULL);
14563 + assert("zam-717", fq->atom == node->atom);
14564 + assert("zam-907", fq_in_use(fq));
14565 +
14566 + assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14567 + assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14568 + assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14569 + assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14570 +
14571 + mark_jnode_queued(fq, node);
14572 + list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14573 +
14574 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14575 + FQ_LIST, 1));
14576 +}
14577 +
14578 +/* repeatable process for waiting io completion on a flush queue object */
14579 +static int wait_io(flush_queue_t * fq, int *nr_io_errors)
14580 +{
14581 + assert("zam-738", fq->atom != NULL);
14582 + assert_spin_locked(&(fq->atom->alock));
14583 + assert("zam-736", fq_in_use(fq));
14584 + assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14585 +
14586 + if (atomic_read(&fq->nr_submitted) != 0) {
14587 + struct super_block *super;
14588 +
14589 + spin_unlock_atom(fq->atom);
14590 +
14591 + assert("nikita-3013", reiser4_schedulable());
14592 +
14593 + super = reiser4_get_current_sb();
14594 +
14595 + /* FIXME: this is instead of blk_run_queues() */
14596 + blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
14597 +
14598 + if (!(super->s_flags & MS_RDONLY))
14599 + wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
14600 +
14601 + /* Ask the caller to re-acquire the locks and call this
14602 + function again. Note: this technique is commonly used in
14603 + the txnmgr code. */
14604 + return -E_REPEAT;
14605 + }
14606 +
14607 + *nr_io_errors += atomic_read(&fq->nr_errors);
14608 + return 0;
14609 +}
14610 +
14611 +/* wait on I/O completion, re-submit dirty nodes to write */
14612 +static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
14613 +{
14614 + int ret;
14615 + txn_atom *atom = fq->atom;
14616 +
14617 + assert("zam-801", atom != NULL);
14618 + assert_spin_locked(&(atom->alock));
14619 + assert("zam-762", fq_in_use(fq));
14620 +
14621 + ret = wait_io(fq, nr_io_errors);
14622 + if (ret)
14623 + return ret;
14624 +
14625 + detach_fq(fq);
14626 + done_fq(fq);
14627 +
14628 + reiser4_atom_send_event(atom);
14629 +
14630 + return 0;
14631 +}
14632 +
14633 +/* wait for all i/o for given atom to be completed, actually do one iteration
14634 + on that and return -E_REPEAT if there are more iterations needed */
14635 +static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14636 +{
14637 + flush_queue_t *fq;
14638 +
14639 + assert_spin_locked(&(atom->alock));
14640 +
14641 + if (list_empty_careful(&atom->flush_queues))
14642 + return 0;
14643 +
14644 + list_for_each_entry(fq, &atom->flush_queues, alink) {
14645 + if (fq_ready(fq)) {
14646 + int ret;
14647 +
14648 + mark_fq_in_use(fq);
14649 + assert("vs-1247", fq->owner == NULL);
14650 + ON_DEBUG(fq->owner = current);
14651 + ret = finish_fq(fq, nr_io_errors);
14652 +
14653 + if (*nr_io_errors)
14654 + reiser4_handle_error();
14655 +
14656 + if (ret) {
14657 + reiser4_fq_put(fq);
14658 + return ret;
14659 + }
14660 +
14661 + spin_unlock_atom(atom);
14662 +
14663 + return -E_REPEAT;
14664 + }
14665 + }
14666 +
14667 + /* All flush queues are in use; atom remains locked */
14668 + return -EBUSY;
14669 +}
14670 +
14671 +/* wait all i/o for current atom */
14672 +int current_atom_finish_all_fq(void)
14673 +{
14674 + txn_atom *atom;
14675 + int nr_io_errors = 0;
14676 + int ret = 0;
14677 +
14678 + do {
14679 + while (1) {
14680 + atom = get_current_atom_locked();
14681 + ret = finish_all_fq(atom, &nr_io_errors);
14682 + if (ret != -EBUSY)
14683 + break;
14684 + reiser4_atom_wait_event(atom);
14685 + }
14686 + } while (ret == -E_REPEAT);
14687 +
14688 + /* we do not need locked atom after this function finishes, SUCCESS or
14689 + -EBUSY are two return codes when atom remains locked after
14690 + finish_all_fq */
14691 + if (!ret)
14692 + spin_unlock_atom(atom);
14693 +
14694 + assert_spin_not_locked(&(atom->alock));
14695 +
14696 + if (ret)
14697 + return ret;
14698 +
14699 + if (nr_io_errors)
14700 + return RETERR(-EIO);
14701 +
14702 + return 0;
14703 +}
14704 +
14705 +/* change node->atom field for all jnode from given list */
14706 +static void
14707 +scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
14708 +{
14709 + jnode *cur;
14710 +
14711 + list_for_each_entry(cur, list, capture_link) {
14712 + spin_lock_jnode(cur);
14713 + cur->atom = atom;
14714 + spin_unlock_jnode(cur);
14715 + }
14716 +}
14717 +
14718 +/* support for atom fusion operation */
14719 +void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
14720 +{
14721 + flush_queue_t *fq;
14722 +
14723 + assert_spin_locked(&(to->alock));
14724 + assert_spin_locked(&(from->alock));
14725 +
14726 + list_for_each_entry(fq, &from->flush_queues, alink) {
14727 + scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
14728 + spin_lock(&(fq->guard));
14729 + fq->atom = to;
14730 + spin_unlock(&(fq->guard));
14731 + }
14732 +
14733 + list_splice_init(&from->flush_queues, to->flush_queues.prev);
14734 +
14735 +#if REISER4_DEBUG
14736 + to->num_queued += from->num_queued;
14737 + to->nr_flush_queues += from->nr_flush_queues;
14738 + from->nr_flush_queues = 0;
14739 +#endif
14740 +}
14741 +
14742 +#if REISER4_DEBUG
14743 +int atom_fq_parts_are_clean(txn_atom * atom)
14744 +{
14745 + assert("zam-915", atom != NULL);
14746 + return list_empty_careful(&atom->flush_queues);
14747 +}
14748 +#endif
14749 +/* Bio i/o completion routine for reiser4 write operations. */
14750 +static int
14751 +end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
14752 + int err)
14753 +{
14754 + int i;
14755 + int nr_errors = 0;
14756 + flush_queue_t *fq;
14757 +
14758 + assert("zam-958", bio->bi_rw & WRITE);
14759 +
14760 + /* i/o op. is not fully completed */
14761 + if (bio->bi_size != 0)
14762 + return 1;
14763 +
14764 + if (err == -EOPNOTSUPP)
14765 + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
14766 +
14767 + /* we expect that bio->private is set to NULL or fq object which is used
14768 + * for synchronization and error counting. */
14769 + fq = bio->bi_private;
14770 + /* Check all elements of io_vec for correct write completion. */
14771 + for (i = 0; i < bio->bi_vcnt; i += 1) {
14772 + struct page *pg = bio->bi_io_vec[i].bv_page;
14773 +
14774 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
14775 + SetPageError(pg);
14776 + nr_errors++;
14777 + }
14778 +
14779 + {
14780 + /* jnode WRITEBACK ("write is in progress bit") is
14781 + * atomically cleared here. */
14782 + jnode *node;
14783 +
14784 + assert("zam-736", pg != NULL);
14785 + assert("zam-736", PagePrivate(pg));
14786 + node = jprivate(pg);
14787 +
14788 + JF_CLR(node, JNODE_WRITEBACK);
14789 + }
14790 +
14791 + end_page_writeback(pg);
14792 + page_cache_release(pg);
14793 + }
14794 +
14795 + if (fq) {
14796 + /* count i/o error in fq object */
14797 + atomic_add(nr_errors, &fq->nr_errors);
14798 +
14799 + /* If all write requests registered in this "fq" are done we up
14800 + * the waiter. */
14801 + if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
14802 + wake_up(&fq->wait);
14803 + }
14804 +
14805 + bio_put(bio);
14806 + return 0;
14807 +}
14808 +
14809 +/* Count I/O requests which will be submitted by @bio in the given flush queue
14810 + @fq */
14811 +void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
14812 +{
14813 + bio->bi_private = fq;
14814 + bio->bi_end_io = end_io_handler;
14815 +
14816 + if (fq)
14817 + atomic_add(bio->bi_vcnt, &fq->nr_submitted);
14818 +}
14819 +
14820 +/* Move all queued nodes out from @fq->prepped list. */
14821 +static void release_prepped_list(flush_queue_t * fq)
14822 +{
14823 + txn_atom *atom;
14824 +
14825 + assert("zam-904", fq_in_use(fq));
14826 + atom = atom_locked_by_fq(fq);
14827 +
14828 + while (!list_empty(ATOM_FQ_LIST(fq))) {
14829 + jnode *cur;
14830 +
14831 + cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
14832 + list_del_init(&cur->capture_link);
14833 +
14834 + count_dequeued_node(fq);
14835 + spin_lock_jnode(cur);
14836 + assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
14837 + assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
14838 + assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
14839 + JF_CLR(cur, JNODE_FLUSH_QUEUED);
14840 +
14841 + if (JF_ISSET(cur, JNODE_DIRTY)) {
14842 + list_add_tail(&cur->capture_link,
14843 + ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
14844 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14845 + DIRTY_LIST, 1));
14846 + } else {
14847 + list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
14848 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14849 + CLEAN_LIST, 1));
14850 + }
14851 +
14852 + spin_unlock_jnode(cur);
14853 + }
14854 +
14855 + if (--atom->nr_running_queues == 0)
14856 + reiser4_atom_send_event(atom);
14857 +
14858 + spin_unlock_atom(atom);
14859 +}
14860 +
14861 +/* Submit write requests for nodes on the already filled flush queue @fq.
14862 +
14863 + @fq: flush queue object which contains jnodes we can (and will) write.
14864 + @return: number of submitted blocks (>=0) if success, otherwise -- an error
14865 + code (<0). */
14866 +int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
14867 +{
14868 + int ret;
14869 + txn_atom *atom;
14870 +
14871 + while (1) {
14872 + atom = atom_locked_by_fq(fq);
14873 + assert("zam-924", atom);
14874 + /* do not write fq in parallel. */
14875 + if (atom->nr_running_queues == 0
14876 + || !(flags & WRITEOUT_SINGLE_STREAM))
14877 + break;
14878 + reiser4_atom_wait_event(atom);
14879 + }
14880 +
14881 + atom->nr_running_queues++;
14882 + spin_unlock_atom(atom);
14883 +
14884 + ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
14885 + release_prepped_list(fq);
14886 +
14887 + return ret;
14888 +}
14889 +
14890 +/* Getting flush queue object for exclusive use by one thread. May require
14891 + several iterations which is indicated by -E_REPEAT return code.
14892 +
14893 + This function does not contain code for obtaining an atom lock because an
14894 + atom lock is obtained by different ways in different parts of reiser4,
14895 + usually it is current atom, but we need a possibility for getting fq for the
14896 + atom of given jnode. */
14897 +static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
14898 +{
14899 + flush_queue_t *fq;
14900 +
14901 + assert_spin_locked(&(atom->alock));
14902 +
14903 + fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
14904 + while (&atom->flush_queues != &fq->alink) {
14905 + spin_lock(&(fq->guard));
14906 +
14907 + if (fq_ready(fq)) {
14908 + mark_fq_in_use(fq);
14909 + assert("vs-1246", fq->owner == NULL);
14910 + ON_DEBUG(fq->owner = current);
14911 + spin_unlock(&(fq->guard));
14912 +
14913 + if (*new_fq)
14914 + done_fq(*new_fq);
14915 +
14916 + *new_fq = fq;
14917 +
14918 + return 0;
14919 + }
14920 +
14921 + spin_unlock(&(fq->guard));
14922 +
14923 + fq = list_entry(fq->alink.next, flush_queue_t, alink);
14924 + }
14925 +
14926 + /* Use previously allocated fq object */
14927 + if (*new_fq) {
14928 + mark_fq_in_use(*new_fq);
14929 + assert("vs-1248", (*new_fq)->owner == 0);
14930 + ON_DEBUG((*new_fq)->owner = current);
14931 + attach_fq(atom, *new_fq);
14932 +
14933 + return 0;
14934 + }
14935 +
14936 + spin_unlock_atom(atom);
14937 +
14938 + *new_fq = create_fq(gfp);
14939 +
14940 + if (*new_fq == NULL)
14941 + return RETERR(-ENOMEM);
14942 +
14943 + return RETERR(-E_REPEAT);
14944 +}
14945 +
14946 +int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
14947 +{
14948 + return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
14949 +}
14950 +
14951 +/* A wrapper around reiser4_fq_by_atom for getting a flush queue
14952 + object for current atom, if success fq->atom remains locked. */
14953 +flush_queue_t *get_fq_for_current_atom(void)
14954 +{
14955 + flush_queue_t *fq = NULL;
14956 + txn_atom *atom;
14957 + int ret;
14958 +
14959 + do {
14960 + atom = get_current_atom_locked();
14961 + ret = reiser4_fq_by_atom(atom, &fq);
14962 + } while (ret == -E_REPEAT);
14963 +
14964 + if (ret)
14965 + return ERR_PTR(ret);
14966 + return fq;
14967 +}
14968 +
14969 +/* Releasing flush queue object after exclusive use */
14970 +void reiser4_fq_put_nolock(flush_queue_t *fq)
14971 +{
14972 + assert("zam-747", fq->atom != NULL);
14973 + assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
14974 + mark_fq_ready(fq);
14975 + assert("vs-1245", fq->owner == current);
14976 + ON_DEBUG(fq->owner = NULL);
14977 +}
14978 +
14979 +void reiser4_fq_put(flush_queue_t * fq)
14980 +{
14981 + txn_atom *atom;
14982 +
14983 + spin_lock(&(fq->guard));
14984 + atom = atom_locked_by_fq_nolock(fq);
14985 +
14986 + assert("zam-746", atom != NULL);
14987 +
14988 + reiser4_fq_put_nolock(fq);
14989 + reiser4_atom_send_event(atom);
14990 +
14991 + spin_unlock(&(fq->guard));
14992 + spin_unlock_atom(atom);
14993 +}
14994 +
14995 +/* A part of atom object initialization related to the embedded flush queue
14996 + list head */
14997 +
14998 +void init_atom_fq_parts(txn_atom *atom)
14999 +{
15000 + INIT_LIST_HEAD(&atom->flush_queues);
15001 +}
15002 +
15003 +#if REISER4_DEBUG
15004 +
15005 +void reiser4_check_fq(const txn_atom *atom)
15006 +{
15007 + /* check number of nodes on all atom's flush queues */
15008 + flush_queue_t *fq;
15009 + int count;
15010 + struct list_head *pos;
15011 +
15012 + count = 0;
15013 + list_for_each_entry(fq, &atom->flush_queues, alink) {
15014 + spin_lock(&(fq->guard));
15015 + /* calculate number of jnodes on fq' list of prepped jnodes */
15016 + list_for_each(pos, ATOM_FQ_LIST(fq))
15017 + count++;
15018 + spin_unlock(&(fq->guard));
15019 + }
15020 + if (count != atom->fq)
15021 + warning("", "fq counter %d, real %d\n", atom->fq, count);
15022 +
15023 +}
15024 +
15025 +#endif
15026 +
15027 +/*
15028 + * Local variables:
15029 + * c-indentation-style: "K&R"
15030 + * mode-name: "LC"
15031 + * c-basic-offset: 8
15032 + * tab-width: 8
15033 + * fill-column: 79
15034 + * scroll-step: 1
15035 + * End:
15036 + */
15037 diff -urN linux-2.6.20.orig/fs/reiser4/forward.h linux-2.6.20/fs/reiser4/forward.h
15038 --- linux-2.6.20.orig/fs/reiser4/forward.h 1970-01-01 03:00:00.000000000 +0300
15039 +++ linux-2.6.20/fs/reiser4/forward.h 2007-05-06 14:50:43.718981974 +0400
15040 @@ -0,0 +1,256 @@
15041 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15042 +
15043 +/* Forward declarations. Thank you Kernighan. */
15044 +
15045 +#if !defined( __REISER4_FORWARD_H__ )
15046 +#define __REISER4_FORWARD_H__
15047 +
15048 +#include <asm/errno.h>
15049 +#include <linux/types.h>
15050 +
15051 +typedef struct zlock zlock;
15052 +typedef struct lock_stack lock_stack;
15053 +typedef struct lock_handle lock_handle;
15054 +typedef struct znode znode;
15055 +typedef struct flow flow_t;
15056 +typedef struct coord coord_t;
15057 +typedef struct tree_access_pointer tap_t;
15058 +typedef struct item_coord item_coord;
15059 +typedef struct shift_params shift_params;
15060 +typedef struct reiser4_object_create_data reiser4_object_create_data;
15061 +typedef union reiser4_plugin reiser4_plugin;
15062 +typedef __u16 reiser4_plugin_id;
15063 +typedef __u64 reiser4_plugin_groups;
15064 +typedef struct item_plugin item_plugin;
15065 +typedef struct jnode_plugin jnode_plugin;
15066 +typedef struct reiser4_item_data reiser4_item_data;
15067 +typedef union reiser4_key reiser4_key;
15068 +typedef struct reiser4_tree reiser4_tree;
15069 +typedef struct carry_cut_data carry_cut_data;
15070 +typedef struct carry_kill_data carry_kill_data;
15071 +typedef struct carry_tree_op carry_tree_op;
15072 +typedef struct carry_tree_node carry_tree_node;
15073 +typedef struct carry_plugin_info carry_plugin_info;
15074 +typedef struct reiser4_journal reiser4_journal;
15075 +typedef struct txn_atom txn_atom;
15076 +typedef struct txn_handle txn_handle;
15077 +typedef struct txn_mgr txn_mgr;
15078 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15079 +typedef struct reiser4_context reiser4_context;
15080 +typedef struct carry_level carry_level;
15081 +typedef struct blocknr_set_entry blocknr_set_entry;
15082 +/* super_block->s_fs_info points to this */
15083 +typedef struct reiser4_super_info_data reiser4_super_info_data;
15084 +/* next two objects are fields of reiser4_super_info_data */
15085 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15086 +typedef struct reiser4_space_allocator reiser4_space_allocator;
15087 +
15088 +typedef struct flush_scan flush_scan;
15089 +typedef struct flush_position flush_pos_t;
15090 +
15091 +typedef unsigned short pos_in_node_t;
15092 +#define MAX_POS_IN_NODE 65535
15093 +
15094 +typedef struct jnode jnode;
15095 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15096 +
15097 +typedef struct uf_coord uf_coord_t;
15098 +typedef struct hint hint_t;
15099 +
15100 +typedef struct ktxnmgrd_context ktxnmgrd_context;
15101 +
15102 +typedef struct reiser4_xattr_plugin reiser4_xattr_plugin;
15103 +
15104 +struct inode;
15105 +struct page;
15106 +struct file;
15107 +struct dentry;
15108 +struct super_block;
15109 +
15110 +/* return values of coord_by_key(). cbk == coord_by_key */
15111 +typedef enum {
15112 + CBK_COORD_FOUND = 0,
15113 + CBK_COORD_NOTFOUND = -ENOENT,
15114 +} lookup_result;
15115 +
15116 +/* results of lookup with directory file */
15117 +typedef enum {
15118 + FILE_NAME_FOUND = 0,
15119 + FILE_NAME_NOTFOUND = -ENOENT,
15120 + FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15121 + FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15122 +} file_lookup_result;
15123 +
15124 +/* behaviors of lookup. If coord we are looking for is actually in a tree,
15125 + both coincide. */
15126 +typedef enum {
15127 + /* search exactly for the coord with key given */
15128 + FIND_EXACT,
15129 + /* search for coord with the maximal key not greater than one
15130 + given */
15131 + FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15132 +} lookup_bias;
15133 +
15134 +typedef enum {
15135 + /* number of leaf level of the tree
15136 + The fake root has (tree_level=0). */
15137 + LEAF_LEVEL = 1,
15138 +
15139 + /* number of level one above leaf level of the tree.
15140 +
15141 + It is supposed that internal tree used by reiser4 to store file
15142 + system data and meta data will have height 2 initially (when
15143 + created by mkfs).
15144 + */
15145 + TWIG_LEVEL = 2,
15146 +} tree_level;
15147 +
15148 +/* The "real" maximum ztree height is the 0-origin size of any per-level
15149 + array, since the zero'th level is not used. */
15150 +#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15151 +
15152 +/* enumeration of possible mutual position of item and coord. This enum is
15153 + return type of ->is_in_item() item plugin method which see. */
15154 +typedef enum {
15155 + /* coord is on the left of an item */
15156 + IP_ON_THE_LEFT,
15157 + /* coord is inside item */
15158 + IP_INSIDE,
15159 + /* coord is inside item, but to the right of the rightmost unit of
15160 + this item */
15161 + IP_RIGHT_EDGE,
15162 + /* coord is on the right of an item */
15163 + IP_ON_THE_RIGHT
15164 +} interposition;
15165 +
15166 +/* type of lock to acquire on znode before returning it to caller */
15167 +typedef enum {
15168 + ZNODE_NO_LOCK = 0,
15169 + ZNODE_READ_LOCK = 1,
15170 + ZNODE_WRITE_LOCK = 2,
15171 +} znode_lock_mode;
15172 +
15173 +/* type of lock request */
15174 +typedef enum {
15175 + ZNODE_LOCK_LOPRI = 0,
15176 + ZNODE_LOCK_HIPRI = (1 << 0),
15177 +
15178 + /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15179 + waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15180 + return the value -E_REPEAT. */
15181 + ZNODE_LOCK_NONBLOCK = (1 << 1),
15182 + /* An option for longterm_lock_znode which prevents atom fusion */
15183 + ZNODE_LOCK_DONT_FUSE = (1 << 2)
15184 +} znode_lock_request;
15185 +
15186 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15187 +
15188 +/* used to specify direction of shift. These must be -1 and 1 */
15189 +typedef enum {
15190 + SHIFT_LEFT = 1,
15191 + SHIFT_RIGHT = -1
15192 +} shift_direction;
15193 +
15194 +typedef enum {
15195 + LEFT_SIDE,
15196 + RIGHT_SIDE
15197 +} sideof;
15198 +
15199 +#define round_up( value, order ) \
15200 + ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15201 + ~( ( order ) - 1 ) ) )
15202 +
15203 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
15204 +typedef enum {
15205 + /* unit of internal item is moved */
15206 + SUBTREE_MOVED = 0,
15207 + /* nothing else can be squeezed into left neighbor */
15208 + SQUEEZE_TARGET_FULL = 1,
15209 + /* all content of node is squeezed into its left neighbor */
15210 + SQUEEZE_SOURCE_EMPTY = 2,
15211 + /* one more item is copied (this is only returned by
15212 + allocate_and_copy_extent to squalloc_twig)) */
15213 + SQUEEZE_CONTINUE = 3
15214 +} squeeze_result;
15215 +
15216 +/* Do not change items ids. If you do - there will be format change */
15217 +typedef enum {
15218 + STATIC_STAT_DATA_ID = 0x0,
15219 + SIMPLE_DIR_ENTRY_ID = 0x1,
15220 + COMPOUND_DIR_ID = 0x2,
15221 + NODE_POINTER_ID = 0x3,
15222 + EXTENT_POINTER_ID = 0x5,
15223 + FORMATTING_ID = 0x6,
15224 + CTAIL_ID = 0x7,
15225 + BLACK_BOX_ID = 0x8,
15226 + LAST_ITEM_ID = 0x9
15227 +} item_id;
15228 +
15229 +/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15230 + whether commit() was called or VM memory pressure was applied. */
15231 +typedef enum {
15232 + /* submit flush queue to disk at jnode_flush completion */
15233 + JNODE_FLUSH_WRITE_BLOCKS = 1,
15234 +
15235 + /* flush is called for commit */
15236 + JNODE_FLUSH_COMMIT = 2,
15237 + /* not implemented */
15238 + JNODE_FLUSH_MEMORY_FORMATTED = 4,
15239 +
15240 + /* not implemented */
15241 + JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15242 +} jnode_flush_flags;
15243 +
15244 +/* Flags to insert/paste carry operations. Currently they only used in
15245 + flushing code, but in future, they can be used to optimize for repetitive
15246 + accesses. */
15247 +typedef enum {
15248 + /* carry is not allowed to shift data to the left when trying to find
15249 + free space */
15250 + COPI_DONT_SHIFT_LEFT = (1 << 0),
15251 + /* carry is not allowed to shift data to the right when trying to find
15252 + free space */
15253 + COPI_DONT_SHIFT_RIGHT = (1 << 1),
15254 + /* carry is not allowed to allocate new node(s) when trying to find
15255 + free space */
15256 + COPI_DONT_ALLOCATE = (1 << 2),
15257 + /* try to load left neighbor if its not in a cache */
15258 + COPI_LOAD_LEFT = (1 << 3),
15259 + /* try to load right neighbor if its not in a cache */
15260 + COPI_LOAD_RIGHT = (1 << 4),
15261 + /* shift insertion point to the left neighbor */
15262 + COPI_GO_LEFT = (1 << 5),
15263 + /* shift insertion point to the right neighbor */
15264 + COPI_GO_RIGHT = (1 << 6),
15265 + /* try to step back into original node if insertion into new node
15266 + fails after shifting data there. */
15267 + COPI_STEP_BACK = (1 << 7)
15268 +} cop_insert_flag;
15269 +
15270 +typedef enum {
15271 + SAFE_UNLINK, /* safe-link for unlink */
15272 + SAFE_TRUNCATE /* safe-link for truncate */
15273 +} reiser4_safe_link_t;
15274 +
15275 +/* this is to show on which list of atom jnode is */
15276 +typedef enum {
15277 + NOT_CAPTURED,
15278 + DIRTY_LIST,
15279 + CLEAN_LIST,
15280 + FQ_LIST,
15281 + WB_LIST,
15282 + OVRWR_LIST
15283 +} atom_list;
15284 +
15285 +/* __REISER4_FORWARD_H__ */
15286 +#endif
15287 +
15288 +/* Make Linus happy.
15289 + Local variables:
15290 + c-indentation-style: "K&R"
15291 + mode-name: "LC"
15292 + c-basic-offset: 8
15293 + tab-width: 8
15294 + fill-column: 120
15295 + End:
15296 +*/
15297 diff -urN linux-2.6.20.orig/fs/reiser4/fsdata.c linux-2.6.20/fs/reiser4/fsdata.c
15298 --- linux-2.6.20.orig/fs/reiser4/fsdata.c 1970-01-01 03:00:00.000000000 +0300
15299 +++ linux-2.6.20/fs/reiser4/fsdata.c 2007-05-06 14:50:43.722983224 +0400
15300 @@ -0,0 +1,803 @@
15301 +/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15302 + * reiser4/README */
15303 +
15304 +#include "fsdata.h"
15305 +#include "inode.h"
15306 +
15307 +/* cache or dir_cursors */
15308 +static struct kmem_cache *d_cursor_cache;
15309 +static struct shrinker *d_cursor_shrinker;
15310 +
15311 +/* list of unused cursors */
15312 +static LIST_HEAD(cursor_cache);
15313 +
15314 +/* number of cursors in list of ununsed cursors */
15315 +static unsigned long d_cursor_unused = 0;
15316 +
15317 +/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15318 +DEFINE_SPINLOCK(d_lock);
15319 +
15320 +static reiser4_file_fsdata *create_fsdata(struct file *file);
15321 +static int file_is_stateless(struct file *file);
15322 +static void free_fsdata(reiser4_file_fsdata *fsdata);
15323 +static void kill_cursor(dir_cursor *);
15324 +
15325 +/**
15326 + * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15327 + * @nr: number of objects to free
15328 + * @mask: GFP mask
15329 + *
15330 + * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested
15331 + * number. Return number of still freeable cursors.
15332 + */
15333 +static int d_cursor_shrink(int nr, gfp_t mask)
15334 +{
15335 + if (nr != 0) {
15336 + dir_cursor *scan;
15337 + int killed;
15338 +
15339 + killed = 0;
15340 + spin_lock(&d_lock);
15341 + while (!list_empty(&cursor_cache)) {
15342 + scan = list_entry(cursor_cache.next, dir_cursor, alist);
15343 + assert("nikita-3567", scan->ref == 0);
15344 + kill_cursor(scan);
15345 + ++killed;
15346 + --nr;
15347 + if (nr == 0)
15348 + break;
15349 + }
15350 + spin_unlock(&d_lock);
15351 + }
15352 + return d_cursor_unused;
15353 +}
15354 +
15355 +/**
15356 + * reiser4_init_d_cursor - create d_cursor cache
15357 + *
15358 + * Initializes slab cache of d_cursors. It is part of reiser4 module
15359 + * initialization.
15360 + */
15361 +int reiser4_init_d_cursor(void)
15362 +{
15363 + d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15364 + SLAB_HWCACHE_ALIGN, NULL, NULL);
15365 + if (d_cursor_cache == NULL)
15366 + return RETERR(-ENOMEM);
15367 +
15368 + /*
15369 + * actually, d_cursors are "priceless", because there is no way to
15370 + * recover information stored in them. On the other hand, we don't
15371 + * want to consume all kernel memory by them. As a compromise, just
15372 + * assign higher "seeks" value to d_cursor cache, so that it will be
15373 + * shrunk only if system is really tight on memory.
15374 + */
15375 + d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3,
15376 + d_cursor_shrink);
15377 + if (d_cursor_shrinker == NULL) {
15378 + destroy_reiser4_cache(&d_cursor_cache);
15379 + d_cursor_cache = NULL;
15380 + return RETERR(-ENOMEM);
15381 + }
15382 + return 0;
15383 +}
15384 +
15385 +/**
15386 + * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15387 + *
15388 + * This is called on reiser4 module unloading or system shutdown.
15389 + */
15390 +void reiser4_done_d_cursor(void)
15391 +{
15392 + BUG_ON(d_cursor_shrinker == NULL);
15393 + remove_shrinker(d_cursor_shrinker);
15394 + d_cursor_shrinker = NULL;
15395 +
15396 + destroy_reiser4_cache(&d_cursor_cache);
15397 +}
15398 +
15399 +#define D_CURSOR_TABLE_SIZE (256)
15400 +
15401 +static inline unsigned long
15402 +d_cursor_hash(d_cursor_hash_table *table, const d_cursor_key *key)
15403 +{
15404 + assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15405 + return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15406 +}
15407 +
15408 +static inline int d_cursor_eq(const d_cursor_key *k1, const d_cursor_key *k2)
15409 +{
15410 + return k1->cid == k2->cid && k1->oid == k2->oid;
15411 +}
15412 +
15413 +/*
15414 + * define functions to manipulate reiser4 super block's hash table of
15415 + * dir_cursors
15416 + */
15417 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15418 +#define KFREE(ptr, size) kfree(ptr)
15419 +TYPE_SAFE_HASH_DEFINE(d_cursor,
15420 + dir_cursor,
15421 + d_cursor_key, key, hash, d_cursor_hash, d_cursor_eq);
15422 +#undef KFREE
15423 +#undef KMALLOC
15424 +
15425 +/**
15426 + * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15427 + * @super: super block to initialize
15428 + *
15429 + * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15430 + * of mount.
15431 + */
15432 +int reiser4_init_super_d_info(struct super_block *super)
15433 +{
15434 + d_cursor_info *p;
15435 +
15436 + p = &get_super_private(super)->d_info;
15437 +
15438 + INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15439 + return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15440 +}
15441 +
15442 +/**
15443 + * reiser4_done_super_d_info - release per-super-block d_cursor resources
15444 + * @super: super block being umounted
15445 + *
15446 + * It is called on umount. Kills all directory cursors attached to suoer block.
15447 + */
15448 +void reiser4_done_super_d_info(struct super_block *super)
15449 +{
15450 + d_cursor_info *d_info;
15451 + dir_cursor *cursor, *next;
15452 +
15453 + d_info = &get_super_private(super)->d_info;
15454 + for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15455 + kill_cursor(cursor);
15456 +
15457 + BUG_ON(d_info->tree.rnode != NULL);
15458 + d_cursor_hash_done(&d_info->table);
15459 +}
15460 +
15461 +/**
15462 + * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15463 + * @cursor: cursor to free
15464 + *
15465 + * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15466 + * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from from
15467 + * indices, hash table, list of unused cursors and frees it.
15468 + */
15469 +static void kill_cursor(dir_cursor *cursor)
15470 +{
15471 + unsigned long index;
15472 +
15473 + assert("nikita-3566", cursor->ref == 0);
15474 + assert("nikita-3572", cursor->fsdata != NULL);
15475 +
15476 + index = (unsigned long)cursor->key.oid;
15477 + list_del_init(&cursor->fsdata->dir.linkage);
15478 + free_fsdata(cursor->fsdata);
15479 + cursor->fsdata = NULL;
15480 +
15481 + if (list_empty_careful(&cursor->list))
15482 + /* this is last cursor for a file. Kill radix-tree entry */
15483 + radix_tree_delete(&cursor->info->tree, index);
15484 + else {
15485 + void **slot;
15486 +
15487 + /*
15488 + * there are other cursors for the same oid.
15489 + */
15490 +
15491 + /*
15492 + * if radix tree point to the cursor being removed, re-target
15493 + * radix tree slot to the next cursor in the (non-empty as was
15494 + * checked above) element of the circular list of all cursors
15495 + * for this oid.
15496 + */
15497 + slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15498 + assert("nikita-3571", *slot != NULL);
15499 + if (*slot == cursor)
15500 + *slot = list_entry(cursor->list.next, dir_cursor, list);
15501 + /* remove cursor from circular list */
15502 + list_del_init(&cursor->list);
15503 + }
15504 + /* remove cursor from the list of unused cursors */
15505 + list_del_init(&cursor->alist);
15506 + /* remove cursor from the hash table */
15507 + d_cursor_hash_remove(&cursor->info->table, cursor);
15508 + /* and free it */
15509 + kmem_cache_free(d_cursor_cache, cursor);
15510 + --d_cursor_unused;
15511 +}
15512 +
15513 +/* possible actions that can be performed on all cursors for the given file */
15514 +enum cursor_action {
15515 + /*
15516 + * load all detached state: this is called when stat-data is loaded
15517 + * from the disk to recover information about all pending readdirs
15518 + */
15519 + CURSOR_LOAD,
15520 + /*
15521 + * detach all state from inode, leaving it in the cache. This is called
15522 + * when inode is removed form the memory by memory pressure
15523 + */
15524 + CURSOR_DISPOSE,
15525 + /*
15526 + * detach cursors from the inode, and free them. This is called when
15527 + * inode is destroyed
15528 + */
15529 + CURSOR_KILL
15530 +};
15531 +
15532 +/*
15533 + * return d_cursor data for the file system @inode is in.
15534 + */
15535 +static inline d_cursor_info *d_info(struct inode *inode)
15536 +{
15537 + return &get_super_private(inode->i_sb)->d_info;
15538 +}
15539 +
15540 +/*
15541 + * lookup d_cursor in the per-super-block radix tree.
15542 + */
15543 +static inline dir_cursor *lookup(d_cursor_info * info, unsigned long index)
15544 +{
15545 + return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15546 +}
15547 +
15548 +/*
15549 + * attach @cursor to the radix tree. There may be multiple cursors for the
15550 + * same oid, they are chained into circular list.
15551 + */
15552 +static void bind_cursor(dir_cursor * cursor, unsigned long index)
15553 +{
15554 + dir_cursor *head;
15555 +
15556 + head = lookup(cursor->info, index);
15557 + if (head == NULL) {
15558 + /* this is the first cursor for this index */
15559 + INIT_LIST_HEAD(&cursor->list);
15560 + radix_tree_insert(&cursor->info->tree, index, cursor);
15561 + } else {
15562 + /* some cursor already exists. Chain ours */
15563 + list_add(&cursor->list, &head->list);
15564 + }
15565 +}
15566 +
15567 +/*
15568 + * detach fsdata (if detachable) from file descriptor, and put cursor on the
15569 + * "unused" list. Called when file descriptor is not longer in active use.
15570 + */
15571 +static void clean_fsdata(struct file *file)
15572 +{
15573 + dir_cursor *cursor;
15574 + reiser4_file_fsdata *fsdata;
15575 +
15576 + assert("nikita-3570", file_is_stateless(file));
15577 +
15578 + fsdata = (reiser4_file_fsdata *) file->private_data;
15579 + if (fsdata != NULL) {
15580 + cursor = fsdata->cursor;
15581 + if (cursor != NULL) {
15582 + spin_lock(&d_lock);
15583 + --cursor->ref;
15584 + if (cursor->ref == 0) {
15585 + list_add_tail(&cursor->alist, &cursor_cache);
15586 + ++d_cursor_unused;
15587 + }
15588 + spin_unlock(&d_lock);
15589 + file->private_data = NULL;
15590 + }
15591 + }
15592 +}
15593 +
15594 +/*
15595 + * global counter used to generate "client ids". These ids are encoded into
15596 + * high bits of fpos.
15597 + */
15598 +static __u32 cid_counter = 0;
15599 +#define CID_SHIFT (20)
15600 +#define CID_MASK (0xfffffull)
15601 +
15602 +static void free_file_fsdata_nolock(struct file *);
15603 +
15604 +/**
15605 + * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15606 + * @cursor:
15607 + * @file:
15608 + * @inode:
15609 + *
15610 + * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
15611 + * reiser4 super block's hash table and radix tree.
15612 + add detachable readdir
15613 + * state to the @f
15614 + */
15615 +static int insert_cursor(dir_cursor *cursor, struct file *file,
15616 + struct inode *inode)
15617 +{
15618 + int result;
15619 + reiser4_file_fsdata *fsdata;
15620 +
15621 + memset(cursor, 0, sizeof *cursor);
15622 +
15623 + /* this is either first call to readdir, or rewind. Anyway, create new
15624 + * cursor. */
15625 + fsdata = create_fsdata(NULL);
15626 + if (fsdata != NULL) {
15627 + result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
15628 + if (result == 0) {
15629 + d_cursor_info *info;
15630 + oid_t oid;
15631 +
15632 + info = d_info(inode);
15633 + oid = get_inode_oid(inode);
15634 + /* cid occupies higher 12 bits of f->f_pos. Don't
15635 + * allow it to become negative: this confuses
15636 + * nfsd_readdir() */
15637 + cursor->key.cid = (++cid_counter) & 0x7ff;
15638 + cursor->key.oid = oid;
15639 + cursor->fsdata = fsdata;
15640 + cursor->info = info;
15641 + cursor->ref = 1;
15642 +
15643 + spin_lock_inode(inode);
15644 + /* install cursor as @f's private_data, discarding old
15645 + * one if necessary */
15646 +#if REISER4_DEBUG
15647 + if (file->private_data)
15648 + warning("", "file has fsdata already");
15649 +#endif
15650 + clean_fsdata(file);
15651 + free_file_fsdata_nolock(file);
15652 + file->private_data = fsdata;
15653 + fsdata->cursor = cursor;
15654 + spin_unlock_inode(inode);
15655 + spin_lock(&d_lock);
15656 + /* insert cursor into hash table */
15657 + d_cursor_hash_insert(&info->table, cursor);
15658 + /* and chain it into radix-tree */
15659 + bind_cursor(cursor, (unsigned long)oid);
15660 + spin_unlock(&d_lock);
15661 + radix_tree_preload_end();
15662 + file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15663 + }
15664 + } else
15665 + result = RETERR(-ENOMEM);
15666 + return result;
15667 +}
15668 +
15669 +/**
15670 + * process_cursors - do action on each cursor attached to inode
15671 + * @inode:
15672 + * @act: action to do
15673 + *
15674 + * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15675 + * and performs action specified by @act on each of cursors.
15676 + */
15677 +static void process_cursors(struct inode *inode, enum cursor_action act)
15678 +{
15679 + oid_t oid;
15680 + dir_cursor *start;
15681 + struct list_head *head;
15682 + reiser4_context *ctx;
15683 + d_cursor_info *info;
15684 +
15685 + /* this can be called by
15686 + *
15687 + * kswapd->...->prune_icache->..reiser4_destroy_inode
15688 + *
15689 + * without reiser4_context
15690 + */
15691 + ctx = reiser4_init_context(inode->i_sb);
15692 + if (IS_ERR(ctx)) {
15693 + warning("vs-23", "failed to init context");
15694 + return;
15695 + }
15696 +
15697 + assert("nikita-3558", inode != NULL);
15698 +
15699 + info = d_info(inode);
15700 + oid = get_inode_oid(inode);
15701 + spin_lock_inode(inode);
15702 + head = get_readdir_list(inode);
15703 + spin_lock(&d_lock);
15704 + /* find any cursor for this oid: reference to it is hanging of radix
15705 + * tree */
15706 + start = lookup(info, (unsigned long)oid);
15707 + if (start != NULL) {
15708 + dir_cursor *scan;
15709 + reiser4_file_fsdata *fsdata;
15710 +
15711 + /* process circular list of cursors for this oid */
15712 + scan = start;
15713 + do {
15714 + dir_cursor *next;
15715 +
15716 + next = list_entry(scan->list.next, dir_cursor, list);
15717 + fsdata = scan->fsdata;
15718 + assert("nikita-3557", fsdata != NULL);
15719 + if (scan->key.oid == oid) {
15720 + switch (act) {
15721 + case CURSOR_DISPOSE:
15722 + list_del_init(&fsdata->dir.linkage);
15723 + break;
15724 + case CURSOR_LOAD:
15725 + list_add(&fsdata->dir.linkage, head);
15726 + break;
15727 + case CURSOR_KILL:
15728 + kill_cursor(scan);
15729 + break;
15730 + }
15731 + }
15732 + if (scan == next)
15733 + /* last cursor was just killed */
15734 + break;
15735 + scan = next;
15736 + } while (scan != start);
15737 + }
15738 + spin_unlock(&d_lock);
15739 + /* check that we killed 'em all */
15740 + assert("nikita-3568",
15741 + ergo(act == CURSOR_KILL,
15742 + list_empty_careful(get_readdir_list(inode))));
15743 + assert("nikita-3569",
15744 + ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
15745 + spin_unlock_inode(inode);
15746 + reiser4_exit_context(ctx);
15747 +}
15748 +
15749 +/**
15750 + * reiser4_dispose_cursors - removes cursors from inode's list
15751 + * @inode: inode to dispose cursors of
15752 + *
15753 + * For each of cursors corresponding to @inode - removes reiser4_file_fsdata
15754 + * attached to cursor from inode's readdir list. This is called when inode is
15755 + * removed from the memory by memory pressure.
15756 + */
15757 +void reiser4_dispose_cursors(struct inode *inode)
15758 +{
15759 + process_cursors(inode, CURSOR_DISPOSE);
15760 +}
15761 +
15762 +/**
15763 + * reiser4_load_cursors - attach cursors to inode
15764 + * @inode: inode to load cursors to
15765 + *
15766 + * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata
15767 + * attached to cursor to inode's readdir list. This is done when inode is
15768 + * loaded into memory.
15769 + */
15770 +void reiser4_load_cursors(struct inode *inode)
15771 +{
15772 + process_cursors(inode, CURSOR_LOAD);
15773 +}
15774 +
15775 +/**
15776 + * reiser4_kill_cursors - kill all inode cursors
15777 + * @inode: inode to kill cursors of
15778 + *
15779 + * Frees all cursors for this inode. This is called when inode is destroyed.
15780 + */
15781 +void reiser4_kill_cursors(struct inode *inode)
15782 +{
15783 + process_cursors(inode, CURSOR_KILL);
15784 +}
15785 +
15786 +/**
15787 + * file_is_stateless -
15788 + * @file:
15789 + *
15790 + * true, if file descriptor @f is created by NFS server by "demand" to serve
15791 + * one file system operation. This means that there may be "detached state"
15792 + * for underlying inode.
15793 + */
15794 +static int file_is_stateless(struct file *file)
15795 +{
15796 + return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
15797 +}
15798 +
15799 +/**
15800 + * reiser4_get_dir_fpos -
15801 + * @dir:
15802 + *
15803 + * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but
15804 + * in the case of stateless directory operation (readdir-over-nfs), client id
15805 + * was encoded in the high bits of cookie and should me masked off.
15806 + */
15807 +loff_t reiser4_get_dir_fpos(struct file *dir)
15808 +{
15809 + if (file_is_stateless(dir))
15810 + return dir->f_pos & CID_MASK;
15811 + else
15812 + return dir->f_pos;
15813 +}
15814 +
15815 +/**
15816 + * reiser4_attach_fsdata - try to attach fsdata
15817 + * @file:
15818 + * @inode:
15819 + *
15820 + * Finds or creates cursor for readdir-over-nfs.
15821 + */
15822 +int reiser4_attach_fsdata(struct file *file, struct inode *inode)
15823 +{
15824 + loff_t pos;
15825 + int result;
15826 + dir_cursor *cursor;
15827 +
15828 + /*
15829 + * we are serialized by inode->i_mutex
15830 + */
15831 + if (!file_is_stateless(file))
15832 + return 0;
15833 +
15834 + pos = file->f_pos;
15835 + result = 0;
15836 + if (pos == 0) {
15837 + /*
15838 + * first call to readdir (or rewind to the beginning of
15839 + * directory)
15840 + */
15841 + cursor = kmem_cache_alloc(d_cursor_cache,
15842 + reiser4_ctx_gfp_mask_get());
15843 + if (cursor != NULL)
15844 + result = insert_cursor(cursor, file, inode);
15845 + else
15846 + result = RETERR(-ENOMEM);
15847 + } else {
15848 + /* try to find existing cursor */
15849 + d_cursor_key key;
15850 +
15851 + key.cid = pos >> CID_SHIFT;
15852 + key.oid = get_inode_oid(inode);
15853 + spin_lock(&d_lock);
15854 + cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
15855 + if (cursor != NULL) {
15856 + /* cursor was found */
15857 + if (cursor->ref == 0) {
15858 + /* move it from unused list */
15859 + list_del_init(&cursor->alist);
15860 + --d_cursor_unused;
15861 + }
15862 + ++cursor->ref;
15863 + }
15864 + spin_unlock(&d_lock);
15865 + if (cursor != NULL) {
15866 + spin_lock_inode(inode);
15867 + assert("nikita-3556", cursor->fsdata->back == NULL);
15868 + clean_fsdata(file);
15869 + free_file_fsdata_nolock(file);
15870 + file->private_data = cursor->fsdata;
15871 + spin_unlock_inode(inode);
15872 + }
15873 + }
15874 + return result;
15875 +}
15876 +
15877 +/**
15878 + * reiser4_detach_fsdata - ???
15879 + * @file:
15880 + *
15881 + * detach fsdata, if necessary
15882 + */
15883 +void reiser4_detach_fsdata(struct file *file)
15884 +{
15885 + struct inode *inode;
15886 +
15887 + if (!file_is_stateless(file))
15888 + return;
15889 +
15890 + inode = file->f_dentry->d_inode;
15891 + spin_lock_inode(inode);
15892 + clean_fsdata(file);
15893 + spin_unlock_inode(inode);
15894 +}
15895 +
15896 +/* slab for reiser4_dentry_fsdata */
15897 +static struct kmem_cache *dentry_fsdata_cache;
15898 +
15899 +/**
15900 + * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
15901 + *
15902 + * Initializes slab cache of structures attached to denty->d_fsdata. It is
15903 + * part of reiser4 module initialization.
15904 + */
15905 +int reiser4_init_dentry_fsdata(void)
15906 +{
15907 + dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
15908 + sizeof(reiser4_dentry_fsdata),
15909 + 0,
15910 + SLAB_HWCACHE_ALIGN |
15911 + SLAB_RECLAIM_ACCOUNT, NULL,
15912 + NULL);
15913 + if (dentry_fsdata_cache == NULL)
15914 + return RETERR(-ENOMEM);
15915 + return 0;
15916 +}
15917 +
15918 +/**
15919 + * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
15920 + *
15921 + * This is called on reiser4 module unloading or system shutdown.
15922 + */
15923 +void reiser4_done_dentry_fsdata(void)
15924 +{
15925 + destroy_reiser4_cache(&dentry_fsdata_cache);
15926 +}
15927 +
15928 +/**
15929 + * reiser4_get_dentry_fsdata - get fs-specific dentry data
15930 + * @dentry: queried dentry
15931 + *
15932 + * Allocates if necessary and returns per-dentry data that we attach to each
15933 + * dentry.
15934 + */
15935 +reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
15936 +{
15937 + assert("nikita-1365", dentry != NULL);
15938 +
15939 + if (dentry->d_fsdata == NULL) {
15940 + dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
15941 + reiser4_ctx_gfp_mask_get());
15942 + if (dentry->d_fsdata == NULL)
15943 + return ERR_PTR(RETERR(-ENOMEM));
15944 + memset(dentry->d_fsdata, 0, sizeof(reiser4_dentry_fsdata));
15945 + }
15946 + return dentry->d_fsdata;
15947 +}
15948 +
15949 +/**
15950 + * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
15951 + * @dentry: dentry to free fsdata of
15952 + *
15953 + * Detaches and frees fs-specific dentry data
15954 + */
15955 +void reiser4_free_dentry_fsdata(struct dentry *dentry)
15956 +{
15957 + if (dentry->d_fsdata != NULL) {
15958 + kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
15959 + dentry->d_fsdata = NULL;
15960 + }
15961 +}
15962 +
15963 +/* slab for reiser4_file_fsdata */
15964 +static struct kmem_cache *file_fsdata_cache;
15965 +
15966 +/**
15967 + * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
15968 + *
15969 + * Initializes slab cache of structures attached to file->private_data. It is
15970 + * part of reiser4 module initialization.
15971 + */
15972 +int reiser4_init_file_fsdata(void)
15973 +{
15974 + file_fsdata_cache = kmem_cache_create("file_fsdata",
15975 + sizeof(reiser4_file_fsdata),
15976 + 0,
15977 + SLAB_HWCACHE_ALIGN |
15978 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
15979 + if (file_fsdata_cache == NULL)
15980 + return RETERR(-ENOMEM);
15981 + return 0;
15982 +}
15983 +
15984 +/**
15985 + * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
15986 + *
15987 + * This is called on reiser4 module unloading or system shutdown.
15988 + */
15989 +void reiser4_done_file_fsdata(void)
15990 +{
15991 + destroy_reiser4_cache(&file_fsdata_cache);
15992 +}
15993 +
15994 +/**
15995 + * create_fsdata - allocate and initialize reiser4_file_fsdata
15996 + * @file: what to create file_fsdata for, may be NULL
15997 + *
15998 + * Allocates and initializes reiser4_file_fsdata structure.
15999 + */
16000 +static reiser4_file_fsdata *create_fsdata(struct file *file)
16001 +{
16002 + reiser4_file_fsdata *fsdata;
16003 +
16004 + fsdata = kmem_cache_alloc(file_fsdata_cache,
16005 + reiser4_ctx_gfp_mask_get());
16006 + if (fsdata != NULL) {
16007 + memset(fsdata, 0, sizeof *fsdata);
16008 + fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16009 + fsdata->back = file;
16010 + INIT_LIST_HEAD(&fsdata->dir.linkage);
16011 + }
16012 + return fsdata;
16013 +}
16014 +
16015 +/**
16016 + * free_fsdata - free reiser4_file_fsdata
16017 + * @fsdata: object to free
16018 + *
16019 + * Dual to create_fsdata(). Free reiser4_file_fsdata.
16020 + */
16021 +static void free_fsdata(reiser4_file_fsdata *fsdata)
16022 +{
16023 + BUG_ON(fsdata == NULL);
16024 + kmem_cache_free(file_fsdata_cache, fsdata);
16025 +}
16026 +
16027 +/**
16028 + * reiser4_get_file_fsdata - get fs-specific file data
16029 + * @file: queried file
16030 + *
16031 + * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16032 + * to @file.
16033 + */
16034 +reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16035 +{
16036 + assert("nikita-1603", file != NULL);
16037 +
16038 + if (file->private_data == NULL) {
16039 + reiser4_file_fsdata *fsdata;
16040 + struct inode *inode;
16041 +
16042 + fsdata = create_fsdata(file);
16043 + if (fsdata == NULL)
16044 + return ERR_PTR(RETERR(-ENOMEM));
16045 +
16046 + inode = file->f_dentry->d_inode;
16047 + spin_lock_inode(inode);
16048 + if (file->private_data == NULL) {
16049 + file->private_data = fsdata;
16050 + fsdata = NULL;
16051 + }
16052 + spin_unlock_inode(inode);
16053 + if (fsdata != NULL)
16054 + /* other thread initialized ->fsdata */
16055 + kmem_cache_free(file_fsdata_cache, fsdata);
16056 + }
16057 + assert("nikita-2665", file->private_data != NULL);
16058 + return file->private_data;
16059 +}
16060 +
16061 +/**
16062 + * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16063 + * @file:
16064 + *
16065 + * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16066 + * readdir list, frees if it is not linked to d_cursor object.
16067 + */
16068 +static void free_file_fsdata_nolock(struct file *file)
16069 +{
16070 + reiser4_file_fsdata *fsdata;
16071 +
16072 + assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16073 + fsdata = file->private_data;
16074 + if (fsdata != NULL) {
16075 + list_del_init(&fsdata->dir.linkage);
16076 + if (fsdata->cursor == NULL)
16077 + free_fsdata(fsdata);
16078 + }
16079 + file->private_data = NULL;
16080 +}
16081 +
16082 +/**
16083 + * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16084 + * @file:
16085 + *
16086 + * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16087 + */
16088 +void reiser4_free_file_fsdata(struct file *file)
16089 +{
16090 + spin_lock_inode(file->f_dentry->d_inode);
16091 + free_file_fsdata_nolock(file);
16092 + spin_unlock_inode(file->f_dentry->d_inode);
16093 +}
16094 +
16095 +/*
16096 + * Local variables:
16097 + * c-indentation-style: "K&R"
16098 + * mode-name: "LC"
16099 + * c-basic-offset: 8
16100 + * tab-width: 8
16101 + * fill-column: 79
16102 + * End:
16103 + */
16104 diff -urN linux-2.6.20.orig/fs/reiser4/fsdata.h linux-2.6.20/fs/reiser4/fsdata.h
16105 --- linux-2.6.20.orig/fs/reiser4/fsdata.h 1970-01-01 03:00:00.000000000 +0300
16106 +++ linux-2.6.20/fs/reiser4/fsdata.h 2007-05-06 14:50:43.722983224 +0400
16107 @@ -0,0 +1,207 @@
16108 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16109 + * reiser4/README */
16110 +
16111 +#if !defined( __REISER4_FSDATA_H__ )
16112 +#define __REISER4_FSDATA_H__
16113 +
16114 +#include "debug.h"
16115 +#include "kassign.h"
16116 +#include "seal.h"
16117 +#include "type_safe_hash.h"
16118 +#include "plugin/file/file.h"
16119 +#include "readahead.h"
16120 +
16121 +/*
16122 + * comment about reiser4_dentry_fsdata
16123 + *
16124 + *
16125 + */
16126 +
16127 +/*
16128 + * locking: fields of per file descriptor readdir_pos and ->f_pos are
16129 + * protected by ->i_mutex on inode. Under this lock following invariant
16130 + * holds:
16131 + *
16132 + * file descriptor is "looking" at the entry_no-th directory entry from
16133 + * the beginning of directory. This entry has key dir_entry_key and is
16134 + * pos-th entry with duplicate-key sequence.
16135 + *
16136 + */
16137 +
16138 +/* logical position within directory */
16139 +typedef struct {
16140 + /* key of directory entry (actually, part of a key sufficient to
16141 + identify directory entry) */
16142 + de_id dir_entry_key;
16143 + /* ordinal number of directory entry among all entries with the same
16144 + key. (Starting from 0.) */
16145 + unsigned pos;
16146 +} dir_pos;
16147 +
16148 +typedef struct {
16149 + /* f_pos corresponding to this readdir position */
16150 + __u64 fpos;
16151 + /* logical position within directory */
16152 + dir_pos position;
16153 + /* logical number of directory entry within
16154 + directory */
16155 + __u64 entry_no;
16156 +} readdir_pos;
16157 +
16158 +/*
16159 + * this is used to speed up lookups for directory entry: on initial call to
16160 + * ->lookup() seal and coord of directory entry (if found, that is) are stored
16161 + * in struct dentry and reused later to avoid tree traversals.
16162 + */
16163 +typedef struct de_location {
16164 + /* seal covering directory entry */
16165 + seal_t entry_seal;
16166 + /* coord of directory entry */
16167 + coord_t entry_coord;
16168 + /* ordinal number of directory entry among all entries with the same
16169 + key. (Starting from 0.) */
16170 + int pos;
16171 +} de_location;
16172 +
16173 +/**
16174 + * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16175 + *
16176 + * This is allocated dynamically and released in d_op->d_release()
16177 + *
16178 + * Currently it only contains cached location (hint) of directory entry, but
16179 + * it is expected that other information will be accumulated here.
16180 + */
16181 +typedef struct reiser4_dentry_fsdata {
16182 + /*
16183 + * here will go fields filled by ->lookup() to speedup next
16184 + * create/unlink, like blocknr of znode with stat-data, or key of
16185 + * stat-data.
16186 + */
16187 + de_location dec;
16188 + int stateless; /* created through reiser4_decode_fh, needs special
16189 + * treatment in readdir. */
16190 +} reiser4_dentry_fsdata;
16191 +
16192 +extern int reiser4_init_dentry_fsdata(void);
16193 +extern void reiser4_done_dentry_fsdata(void);
16194 +extern reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16195 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16196 +
16197 +/**
16198 + * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16199 + *
16200 + * This is allocated dynamically and released in inode->i_fop->release
16201 + */
16202 +typedef struct reiser4_file_fsdata {
16203 + /*
16204 + * pointer back to the struct file which this reiser4_file_fsdata is
16205 + * part of
16206 + */
16207 + struct file *back;
16208 + /* detached cursor for stateless readdir. */
16209 + struct dir_cursor *cursor;
16210 + /*
16211 + * We need both directory and regular file parts here, because there
16212 + * are file system objects that are files and directories.
16213 + */
16214 + struct {
16215 + /*
16216 + * position in directory. It is updated each time directory is
16217 + * modified
16218 + */
16219 + readdir_pos readdir;
16220 + /* head of this list is reiser4_inode->lists.readdir_list */
16221 + struct list_head linkage;
16222 + } dir;
16223 + /* hints to speed up operations with regular files: read and write. */
16224 + struct {
16225 + hint_t hint;
16226 + } reg;
16227 + struct reiser4_file_ra_state ra1;
16228 +
16229 +} reiser4_file_fsdata;
16230 +
16231 +extern int reiser4_init_file_fsdata(void);
16232 +extern void reiser4_done_file_fsdata(void);
16233 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16234 +extern void reiser4_free_file_fsdata(struct file *);
16235 +
16236 +/*
16237 + * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16238 + * used to address problem reiser4 has with readdir accesses via NFS. See
16239 + * plugin/file_ops_readdir.c for more details.
16240 + */
16241 +typedef struct {
16242 + __u16 cid;
16243 + __u64 oid;
16244 +} d_cursor_key;
16245 +
16246 +/*
16247 + * define structures d_cursor_hash_table d_cursor_hash_link which are used to
16248 + * maintain hash table of dir_cursor-s in reiser4's super block
16249 + */
16250 +typedef struct dir_cursor dir_cursor;
16251 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16252 +
16253 +typedef struct d_cursor_info d_cursor_info;
16254 +
16255 +struct dir_cursor {
16256 + int ref;
16257 + reiser4_file_fsdata *fsdata;
16258 +
16259 + /* link to reiser4 super block hash table of cursors */
16260 + d_cursor_hash_link hash;
16261 +
16262 + /*
16263 + * this is to link cursors to reiser4 super block's radix tree of
16264 + * cursors if there are more than one cursor of the same objectid
16265 + */
16266 + struct list_head list;
16267 + d_cursor_key key;
16268 + d_cursor_info *info;
16269 + /* list of unused cursors */
16270 + struct list_head alist;
16271 +};
16272 +
16273 +extern int reiser4_init_d_cursor(void);
16274 +extern void reiser4_done_d_cursor(void);
16275 +
16276 +extern int reiser4_init_super_d_info(struct super_block *);
16277 +extern void reiser4_done_super_d_info(struct super_block *);
16278 +
16279 +extern loff_t reiser4_get_dir_fpos(struct file *);
16280 +extern int reiser4_attach_fsdata(struct file *, struct inode *);
16281 +extern void reiser4_detach_fsdata(struct file *);
16282 +
16283 +/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16284 + more details */
16285 +void reiser4_dispose_cursors(struct inode *inode);
16286 +void reiser4_load_cursors(struct inode *inode);
16287 +void reiser4_kill_cursors(struct inode *inode);
16288 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16289 + int offset, int adj);
16290 +
16291 +/*
16292 + * this structure is embedded into reiser4_super_info_data. It maintains d_cursors
16293 + * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16294 + */
16295 +struct d_cursor_info {
16296 + d_cursor_hash_table table;
16297 + struct radix_tree_root tree;
16298 +};
16299 +
16300 +/* spinlock protecting readdir cursors */
16301 +extern spinlock_t d_lock;
16302 +
16303 +/* __REISER4_FSDATA_H__ */
16304 +#endif
16305 +
16306 +/*
16307 + * Local variables:
16308 + * c-indentation-style: "K&R"
16309 + * mode-name: "LC"
16310 + * c-basic-offset: 8
16311 + * tab-width: 8
16312 + * fill-column: 120
16313 + * End:
16314 + */
16315 diff -urN linux-2.6.20.orig/fs/reiser4/init_super.c linux-2.6.20/fs/reiser4/init_super.c
16316 --- linux-2.6.20.orig/fs/reiser4/init_super.c 1970-01-01 03:00:00.000000000 +0300
16317 +++ linux-2.6.20/fs/reiser4/init_super.c 2007-05-06 14:50:43.722983224 +0400
16318 @@ -0,0 +1,750 @@
16319 +/* Copyright by Hans Reiser, 2003 */
16320 +
16321 +#include "super.h"
16322 +#include "inode.h"
16323 +#include "plugin/plugin_set.h"
16324 +
16325 +#include <linux/swap.h>
16326 +
16327 +/**
16328 + * init_fs_info - allocate reiser4 specific super block
16329 + * @super: super block of filesystem
16330 + *
16331 + * Allocates and initializes reiser4_super_info_data, attaches it to
16332 + * super->s_fs_info, initializes structures maintaining d_cursor-s.
16333 + */
16334 +int reiser4_init_fs_info(struct super_block *super)
16335 +{
16336 + reiser4_super_info_data *sbinfo;
16337 +
16338 + sbinfo = kmalloc(sizeof(reiser4_super_info_data),
16339 + reiser4_ctx_gfp_mask_get());
16340 + if (!sbinfo)
16341 + return RETERR(-ENOMEM);
16342 +
16343 + super->s_fs_info = sbinfo;
16344 + super->s_op = NULL;
16345 + memset(sbinfo, 0, sizeof(*sbinfo));
16346 +
16347 + ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16348 + ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16349 +
16350 + mutex_init(&sbinfo->delete_mutex);
16351 + spin_lock_init(&(sbinfo->guard));
16352 +
16353 + /* initialize per-super-block d_cursor resources */
16354 + reiser4_init_super_d_info(super);
16355 +
16356 + return 0;
16357 +}
16358 +
16359 +/**
16360 + * reiser4_done_fs_info - free reiser4 specific super block
16361 + * @super: super block of filesystem
16362 + *
16363 + * Performs some sanity checks, releases structures maintaining d_cursor-s,
16364 + * frees reiser4_super_info_data.
16365 + */
16366 +void reiser4_done_fs_info(struct super_block *super)
16367 +{
16368 + assert("zam-990", super->s_fs_info != NULL);
16369 +
16370 + /* release per-super-block d_cursor resources */
16371 + reiser4_done_super_d_info(super);
16372 +
16373 + /* make sure that there are no jnodes already */
16374 + assert("", list_empty(&get_super_private(super)->all_jnodes));
16375 + assert("", get_current_context()->trans->atom == NULL);
16376 + reiser4_check_block_counters(super);
16377 + kfree(super->s_fs_info);
16378 + super->s_fs_info = NULL;
16379 +}
16380 +
16381 +/* type of option parseable by parse_option() */
16382 +typedef enum {
16383 + /* value of option is arbitrary string */
16384 + OPT_STRING,
16385 +
16386 + /*
16387 + * option specifies bit in a bitmask. When option is set - bit in
16388 + * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16389 + * dont_load_bitmap, atomic_write.
16390 + */
16391 + OPT_BIT,
16392 +
16393 + /*
16394 + * value of option should conform to sprintf() format. Examples are
16395 + * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16396 + */
16397 + OPT_FORMAT,
16398 +
16399 + /*
16400 + * option can take one of predefined values. Example is onerror=panic or
16401 + * onerror=remount-ro
16402 + */
16403 + OPT_ONEOF,
16404 +} opt_type_t;
16405 +
16406 +typedef struct opt_bitmask_bit {
16407 + const char *bit_name;
16408 + int bit_nr;
16409 +} opt_bitmask_bit;
16410 +
16411 +/* description of option parseable by parse_option() */
16412 +typedef struct opt_desc {
16413 + /* option name.
16414 +
16415 + parsed portion of string has a form "name=value".
16416 + */
16417 + const char *name;
16418 + /* type of option */
16419 + opt_type_t type;
16420 + union {
16421 + /* where to store value of string option (type == OPT_STRING) */
16422 + char **string;
16423 + /* description of bits for bit option (type == OPT_BIT) */
16424 + struct {
16425 + int nr;
16426 + void *addr;
16427 + } bit;
16428 + /* description of format and targets for format option (type
16429 + == OPT_FORMAT) */
16430 + struct {
16431 + const char *format;
16432 + int nr_args;
16433 + void *arg1;
16434 + void *arg2;
16435 + void *arg3;
16436 + void *arg4;
16437 + } f;
16438 + struct {
16439 + int *result;
16440 + const char *list[10];
16441 + } oneof;
16442 + struct {
16443 + void *addr;
16444 + int nr_bits;
16445 + opt_bitmask_bit *bits;
16446 + } bitmask;
16447 + } u;
16448 +} opt_desc_t;
16449 +
16450 +/**
16451 + * parse_option - parse one option
16452 + * @opt_string: starting point of parsing
16453 + * @opt: option description
16454 + *
16455 + * foo=bar,
16456 + * ^ ^ ^
16457 + * | | +-- replaced to '\0'
16458 + * | +-- val_start
16459 + * +-- opt_string
16460 + * Figures out option type and handles option correspondingly.
16461 + */
16462 +static int parse_option(char *opt_string, opt_desc_t *opt)
16463 +{
16464 + char *val_start;
16465 + int result;
16466 + const char *err_msg;
16467 +
16468 + /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16469 +
16470 + val_start = strchr(opt_string, '=');
16471 + if (val_start != NULL) {
16472 + *val_start = '\0';
16473 + ++val_start;
16474 + }
16475 +
16476 + err_msg = NULL;
16477 + result = 0;
16478 + switch (opt->type) {
16479 + case OPT_STRING:
16480 + if (val_start == NULL) {
16481 + err_msg = "String arg missing";
16482 + result = RETERR(-EINVAL);
16483 + } else
16484 + *opt->u.string = val_start;
16485 + break;
16486 + case OPT_BIT:
16487 + if (val_start != NULL)
16488 + err_msg = "Value ignored";
16489 + else
16490 + set_bit(opt->u.bit.nr, opt->u.bit.addr);
16491 + break;
16492 + case OPT_FORMAT:
16493 + if (val_start == NULL) {
16494 + err_msg = "Formatted arg missing";
16495 + result = RETERR(-EINVAL);
16496 + break;
16497 + }
16498 + if (sscanf(val_start, opt->u.f.format,
16499 + opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16500 + opt->u.f.arg4) != opt->u.f.nr_args) {
16501 + err_msg = "Wrong conversion";
16502 + result = RETERR(-EINVAL);
16503 + }
16504 + break;
16505 + case OPT_ONEOF:
16506 + {
16507 + int i = 0;
16508 +
16509 + if (val_start == NULL) {
16510 + err_msg = "Value is missing";
16511 + result = RETERR(-EINVAL);
16512 + break;
16513 + }
16514 + err_msg = "Wrong option value";
16515 + result = RETERR(-EINVAL);
16516 + while (opt->u.oneof.list[i]) {
16517 + if (!strcmp(opt->u.oneof.list[i], val_start)) {
16518 + result = 0;
16519 + err_msg = NULL;
16520 + *opt->u.oneof.result = i;
16521 + break;
16522 + }
16523 + i++;
16524 + }
16525 + break;
16526 + }
16527 + default:
16528 + wrong_return_value("nikita-2100", "opt -> type");
16529 + break;
16530 + }
16531 + if (err_msg != NULL) {
16532 + warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16533 + err_msg, opt->name, val_start ? "=" : "",
16534 + val_start ? : "");
16535 + }
16536 + return result;
16537 +}
16538 +
16539 +/**
16540 + * parse_options - parse reiser4 mount options
16541 + * @opt_string: starting point
16542 + * @opts: array of option description
16543 + * @nr_opts: number of elements in @opts
16544 + *
16545 + * Parses comma separated list of reiser4 mount options.
16546 + */
16547 +static int parse_options(char *opt_string, opt_desc_t *opts, int nr_opts)
16548 +{
16549 + int result;
16550 +
16551 + result = 0;
16552 + while ((result == 0) && opt_string && *opt_string) {
16553 + int j;
16554 + char *next;
16555 +
16556 + next = strchr(opt_string, ',');
16557 + if (next != NULL) {
16558 + *next = '\0';
16559 + ++next;
16560 + }
16561 + for (j = 0; j < nr_opts; ++j) {
16562 + if (!strncmp(opt_string, opts[j].name,
16563 + strlen(opts[j].name))) {
16564 + result = parse_option(opt_string, &opts[j]);
16565 + break;
16566 + }
16567 + }
16568 + if (j == nr_opts) {
16569 + warning("nikita-2307", "Unrecognized option: \"%s\"",
16570 + opt_string);
16571 + /* traditionally, -EINVAL is returned on wrong mount
16572 + option */
16573 + result = RETERR(-EINVAL);
16574 + }
16575 + opt_string = next;
16576 + }
16577 + return result;
16578 +}
16579 +
16580 +#define NUM_OPT( label, fmt, addr ) \
16581 + { \
16582 + .name = ( label ), \
16583 + .type = OPT_FORMAT, \
16584 + .u = { \
16585 + .f = { \
16586 + .format = ( fmt ), \
16587 + .nr_args = 1, \
16588 + .arg1 = ( addr ), \
16589 + .arg2 = NULL, \
16590 + .arg3 = NULL, \
16591 + .arg4 = NULL \
16592 + } \
16593 + } \
16594 + }
16595 +
16596 +#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
16597 +
16598 +#define BIT_OPT(label, bitnr) \
16599 + { \
16600 + .name = label, \
16601 + .type = OPT_BIT, \
16602 + .u = { \
16603 + .bit = { \
16604 + .nr = bitnr, \
16605 + .addr = &sbinfo->fs_flags \
16606 + } \
16607 + } \
16608 + }
16609 +
16610 +#define MAX_NR_OPTIONS (30)
16611 +
16612 +/**
16613 + * reiser4_init_super_data - initialize reiser4 private super block
16614 + * @super: super block to initialize
16615 + * @opt_string: list of reiser4 mount options
16616 + *
16617 + * Sets various reiser4 parameters to default values. Parses mount options and
16618 + * overwrites default settings.
16619 + */
16620 +int reiser4_init_super_data(struct super_block *super, char *opt_string)
16621 +{
16622 + int result;
16623 + opt_desc_t *opts, *p;
16624 + reiser4_super_info_data *sbinfo = get_super_private(super);
16625 +
16626 + /* initialize super, export, dentry operations */
16627 + sbinfo->ops.super = reiser4_super_operations;
16628 + sbinfo->ops.export = reiser4_export_operations;
16629 + sbinfo->ops.dentry = reiser4_dentry_operations;
16630 + super->s_op = &sbinfo->ops.super;
16631 + super->s_export_op = &sbinfo->ops.export;
16632 +
16633 + /* initialize transaction manager parameters to default values */
16634 + sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16635 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16636 + sbinfo->tmgr.atom_min_size = 256;
16637 + sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16638 +
16639 + /* initialize cbk cache parameter */
16640 + sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16641 +
16642 + /* initialize flush parameters */
16643 + sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16644 + sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16645 + sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16646 + sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16647 +
16648 + sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16649 +
16650 + /* preliminary tree initializations */
16651 + sbinfo->tree.super = super;
16652 + sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16653 + sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16654 + sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16655 + sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16656 + rwlock_init(&(sbinfo->tree.tree_lock));
16657 + spin_lock_init(&(sbinfo->tree.epoch_lock));
16658 +
16659 + /* initialize default readahead params */
16660 + sbinfo->ra_params.max = num_physpages / 4;
16661 + sbinfo->ra_params.flags = 0;
16662 +
16663 + /* allocate memory for structure describing reiser4 mount options */
16664 + opts = kmalloc(sizeof(opt_desc_t) * MAX_NR_OPTIONS,
16665 + reiser4_ctx_gfp_mask_get());
16666 + if (opts == NULL)
16667 + return RETERR(-ENOMEM);
16668 +
16669 + /* initialize structure describing reiser4 mount options */
16670 + p = opts;
16671 +
16672 +#if REISER4_DEBUG
16673 +# define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) { \
16674 + warning ("zam-1046", "opt array is overloaded"); break; \
16675 + }
16676 +#else
16677 +# define OPT_ARRAY_CHECK noop
16678 +#endif
16679 +
16680 +#define PUSH_OPT(...) \
16681 +do { \
16682 + opt_desc_t o = __VA_ARGS__; \
16683 + OPT_ARRAY_CHECK; \
16684 + *p ++ = o; \
16685 +} while (0)
16686 +
16687 +#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
16688 +#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
16689 +
16690 + /*
16691 + * tmgr.atom_max_size=N
16692 + * Atoms containing more than N blocks will be forced to commit. N is
16693 + * decimal.
16694 + */
16695 + PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
16696 + /*
16697 + * tmgr.atom_max_age=N
16698 + * Atoms older than N seconds will be forced to commit. N is decimal.
16699 + */
16700 + PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
16701 + /*
16702 + * tmgr.atom_min_size=N
16703 + * In committing an atom to free dirty pages, force the atom less than
16704 + * N in size to fuse with another one.
16705 + */
16706 + PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
16707 + /*
16708 + * tmgr.atom_max_flushers=N
16709 + * limit of concurrent flushers for one atom. 0 means no limit.
16710 + */
16711 + PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
16712 + /*
16713 + * tree.cbk_cache_slots=N
16714 + * Number of slots in the cbk cache.
16715 + */
16716 + PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
16717 + /*
16718 + * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
16719 + * leaf-level blocks it will force them to be relocated.
16720 + */
16721 + PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
16722 + /*
16723 + * If flush can find a block allocation closer than at most
16724 + * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that
16725 + * position.
16726 + */
16727 + PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
16728 + /*
16729 + * If we have written this much or more blocks before encountering busy
16730 + * jnode in flush list - abort flushing hoping that next time we get
16731 + * called this jnode will be clean already, and we will save some
16732 + * seeks.
16733 + */
16734 + PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
16735 + /* The maximum number of nodes to scan left on a level during flush. */
16736 + PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
16737 + /* preferred IO size */
16738 + PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
16739 + /* carry flags used for insertion of new nodes */
16740 + PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
16741 + /* carry flags used for insertion of new extents */
16742 + PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
16743 + /* carry flags used for paste operations */
16744 + PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
16745 + /* carry flags used for insert operations */
16746 + PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
16747 +
16748 +#ifdef CONFIG_REISER4_BADBLOCKS
16749 + /*
16750 + * Alternative master superblock location in case its original
16751 + * location is not writable/accessible. This is offset in BYTES.
16752 + */
16753 + PUSH_SB_FIELD_OPT(altsuper, "%lu");
16754 +#endif
16755 +
16756 + /* turn on BSD-style gid assignment */
16757 + PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
16758 + /* turn on 32 bit times */
16759 + PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
16760 + /*
16761 + * Don't load all bitmap blocks at mount time, it is useful for
16762 + * machines with tiny RAM and large disks.
16763 + */
16764 + PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
16765 + /* disable transaction commits during write() */
16766 + PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
16767 + /* disable use of write barriers in the reiser4 log writer. */
16768 + PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
16769 +
16770 + PUSH_OPT(
16771 + {
16772 + /*
16773 + * tree traversal readahead parameters:
16774 + * -o readahead:MAXNUM:FLAGS
16775 + * MAXNUM - max number of nodes to request readahead for: -1UL
16776 + * will set it to max_sane_readahead()
16777 + * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
16778 + * CONTINUE_ON_PRESENT
16779 + */
16780 + .name = "readahead",
16781 + .type = OPT_FORMAT,
16782 + .u = {
16783 + .f = {
16784 + .format = "%u:%u",
16785 + .nr_args = 2,
16786 + .arg1 = &sbinfo->ra_params.max,
16787 + .arg2 = &sbinfo->ra_params.flags,
16788 + .arg3 = NULL,
16789 + .arg4 = NULL
16790 + }
16791 + }
16792 + }
16793 + );
16794 +
16795 + /* What to do in case of fs error */
16796 + PUSH_OPT(
16797 + {
16798 + .name = "onerror",
16799 + .type = OPT_ONEOF,
16800 + .u = {
16801 + .oneof = {
16802 + .result = &sbinfo->onerror,
16803 + .list = {
16804 + "panic", "remount-ro", NULL
16805 + },
16806 + }
16807 + }
16808 + }
16809 + );
16810 +
16811 + /* modify default settings to values set by mount options */
16812 + result = parse_options(opt_string, opts, p - opts);
16813 + kfree(opts);
16814 + if (result != 0)
16815 + return result;
16816 +
16817 + /* correct settings to sanity values */
16818 + sbinfo->tmgr.atom_max_age *= HZ;
16819 + if (sbinfo->tmgr.atom_max_age <= 0)
16820 + /* overflow */
16821 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
16822 +
16823 + /* round optimal io size up to 512 bytes */
16824 + sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
16825 + sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
16826 + if (sbinfo->optimal_io_size == 0) {
16827 + warning("nikita-2497", "optimal_io_size is too small");
16828 + return RETERR(-EINVAL);
16829 + }
16830 + return result;
16831 +}
16832 +
16833 +/**
16834 + * reiser4_init_read_super - read reiser4 master super block
16835 + * @super: super block to fill
16836 + * @silent: if 0 - print warnings
16837 + *
16838 + * Reads reiser4 master super block either from predefined location or from
16839 + * location specified by altsuper mount option, initializes disk format plugin.
16840 + */
16841 +int reiser4_init_read_super(struct super_block *super, int silent)
16842 +{
16843 + struct buffer_head *super_bh;
16844 + struct reiser4_master_sb *master_sb;
16845 + reiser4_super_info_data *sbinfo = get_super_private(super);
16846 + unsigned long blocksize;
16847 +
16848 + read_super_block:
16849 +#ifdef CONFIG_REISER4_BADBLOCKS
16850 + if (sbinfo->altsuper)
16851 + /*
16852 + * read reiser4 master super block at position specified by
16853 + * mount option
16854 + */
16855 + super_bh = sb_bread(super,
16856 + (sector_t)(sbinfo->altsuper / super->s_blocksize));
16857 + else
16858 +#endif
16859 + /* read reiser4 master super block at 16-th 4096 block */
16860 + super_bh = sb_bread(super,
16861 + (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
16862 + if (!super_bh)
16863 + return RETERR(-EIO);
16864 +
16865 + master_sb = (struct reiser4_master_sb *)super_bh->b_data;
16866 + /* check reiser4 magic string */
16867 + if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
16868 + sizeof(REISER4_SUPER_MAGIC_STRING))) {
16869 + /* reiser4 master super block contains filesystem blocksize */
16870 + blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
16871 +
16872 + if (blocksize != PAGE_CACHE_SIZE) {
16873 + /*
16874 + * currently reiser4's blocksize must be equal to
16875 + * pagesize
16876 + */
16877 + if (!silent)
16878 + warning("nikita-2609",
16879 + "%s: wrong block size %ld\n", super->s_id,
16880 + blocksize);
16881 + brelse(super_bh);
16882 + return RETERR(-EINVAL);
16883 + }
16884 + if (blocksize != super->s_blocksize) {
16885 + /*
16886 + * filesystem uses different blocksize. Reread master
16887 + * super block with correct blocksize
16888 + */
16889 + brelse(super_bh);
16890 + if (!sb_set_blocksize(super, (int)blocksize))
16891 + return RETERR(-EINVAL);
16892 + goto read_super_block;
16893 + }
16894 +
16895 + sbinfo->df_plug =
16896 + disk_format_plugin_by_id(
16897 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16898 + if (sbinfo->df_plug == NULL) {
16899 + if (!silent)
16900 + warning("nikita-26091",
16901 + "%s: unknown disk format plugin %d\n",
16902 + super->s_id,
16903 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16904 + brelse(super_bh);
16905 + return RETERR(-EINVAL);
16906 + }
16907 + sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
16908 + brelse(super_bh);
16909 + return 0;
16910 + }
16911 +
16912 + /* there is no reiser4 on the device */
16913 + if (!silent)
16914 + warning("nikita-2608",
16915 + "%s: wrong master super block magic", super->s_id);
16916 + brelse(super_bh);
16917 + return RETERR(-EINVAL);
16918 +}
16919 +
16920 +static struct {
16921 + reiser4_plugin_type type;
16922 + reiser4_plugin_id id;
16923 +} default_plugins[PSET_LAST] = {
16924 + [PSET_FILE] = {
16925 + .type = REISER4_FILE_PLUGIN_TYPE,
16926 + .id = UNIX_FILE_PLUGIN_ID
16927 + },
16928 + [PSET_DIR] = {
16929 + .type = REISER4_DIR_PLUGIN_TYPE,
16930 + .id = HASHED_DIR_PLUGIN_ID
16931 + },
16932 + [PSET_HASH] = {
16933 + .type = REISER4_HASH_PLUGIN_TYPE,
16934 + .id = R5_HASH_ID
16935 + },
16936 + [PSET_FIBRATION] = {
16937 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
16938 + .id = FIBRATION_DOT_O
16939 + },
16940 + [PSET_PERM] = {
16941 + .type = REISER4_PERM_PLUGIN_TYPE,
16942 + .id = NULL_PERM_ID
16943 + },
16944 + [PSET_FORMATTING] = {
16945 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
16946 + .id = SMALL_FILE_FORMATTING_ID
16947 + },
16948 + [PSET_SD] = {
16949 + .type = REISER4_ITEM_PLUGIN_TYPE,
16950 + .id = STATIC_STAT_DATA_ID
16951 + },
16952 + [PSET_DIR_ITEM] = {
16953 + .type = REISER4_ITEM_PLUGIN_TYPE,
16954 + .id = COMPOUND_DIR_ID
16955 + },
16956 + [PSET_CIPHER] = {
16957 + .type = REISER4_CIPHER_PLUGIN_TYPE,
16958 + .id = NONE_CIPHER_ID
16959 + },
16960 + [PSET_DIGEST] = {
16961 + .type = REISER4_DIGEST_PLUGIN_TYPE,
16962 + .id = SHA256_32_DIGEST_ID
16963 + },
16964 + [PSET_COMPRESSION] = {
16965 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
16966 + .id = LZO1_COMPRESSION_ID
16967 + },
16968 + [PSET_COMPRESSION_MODE] = {
16969 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
16970 + .id = CONVX_COMPRESSION_MODE_ID
16971 + },
16972 + [PSET_CLUSTER] = {
16973 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
16974 + .id = CLUSTER_64K_ID
16975 + },
16976 + [PSET_CREATE] = {
16977 + .type = REISER4_FILE_PLUGIN_TYPE,
16978 + .id = UNIX_FILE_PLUGIN_ID
16979 + }
16980 +};
16981 +
16982 +/* access to default plugin table */
16983 +reiser4_plugin *get_default_plugin(pset_member memb)
16984 +{
16985 + return plugin_by_id(default_plugins[memb].type,
16986 + default_plugins[memb].id);
16987 +}
16988 +
16989 +/**
16990 + * reiser4_init_root_inode - obtain inode of root directory
16991 + * @super: super block of filesystem
16992 + *
16993 + * Obtains inode of root directory (reading it from disk), initializes plugin
16994 + * set if it was not initialized.
16995 + */
16996 +int reiser4_init_root_inode(struct super_block *super)
16997 +{
16998 + reiser4_super_info_data *sbinfo = get_super_private(super);
16999 + struct inode *inode;
17000 + int result = 0;
17001 +
17002 + inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17003 + if (IS_ERR(inode))
17004 + return RETERR(PTR_ERR(inode));
17005 +
17006 + super->s_root = d_alloc_root(inode);
17007 + if (!super->s_root) {
17008 + iput(inode);
17009 + return RETERR(-ENOMEM);
17010 + }
17011 +
17012 + super->s_root->d_op = &sbinfo->ops.dentry;
17013 +
17014 + if (!is_inode_loaded(inode)) {
17015 + pset_member memb;
17016 + plugin_set *pset;
17017 +
17018 + pset = reiser4_inode_data(inode)->pset;
17019 + for (memb = 0; memb < PSET_LAST; ++memb) {
17020 +
17021 + if (aset_get(pset, memb) != NULL)
17022 + continue;
17023 +
17024 + result = grab_plugin_pset(inode, NULL, memb);
17025 + if (result != 0)
17026 + break;
17027 +
17028 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17029 + }
17030 +
17031 + if (result == 0) {
17032 + if (REISER4_DEBUG) {
17033 + for (memb = 0; memb < PSET_LAST; ++memb)
17034 + assert("nikita-3500",
17035 + aset_get(pset, memb) != NULL);
17036 + }
17037 + } else
17038 + warning("nikita-3448", "Cannot set plugins of root: %i",
17039 + result);
17040 + reiser4_iget_complete(inode);
17041 +
17042 + /* As the default pset kept in the root dir may have been changed
17043 + (length is unknown), call update_sd. */
17044 + if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17045 + result = reiser4_grab_space(
17046 + inode_file_plugin(inode)->estimate.update(inode),
17047 + BA_CAN_COMMIT);
17048 +
17049 + if (result == 0)
17050 + result = reiser4_update_sd(inode);
17051 +
17052 + all_grabbed2free();
17053 + }
17054 + }
17055 +
17056 + super->s_maxbytes = MAX_LFS_FILESIZE;
17057 + return result;
17058 +}
17059 +
17060 +/*
17061 + * Local variables:
17062 + * c-indentation-style: "K&R"
17063 + * mode-name: "LC"
17064 + * c-basic-offset: 8
17065 + * tab-width: 8
17066 + * fill-column: 79
17067 + * End:
17068 + */
17069 diff -urN linux-2.6.20.orig/fs/reiser4/inode.c linux-2.6.20/fs/reiser4/inode.c
17070 --- linux-2.6.20.orig/fs/reiser4/inode.c 1970-01-01 03:00:00.000000000 +0300
17071 +++ linux-2.6.20/fs/reiser4/inode.c 2007-05-06 14:50:43.726984474 +0400
17072 @@ -0,0 +1,709 @@
17073 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17074 +
17075 +/* Inode specific operations. */
17076 +
17077 +#include "forward.h"
17078 +#include "debug.h"
17079 +#include "key.h"
17080 +#include "kassign.h"
17081 +#include "coord.h"
17082 +#include "seal.h"
17083 +#include "dscale.h"
17084 +#include "plugin/item/item.h"
17085 +#include "plugin/security/perm.h"
17086 +#include "plugin/plugin.h"
17087 +#include "plugin/object.h"
17088 +#include "znode.h"
17089 +#include "vfs_ops.h"
17090 +#include "inode.h"
17091 +#include "super.h"
17092 +#include "reiser4.h"
17093 +
17094 +#include <linux/fs.h> /* for struct super_block, address_space */
17095 +
17096 +/* return reiser4 internal tree which inode belongs to */
17097 +/* Audited by: green(2002.06.17) */
17098 +reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ )
17099 +{
17100 + assert("nikita-256", inode != NULL);
17101 + assert("nikita-257", inode->i_sb != NULL);
17102 + return reiser4_get_tree(inode->i_sb);
17103 +}
17104 +
17105 +/* return reiser4-specific inode flags */
17106 +static inline unsigned long *inode_flags(const struct inode *const inode)
17107 +{
17108 + assert("nikita-2842", inode != NULL);
17109 + return &reiser4_inode_data(inode)->flags;
17110 +}
17111 +
17112 +/* set reiser4-specific flag @f in @inode */
17113 +void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17114 +{
17115 + assert("nikita-2248", inode != NULL);
17116 + set_bit((int)f, inode_flags(inode));
17117 +}
17118 +
17119 +/* clear reiser4-specific flag @f in @inode */
17120 +void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17121 +{
17122 + assert("nikita-2250", inode != NULL);
17123 + clear_bit((int)f, inode_flags(inode));
17124 +}
17125 +
17126 +/* true if reiser4-specific flag @f is set in @inode */
17127 +int reiser4_inode_get_flag(const struct inode *inode,
17128 + reiser4_file_plugin_flags f)
17129 +{
17130 + assert("nikita-2251", inode != NULL);
17131 + return test_bit((int)f, inode_flags(inode));
17132 +}
17133 +
17134 +/* convert oid to inode number */
17135 +ino_t oid_to_ino(oid_t oid)
17136 +{
17137 + return (ino_t) oid;
17138 +}
17139 +
17140 +/* convert oid to user visible inode number */
17141 +ino_t oid_to_uino(oid_t oid)
17142 +{
17143 + /* reiser4 object is uniquely identified by oid which is 64 bit
17144 + quantity. Kernel in-memory inode is indexed (in the hash table) by
17145 + 32 bit i_ino field, but this is not a problem, because there is a
17146 + way to further distinguish inodes with identical inode numbers
17147 + (find_actor supplied to iget()).
17148 +
17149 + But user space expects unique 32 bit inode number. Obviously this
17150 + is impossible. Work-around is to somehow hash oid into user visible
17151 + inode number.
17152 + */
17153 + oid_t max_ino = (ino_t) ~ 0;
17154 +
17155 + if (REISER4_INO_IS_OID || (oid <= max_ino))
17156 + return oid;
17157 + else
17158 + /* this is remotely similar to algorithm used to find next pid
17159 + to use for process: after wrap-around start from some
17160 + offset rather than from 0. Idea is that there are some long
17161 + living objects with which we don't want to collide.
17162 + */
17163 + return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17164 +}
17165 +
17166 +/* check that "inode" is on reiser4 file-system */
17167 +int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17168 +{
17169 + return inode != NULL && is_reiser4_super(inode->i_sb);
17170 +}
17171 +
17172 +/* Maximal length of a name that can be stored in directory @inode.
17173 +
17174 + This is used in check during file creation and lookup. */
17175 +int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17176 +{
17177 + assert("nikita-287", is_reiser4_inode(inode));
17178 + assert("nikita-1710", inode_dir_item_plugin(inode));
17179 + if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17180 + return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17181 + else
17182 + return 255;
17183 +}
17184 +
17185 +#if REISER4_USE_COLLISION_LIMIT
17186 +/* Maximal number of hash collisions for this directory. */
17187 +int max_hash_collisions(const struct inode *dir /* inode queried */ )
17188 +{
17189 + assert("nikita-1711", dir != NULL);
17190 + return reiser4_inode_data(dir)->plugin.max_collisions;
17191 +}
17192 +#endif /* REISER4_USE_COLLISION_LIMIT */
17193 +
17194 +/* Install file, inode, and address_space operation on @inode, depending on
17195 + its mode. */
17196 +int setup_inode_ops(struct inode *inode /* inode to intialize */ ,
17197 + reiser4_object_create_data * data /* parameters to create
17198 + * object */ )
17199 +{
17200 + reiser4_super_info_data *sinfo;
17201 + file_plugin *fplug;
17202 + dir_plugin *dplug;
17203 +
17204 + fplug = inode_file_plugin(inode);
17205 + dplug = inode_dir_plugin(inode);
17206 +
17207 + sinfo = get_super_private(inode->i_sb);
17208 +
17209 + switch (inode->i_mode & S_IFMT) {
17210 + case S_IFSOCK:
17211 + case S_IFBLK:
17212 + case S_IFCHR:
17213 + case S_IFIFO:
17214 + {
17215 + dev_t rdev; /* to keep gcc happy */
17216 +
17217 + assert("vs-46", fplug != NULL);
17218 + /* ugly hack with rdev */
17219 + if (data == NULL) {
17220 + rdev = inode->i_rdev;
17221 + inode->i_rdev = 0;
17222 + } else
17223 + rdev = data->rdev;
17224 + inode->i_blocks = 0;
17225 + assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17226 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17227 + /* initialize inode->i_fop and inode->i_rdev for block and char
17228 + devices */
17229 + init_special_inode(inode, inode->i_mode, rdev);
17230 + /* all address space operations are null */
17231 + inode->i_mapping->a_ops =
17232 + &file_plugins[fplug->h.id].as_ops;
17233 + break;
17234 + }
17235 + case S_IFLNK:
17236 + assert("vs-46", fplug != NULL);
17237 + assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17238 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17239 + inode->i_fop = NULL;
17240 + /* all address space operations are null */
17241 + inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17242 + break;
17243 + case S_IFDIR:
17244 + assert("vs-46", dplug != NULL);
17245 + assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17246 + dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17247 + inode->i_op = &dir_plugins[dplug->h.id].inode_ops;
17248 + inode->i_fop = &dir_plugins[dplug->h.id].file_ops;
17249 + inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops;
17250 + break;
17251 + case S_IFREG:
17252 + assert("vs-46", fplug != NULL);
17253 + assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17254 + fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17255 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17256 + inode->i_fop = &file_plugins[fplug->h.id].file_ops;
17257 + inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17258 + break;
17259 + default:
17260 + warning("nikita-291", "wrong file mode: %o for %llu",
17261 + inode->i_mode,
17262 + (unsigned long long)get_inode_oid(inode));
17263 + reiser4_make_bad_inode(inode);
17264 + return RETERR(-EINVAL);
17265 + }
17266 + return 0;
17267 +}
17268 +
17269 +/* Initialize inode from disk data. Called with inode locked.
17270 + Return inode locked. */
17271 +static int init_inode(struct inode *inode /* inode to intialise */ ,
17272 + coord_t * coord /* coord of stat data */ )
17273 +{
17274 + int result;
17275 + item_plugin *iplug;
17276 + void *body;
17277 + int length;
17278 + reiser4_inode *state;
17279 +
17280 + assert("nikita-292", coord != NULL);
17281 + assert("nikita-293", inode != NULL);
17282 +
17283 + coord_clear_iplug(coord);
17284 + result = zload(coord->node);
17285 + if (result)
17286 + return result;
17287 + iplug = item_plugin_by_coord(coord);
17288 + body = item_body_by_coord(coord);
17289 + length = item_length_by_coord(coord);
17290 +
17291 + assert("nikita-295", iplug != NULL);
17292 + assert("nikita-296", body != NULL);
17293 + assert("nikita-297", length > 0);
17294 +
17295 + /* inode is under I_LOCK now */
17296 +
17297 + state = reiser4_inode_data(inode);
17298 + /* call stat-data plugin method to load sd content into inode */
17299 + result = iplug->s.sd.init_inode(inode, body, length);
17300 + set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
17301 + if (result == 0) {
17302 + result = setup_inode_ops(inode, NULL);
17303 + if (result == 0 && inode->i_sb->s_root &&
17304 + inode->i_sb->s_root->d_inode)
17305 + result = finish_pset(inode);
17306 + }
17307 + zrelse(coord->node);
17308 + return result;
17309 +}
17310 +
17311 +/* read `inode' from the disk. This is what was previously in
17312 + reiserfs_read_inode2().
17313 +
17314 + Must be called with inode locked. Return inode still locked.
17315 +*/
17316 +static int read_inode(struct inode *inode /* inode to read from disk */ ,
17317 + const reiser4_key * key /* key of stat data */ ,
17318 + int silent)
17319 +{
17320 + int result;
17321 + lock_handle lh;
17322 + reiser4_inode *info;
17323 + coord_t coord;
17324 +
17325 + assert("nikita-298", inode != NULL);
17326 + assert("nikita-1945", !is_inode_loaded(inode));
17327 +
17328 + info = reiser4_inode_data(inode);
17329 + assert("nikita-300", info->locality_id != 0);
17330 +
17331 + coord_init_zero(&coord);
17332 + init_lh(&lh);
17333 + /* locate stat-data in a tree and return znode locked */
17334 + result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17335 + assert("nikita-301", !is_inode_loaded(inode));
17336 + if (result == 0) {
17337 + /* use stat-data plugin to load sd into inode. */
17338 + result = init_inode(inode, &coord);
17339 + if (result == 0) {
17340 + /* initialize stat-data seal */
17341 + spin_lock_inode(inode);
17342 + reiser4_seal_init(&info->sd_seal, &coord, key);
17343 + info->sd_coord = coord;
17344 + spin_unlock_inode(inode);
17345 +
17346 + /* call file plugin's method to initialize plugin
17347 + * specific part of inode */
17348 + if (inode_file_plugin(inode)->init_inode_data)
17349 + inode_file_plugin(inode)->init_inode_data(inode,
17350 + NULL,
17351 + 0);
17352 + /* load detached directory cursors for stateless
17353 + * directory readers (NFS). */
17354 + reiser4_load_cursors(inode);
17355 +
17356 + /* Check the opened inode for consistency. */
17357 + result =
17358 + get_super_private(inode->i_sb)->df_plug->
17359 + check_open(inode);
17360 + }
17361 + }
17362 + /* lookup_sd() doesn't release coord because we want znode
17363 + stay read-locked while stat-data fields are accessed in
17364 + init_inode() */
17365 + done_lh(&lh);
17366 +
17367 + if (result != 0)
17368 + reiser4_make_bad_inode(inode);
17369 + return result;
17370 +}
17371 +
17372 +/* initialise new reiser4 inode being inserted into hash table. */
17373 +static int init_locked_inode(struct inode *inode /* new inode */ ,
17374 + void *opaque /* key of stat data passed to the
17375 + * iget5_locked as cookie */ )
17376 +{
17377 + reiser4_key *key;
17378 +
17379 + assert("nikita-1995", inode != NULL);
17380 + assert("nikita-1996", opaque != NULL);
17381 + key = opaque;
17382 + set_inode_oid(inode, get_key_objectid(key));
17383 + reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17384 + return 0;
17385 +}
17386 +
17387 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17388 +
17389 + This function is called by iget5_locked() to distinguish reiser4 inodes
17390 + having the same inode numbers. Such inodes can only exist due to some error
17391 + condition. One of them should be bad. Inodes with identical inode numbers
17392 + (objectids) are distinguished by their packing locality.
17393 +
17394 +*/
17395 +static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17396 + * check */ ,
17397 + void *opaque /* "cookie" passed to
17398 + * iget5_locked(). This is stat data
17399 + * key */ )
17400 +{
17401 + reiser4_key *key;
17402 +
17403 + key = opaque;
17404 + return
17405 + /* oid is unique, so first term is enough, actually. */
17406 + get_inode_oid(inode) == get_key_objectid(key) &&
17407 + /*
17408 + * also, locality should be checked, but locality is stored in
17409 + * the reiser4-specific part of the inode, and actor can be
17410 + * called against arbitrary inode that happened to be in this
17411 + * hash chain. Hence we first have to check that this is
17412 + * reiser4 inode at least. is_reiser4_inode() is probably too
17413 + * early to call, as inode may have ->i_op not yet
17414 + * initialised.
17415 + */
17416 + is_reiser4_super(inode->i_sb) &&
17417 + /*
17418 + * usually objectid is unique, but pseudo files use counter to
17419 + * generate objectid. All pseudo files are placed into special
17420 + * (otherwise unused) locality.
17421 + */
17422 + reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17423 +}
17424 +
17425 +/* hook for kmem_cache_create */
17426 +void loading_init_once(reiser4_inode * info)
17427 +{
17428 + mutex_init(&info->loading);
17429 +}
17430 +
17431 +/* for reiser4_alloc_inode */
17432 +void loading_alloc(reiser4_inode * info)
17433 +{
17434 + assert("vs-1717", !mutex_is_locked(&info->loading));
17435 +}
17436 +
17437 +/* for reiser4_destroy */
17438 +void loading_destroy(reiser4_inode * info)
17439 +{
17440 + assert("vs-1717a", !mutex_is_locked(&info->loading));
17441 +}
17442 +
17443 +static void loading_begin(reiser4_inode * info)
17444 +{
17445 + mutex_lock(&info->loading);
17446 +}
17447 +
17448 +static void loading_end(reiser4_inode * info)
17449 +{
17450 + mutex_unlock(&info->loading);
17451 +}
17452 +
17453 +/**
17454 + * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17455 + * @super: super block of filesystem
17456 + * @key: key of inode's stat-data
17457 + * @silent:
17458 + *
17459 + * This is our helper function a la iget(). This is be called by
17460 + * lookup_common() and reiser4_read_super(). Return inode locked or error
17461 + * encountered.
17462 + */
17463 +struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17464 + int silent)
17465 +{
17466 + struct inode *inode;
17467 + int result;
17468 + reiser4_inode *info;
17469 +
17470 + assert("nikita-302", super != NULL);
17471 + assert("nikita-303", key != NULL);
17472 +
17473 + result = 0;
17474 +
17475 + /* call iget(). Our ->read_inode() is dummy, so this will either
17476 + find inode in cache or return uninitialised inode */
17477 + inode = iget5_locked(super,
17478 + (unsigned long)get_key_objectid(key),
17479 + reiser4_inode_find_actor,
17480 + init_locked_inode, (reiser4_key *) key);
17481 + if (inode == NULL)
17482 + return ERR_PTR(RETERR(-ENOMEM));
17483 + if (is_bad_inode(inode)) {
17484 + warning("nikita-304", "Bad inode found");
17485 + reiser4_print_key("key", key);
17486 + iput(inode);
17487 + return ERR_PTR(RETERR(-EIO));
17488 + }
17489 +
17490 + info = reiser4_inode_data(inode);
17491 +
17492 + /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
17493 + loaded and initialized inode from just allocated inode. If
17494 + REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17495 + info->loading. The place in reiser4 which uses not initialized inode
17496 + is the reiser4 repacker, see repacker-related functions in
17497 + plugin/item/extent.c */
17498 + if (!is_inode_loaded(inode)) {
17499 + loading_begin(info);
17500 + if (!is_inode_loaded(inode)) {
17501 + /* locking: iget5_locked returns locked inode */
17502 + assert("nikita-1941", !is_inode_loaded(inode));
17503 + assert("nikita-1949",
17504 + reiser4_inode_find_actor(inode,
17505 + (reiser4_key *) key));
17506 + /* now, inode has objectid as ->i_ino and locality in
17507 + reiser4-specific part. This is enough for
17508 + read_inode() to read stat data from the disk */
17509 + result = read_inode(inode, key, silent);
17510 + } else
17511 + loading_end(info);
17512 + }
17513 +
17514 + if (inode->i_state & I_NEW)
17515 + unlock_new_inode(inode);
17516 +
17517 + if (is_bad_inode(inode)) {
17518 + assert("vs-1717", result != 0);
17519 + loading_end(info);
17520 + iput(inode);
17521 + inode = ERR_PTR(result);
17522 + } else if (REISER4_DEBUG) {
17523 + reiser4_key found_key;
17524 +
17525 + assert("vs-1717", result == 0);
17526 + build_sd_key(inode, &found_key);
17527 + if (!keyeq(&found_key, key)) {
17528 + warning("nikita-305", "Wrong key in sd");
17529 + reiser4_print_key("sought for", key);
17530 + reiser4_print_key("found", &found_key);
17531 + }
17532 + if (inode->i_nlink == 0) {
17533 + warning("nikita-3559", "Unlinked inode found: %llu\n",
17534 + (unsigned long long)get_inode_oid(inode));
17535 + }
17536 + }
17537 + return inode;
17538 +}
17539 +
17540 +/* reiser4_iget() may return not fully initialized inode, this function should
17541 + * be called after one completes reiser4 inode initializing. */
17542 +void reiser4_iget_complete(struct inode *inode)
17543 +{
17544 + assert("zam-988", is_reiser4_inode(inode));
17545 +
17546 + if (!is_inode_loaded(inode)) {
17547 + reiser4_inode_set_flag(inode, REISER4_LOADED);
17548 + loading_end(reiser4_inode_data(inode));
17549 + }
17550 +}
17551 +
17552 +void reiser4_make_bad_inode(struct inode *inode)
17553 +{
17554 + assert("nikita-1934", inode != NULL);
17555 +
17556 + /* clear LOADED bit */
17557 + reiser4_inode_clr_flag(inode, REISER4_LOADED);
17558 + make_bad_inode(inode);
17559 + return;
17560 +}
17561 +
17562 +file_plugin *inode_file_plugin(const struct inode * inode)
17563 +{
17564 + assert("nikita-1997", inode != NULL);
17565 + return reiser4_inode_data(inode)->pset->file;
17566 +}
17567 +
17568 +dir_plugin *inode_dir_plugin(const struct inode * inode)
17569 +{
17570 + assert("nikita-1998", inode != NULL);
17571 + return reiser4_inode_data(inode)->pset->dir;
17572 +}
17573 +
17574 +formatting_plugin *inode_formatting_plugin(const struct inode * inode)
17575 +{
17576 + assert("nikita-2000", inode != NULL);
17577 + return reiser4_inode_data(inode)->pset->formatting;
17578 +}
17579 +
17580 +hash_plugin *inode_hash_plugin(const struct inode * inode)
17581 +{
17582 + assert("nikita-2001", inode != NULL);
17583 + return reiser4_inode_data(inode)->pset->hash;
17584 +}
17585 +
17586 +fibration_plugin *inode_fibration_plugin(const struct inode * inode)
17587 +{
17588 + assert("nikita-2001", inode != NULL);
17589 + return reiser4_inode_data(inode)->pset->fibration;
17590 +}
17591 +
17592 +cipher_plugin *inode_cipher_plugin(const struct inode * inode)
17593 +{
17594 + assert("edward-36", inode != NULL);
17595 + return reiser4_inode_data(inode)->pset->cipher;
17596 +}
17597 +
17598 +compression_plugin *inode_compression_plugin(const struct inode * inode)
17599 +{
17600 + assert("edward-37", inode != NULL);
17601 + return reiser4_inode_data(inode)->pset->compression;
17602 +}
17603 +
17604 +compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17605 + inode)
17606 +{
17607 + assert("edward-1330", inode != NULL);
17608 + return reiser4_inode_data(inode)->pset->compression_mode;
17609 +}
17610 +
17611 +cluster_plugin *inode_cluster_plugin(const struct inode * inode)
17612 +{
17613 + assert("edward-1328", inode != NULL);
17614 + return reiser4_inode_data(inode)->pset->cluster;
17615 +}
17616 +
17617 +file_plugin *inode_create_plugin(const struct inode * inode)
17618 +{
17619 + assert("edward-1329", inode != NULL);
17620 + return reiser4_inode_data(inode)->pset->create;
17621 +}
17622 +
17623 +digest_plugin *inode_digest_plugin(const struct inode * inode)
17624 +{
17625 + assert("edward-86", inode != NULL);
17626 + return reiser4_inode_data(inode)->pset->digest;
17627 +}
17628 +
17629 +item_plugin *inode_sd_plugin(const struct inode * inode)
17630 +{
17631 + assert("vs-534", inode != NULL);
17632 + return reiser4_inode_data(inode)->pset->sd;
17633 +}
17634 +
17635 +item_plugin *inode_dir_item_plugin(const struct inode * inode)
17636 +{
17637 + assert("vs-534", inode != NULL);
17638 + return reiser4_inode_data(inode)->pset->dir_item;
17639 +}
17640 +
17641 +file_plugin *child_create_plugin(const struct inode * inode)
17642 +{
17643 + assert("edward-1329", inode != NULL);
17644 + return reiser4_inode_data(inode)->hset->create;
17645 +}
17646 +
17647 +void inode_set_extension(struct inode *inode, sd_ext_bits ext)
17648 +{
17649 + reiser4_inode *state;
17650 +
17651 + assert("nikita-2716", inode != NULL);
17652 + assert("nikita-2717", ext < LAST_SD_EXTENSION);
17653 + assert("nikita-3491", spin_inode_is_locked(inode));
17654 +
17655 + state = reiser4_inode_data(inode);
17656 + state->extmask |= 1 << ext;
17657 + /* force re-calculation of stat-data length on next call to
17658 + update_sd(). */
17659 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17660 +}
17661 +
17662 +void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
17663 +{
17664 + reiser4_inode *state;
17665 +
17666 + assert("vpf-1926", inode != NULL);
17667 + assert("vpf-1927", ext < LAST_SD_EXTENSION);
17668 + assert("vpf-1928", spin_inode_is_locked(inode));
17669 +
17670 + state = reiser4_inode_data(inode);
17671 + state->extmask &= ~(1 << ext);
17672 + /* force re-calculation of stat-data length on next call to
17673 + update_sd(). */
17674 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17675 +}
17676 +
17677 +void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
17678 +{
17679 + assert("edward-1287", inode != NULL);
17680 + if (!dscale_fit(old, new))
17681 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17682 + return;
17683 +}
17684 +
17685 +void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
17686 +{
17687 + assert("nikita-2875", inode != NULL);
17688 + spin_lock_inode(inode);
17689 + inode_check_scale_nolock(inode, old, new);
17690 + spin_unlock_inode(inode);
17691 +}
17692 +
17693 +/*
17694 + * initialize ->ordering field of inode. This field defines how file stat-data
17695 + * and body is ordered within a tree with respect to other objects within the
17696 + * same parent directory.
17697 + */
17698 +void
17699 +init_inode_ordering(struct inode *inode,
17700 + reiser4_object_create_data * crd, int create)
17701 +{
17702 + reiser4_key key;
17703 +
17704 + if (create) {
17705 + struct inode *parent;
17706 +
17707 + parent = crd->parent;
17708 + assert("nikita-3224", inode_dir_plugin(parent) != NULL);
17709 + inode_dir_plugin(parent)->build_entry_key(parent,
17710 + &crd->dentry->d_name,
17711 + &key);
17712 + } else {
17713 + coord_t *coord;
17714 +
17715 + coord = &reiser4_inode_data(inode)->sd_coord;
17716 + coord_clear_iplug(coord);
17717 + /* safe to use ->sd_coord, because node is under long term
17718 + * lock */
17719 + WITH_DATA(coord->node, item_key_by_coord(coord, &key));
17720 + }
17721 +
17722 + set_inode_ordering(inode, get_key_ordering(&key));
17723 +}
17724 +
17725 +znode *inode_get_vroot(struct inode *inode)
17726 +{
17727 + reiser4_block_nr blk;
17728 + znode *result;
17729 +
17730 + spin_lock_inode(inode);
17731 + blk = reiser4_inode_data(inode)->vroot;
17732 + spin_unlock_inode(inode);
17733 + if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
17734 + result = zlook(reiser4_tree_by_inode(inode), &blk);
17735 + else
17736 + result = NULL;
17737 + return result;
17738 +}
17739 +
17740 +void inode_set_vroot(struct inode *inode, znode *vroot)
17741 +{
17742 + spin_lock_inode(inode);
17743 + reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
17744 + spin_unlock_inode(inode);
17745 +}
17746 +
17747 +#if REISER4_DEBUG
17748 +
17749 +void reiser4_inode_invariant(const struct inode *inode)
17750 +{
17751 + assert("nikita-3077", spin_inode_is_locked(inode));
17752 +}
17753 +
17754 +int inode_has_no_jnodes(reiser4_inode * r4_inode)
17755 +{
17756 + return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
17757 + r4_inode->nr_jnodes == 0;
17758 +}
17759 +
17760 +#endif
17761 +
17762 +/* true if directory is empty (only contains dot and dotdot) */
17763 +/* FIXME: shouldn't it be dir plugin method? */
17764 +int is_dir_empty(const struct inode *dir)
17765 +{
17766 + assert("nikita-1976", dir != NULL);
17767 +
17768 + /* rely on our method to maintain directory i_size being equal to the
17769 + number of entries. */
17770 + return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
17771 +}
17772 +
17773 +/* Make Linus happy.
17774 + Local variables:
17775 + c-indentation-style: "K&R"
17776 + mode-name: "LC"
17777 + c-basic-offset: 8
17778 + tab-width: 8
17779 + fill-column: 120
17780 + End:
17781 +*/
17782 diff -urN linux-2.6.20.orig/fs/reiser4/inode.h linux-2.6.20/fs/reiser4/inode.h
17783 --- linux-2.6.20.orig/fs/reiser4/inode.h 1970-01-01 03:00:00.000000000 +0300
17784 +++ linux-2.6.20/fs/reiser4/inode.h 2007-05-06 14:50:43.726984474 +0400
17785 @@ -0,0 +1,438 @@
17786 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17787 +
17788 +/* Inode functions. */
17789 +
17790 +#if !defined( __REISER4_INODE_H__ )
17791 +#define __REISER4_INODE_H__
17792 +
17793 +#include "forward.h"
17794 +#include "debug.h"
17795 +#include "key.h"
17796 +#include "seal.h"
17797 +#include "plugin/plugin.h"
17798 +#include "plugin/file/cryptcompress.h"
17799 +#include "plugin/file/file.h"
17800 +#include "plugin/dir/dir.h"
17801 +#include "plugin/plugin_set.h"
17802 +#include "plugin/security/perm.h"
17803 +#include "vfs_ops.h"
17804 +#include "jnode.h"
17805 +#include "fsdata.h"
17806 +
17807 +#include <linux/types.h> /* for __u?? , ino_t */
17808 +#include <linux/fs.h> /* for struct super_block, struct
17809 + * rw_semaphore, etc */
17810 +#include <linux/spinlock.h>
17811 +#include <asm/types.h>
17812 +
17813 +/* reiser4-specific inode flags. They are "transient" and are not
17814 + supposed to be stored on disk. Used to trace "state" of
17815 + inode
17816 +*/
17817 +typedef enum {
17818 + /* this is light-weight inode, inheriting some state from its
17819 + parent */
17820 + REISER4_LIGHT_WEIGHT = 0,
17821 + /* stat data wasn't yet created */
17822 + REISER4_NO_SD = 1,
17823 + /* internal immutable flag. Currently is only used
17824 + to avoid race condition during file creation.
17825 + See comment in create_object(). */
17826 + REISER4_IMMUTABLE = 2,
17827 + /* inode was read from storage */
17828 + REISER4_LOADED = 3,
17829 + /* this bit is set for symlinks. inode->i_private points to target
17830 + name of symlink. */
17831 + REISER4_GENERIC_PTR_USED = 4,
17832 + /* set if size of stat-data item for this inode is known. If this is
17833 + * set we can avoid recalculating size of stat-data on each update. */
17834 + REISER4_SDLEN_KNOWN = 5,
17835 + /* reiser4_inode->crypt points to the crypto stat */
17836 + REISER4_CRYPTO_STAT_LOADED = 6,
17837 + /* cryptcompress_inode_data points to the secret key */
17838 + REISER4_SECRET_KEY_INSTALLED = 7,
17839 + /* File (possibly) has pages corresponding to the tail items, that
17840 + * were created by ->readpage. It is set by mmap_unix_file() and
17841 + * sendfile_unix_file(). This bit is inspected by write_unix_file and
17842 + * kill-hook of tail items. It is never cleared once set. This bit is
17843 + * modified and inspected under i_mutex. */
17844 + REISER4_HAS_MMAP = 8,
17845 + REISER4_PART_MIXED = 9,
17846 + REISER4_PART_IN_CONV = 10,
17847 + /* This flag indicates that file plugin conversion is in progress */
17848 + REISER4_FILE_CONV_IN_PROGRESS = 11
17849 +} reiser4_file_plugin_flags;
17850 +
17851 +/* state associated with each inode.
17852 + reiser4 inode.
17853 +
17854 + NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
17855 + be of the same size. File-system allocates inodes by itself through
17856 + s_op->allocate_inode() method. So, it is possible to adjust size of inode
17857 + at the time of its creation.
17858 +
17859 + Invariants involving parts of this data-type:
17860 +
17861 + [inode->eflushed]
17862 +
17863 +*/
17864 +
17865 +typedef struct reiser4_inode reiser4_inode;
17866 +/* return pointer to reiser4-specific part of inode */
17867 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17868 + /* inode queried */ );
17869 +
17870 +#if BITS_PER_LONG == 64
17871 +
17872 +#define REISER4_INO_IS_OID (1)
17873 +typedef struct {;
17874 +} oid_hi_t;
17875 +
17876 +/* BITS_PER_LONG == 64 */
17877 +#else
17878 +
17879 +#define REISER4_INO_IS_OID (0)
17880 +typedef __u32 oid_hi_t;
17881 +
17882 +/* BITS_PER_LONG == 64 */
17883 +#endif
17884 +
17885 +struct reiser4_inode {
17886 + /* spin lock protecting fields of this structure. */
17887 + spinlock_t guard;
17888 + /* main plugin set that control the file
17889 + (see comments in plugin/plugin_set.c) */
17890 + plugin_set *pset;
17891 + /* plugin set for inheritance
17892 + (see comments in plugin/plugin_set.c) */
17893 + plugin_set *hset;
17894 + /* high 32 bits of object id */
17895 + oid_hi_t oid_hi;
17896 + /* seal for stat-data */
17897 + seal_t sd_seal;
17898 + /* locality id for this file */
17899 + oid_t locality_id;
17900 +#if REISER4_LARGE_KEY
17901 + __u64 ordering;
17902 +#endif
17903 + /* coord of stat-data in sealed node */
17904 + coord_t sd_coord;
17905 + /* bit-mask of stat-data extentions used by this file */
17906 + __u64 extmask;
17907 + /* bitmask of non-default plugins for this inode */
17908 + __u16 plugin_mask;
17909 + /* bitmask of set heir plugins for this inode. */
17910 + __u16 heir_mask;
17911 + union {
17912 + struct list_head readdir_list;
17913 + struct list_head not_used;
17914 + } lists;
17915 + /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
17916 + unsigned long flags;
17917 + union {
17918 + /* fields specific to unix_file plugin */
17919 + unix_file_info_t unix_file_info;
17920 + /* fields specific to cryptcompress plugin */
17921 + cryptcompress_info_t cryptcompress_info;
17922 + } file_plugin_data;
17923 +
17924 + /* this semaphore is to serialize readers and writers of @pset->file
17925 + * when file plugin conversion is enabled
17926 + */
17927 + struct rw_semaphore conv_sem;
17928 +
17929 + /* tree of jnodes. Phantom jnodes (ones not attched to any atom) are
17930 + tagged in that tree by EFLUSH_TAG_ANONYMOUS */
17931 + struct radix_tree_root jnodes_tree;
17932 +#if REISER4_DEBUG
17933 + /* number of unformatted node jnodes of this file in jnode hash table */
17934 + unsigned long nr_jnodes;
17935 +#endif
17936 +
17937 + /* block number of virtual root for this object. See comment above
17938 + * fs/reiser4/search.c:handle_vroot() */
17939 + reiser4_block_nr vroot;
17940 + struct mutex loading;
17941 +};
17942 +
17943 +void loading_init_once(reiser4_inode *);
17944 +void loading_alloc(reiser4_inode *);
17945 +void loading_destroy(reiser4_inode *);
17946 +
17947 +typedef struct reiser4_inode_object {
17948 + /* private part */
17949 + reiser4_inode p;
17950 + /* generic fields not specific to reiser4, but used by VFS */
17951 + struct inode vfs_inode;
17952 +} reiser4_inode_object;
17953 +
17954 +/* return pointer to the reiser4 specific portion of @inode */
17955 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17956 + /* inode queried */ )
17957 +{
17958 + assert("nikita-254", inode != NULL);
17959 + return &container_of(inode, reiser4_inode_object, vfs_inode)->p;
17960 +}
17961 +
17962 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
17963 + r4_inode /* inode queried */
17964 + )
17965 +{
17966 + return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode;
17967 +}
17968 +
17969 +/*
17970 + * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
17971 + * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
17972 + * bits.
17973 + *
17974 + * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
17975 + * of inode, otherwise whole oid is stored in i_ino.
17976 + *
17977 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
17978 + */
17979 +
17980 +#define OID_HI_SHIFT (sizeof(ino_t) * 8)
17981 +
17982 +#if REISER4_INO_IS_OID
17983 +
17984 +static inline oid_t get_inode_oid(const struct inode *inode)
17985 +{
17986 + return inode->i_ino;
17987 +}
17988 +
17989 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
17990 +{
17991 + inode->i_ino = oid;
17992 +}
17993 +
17994 +/* REISER4_INO_IS_OID */
17995 +#else
17996 +
17997 +static inline oid_t get_inode_oid(const struct inode *inode)
17998 +{
17999 + return
18000 + ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18001 + inode->i_ino;
18002 +}
18003 +
18004 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18005 +{
18006 + assert("nikita-2519", inode != NULL);
18007 + inode->i_ino = (ino_t) (oid);
18008 + reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18009 + assert("nikita-2521", get_inode_oid(inode) == (oid));
18010 +}
18011 +
18012 +/* REISER4_INO_IS_OID */
18013 +#endif
18014 +
18015 +static inline oid_t get_inode_locality(const struct inode *inode)
18016 +{
18017 + return reiser4_inode_data(inode)->locality_id;
18018 +}
18019 +
18020 +#if REISER4_LARGE_KEY
18021 +static inline __u64 get_inode_ordering(const struct inode *inode)
18022 +{
18023 + return reiser4_inode_data(inode)->ordering;
18024 +}
18025 +
18026 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18027 +{
18028 + reiser4_inode_data(inode)->ordering = ordering;
18029 +}
18030 +
18031 +#else
18032 +
18033 +#define get_inode_ordering(inode) (0)
18034 +#define set_inode_ordering(inode, val) noop
18035 +
18036 +#endif
18037 +
18038 +/* return inode in which @uf_info is embedded */
18039 +static inline struct inode *unix_file_info_to_inode(const unix_file_info_t *
18040 + uf_info)
18041 +{
18042 + return &container_of(uf_info, reiser4_inode_object,
18043 + p.file_plugin_data.unix_file_info)->vfs_inode;
18044 +}
18045 +
18046 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18047 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18048 +
18049 +extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18050 +
18051 +#if REISER4_DEBUG
18052 +extern void reiser4_inode_invariant(const struct inode *inode);
18053 +extern int inode_has_no_jnodes(reiser4_inode *);
18054 +#else
18055 +#define reiser4_inode_invariant(inode) noop
18056 +#endif
18057 +
18058 +static inline int spin_inode_is_locked(const struct inode *inode)
18059 +{
18060 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18061 + return 1;
18062 +}
18063 +
18064 +/**
18065 + * spin_lock_inode - lock reiser4_inode's embedded spinlock
18066 + * @inode: inode to lock
18067 + *
18068 + * In debug mode it checks that lower priority locks are not held and
18069 + * increments reiser4_context's lock counters on which lock ordering checking
18070 + * is based.
18071 + */
18072 +static inline void spin_lock_inode(struct inode *inode)
18073 +{
18074 + assert("", LOCK_CNT_NIL(spin_locked));
18075 + /* check lock ordering */
18076 + assert_spin_not_locked(&d_lock);
18077 +
18078 + spin_lock(&reiser4_inode_data(inode)->guard);
18079 +
18080 + LOCK_CNT_INC(spin_locked_inode);
18081 + LOCK_CNT_INC(spin_locked);
18082 +
18083 + reiser4_inode_invariant(inode);
18084 +}
18085 +
18086 +/**
18087 + * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18088 + * @inode: inode to unlock
18089 + *
18090 + * In debug mode it checks that spinlock is held and decrements
18091 + * reiser4_context's lock counters on which lock ordering checking is based.
18092 + */
18093 +static inline void spin_unlock_inode(struct inode *inode)
18094 +{
18095 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18096 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18097 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18098 +
18099 + reiser4_inode_invariant(inode);
18100 +
18101 + LOCK_CNT_DEC(spin_locked_inode);
18102 + LOCK_CNT_DEC(spin_locked);
18103 +
18104 + spin_unlock(&reiser4_inode_data(inode)->guard);
18105 +}
18106 +
18107 +extern znode *inode_get_vroot(struct inode *inode);
18108 +extern void inode_set_vroot(struct inode *inode, znode * vroot);
18109 +
18110 +extern int reiser4_max_filename_len(const struct inode *inode);
18111 +extern int max_hash_collisions(const struct inode *dir);
18112 +extern void reiser4_unlock_inode(struct inode *inode);
18113 +extern int is_reiser4_inode(const struct inode *inode);
18114 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18115 +extern struct inode *reiser4_iget(struct super_block *super,
18116 + const reiser4_key * key, int silent);
18117 +extern void reiser4_iget_complete(struct inode *inode);
18118 +extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18119 +extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18120 +extern int reiser4_inode_get_flag(const struct inode *inode,
18121 + reiser4_file_plugin_flags f);
18122 +
18123 +/* has inode been initialized? */
18124 +static inline int
18125 +is_inode_loaded(const struct inode *inode /* inode queried */ )
18126 +{
18127 + assert("nikita-1120", inode != NULL);
18128 + return reiser4_inode_get_flag(inode, REISER4_LOADED);
18129 +}
18130 +
18131 +extern file_plugin *inode_file_plugin(const struct inode *inode);
18132 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18133 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18134 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18135 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18136 +extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18137 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18138 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18139 +extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18140 + *inode);
18141 +extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18142 +extern file_plugin *inode_create_plugin(const struct inode *inode);
18143 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
18144 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18145 +extern file_plugin *child_create_plugin(const struct inode *inode);
18146 +
18147 +extern void reiser4_make_bad_inode(struct inode *inode);
18148 +
18149 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18150 +extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18151 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18152 +extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18153 +
18154 +/*
18155 + * update field @field in inode @i to contain value @value.
18156 + */
18157 +#define INODE_SET_FIELD(i, field, value) \
18158 +({ \
18159 + struct inode *__i; \
18160 + typeof(value) __v; \
18161 + \
18162 + __i = (i); \
18163 + __v = (value); \
18164 + inode_check_scale(__i, __i->field, __v); \
18165 + __i->field = __v; \
18166 +})
18167 +
18168 +#define INODE_INC_FIELD(i, field) \
18169 +({ \
18170 + struct inode *__i; \
18171 + \
18172 + __i = (i); \
18173 + inode_check_scale(__i, __i->field, __i->field + 1); \
18174 + ++ __i->field; \
18175 +})
18176 +
18177 +#define INODE_DEC_FIELD(i, field) \
18178 +({ \
18179 + struct inode *__i; \
18180 + \
18181 + __i = (i); \
18182 + inode_check_scale(__i, __i->field, __i->field - 1); \
18183 + -- __i->field; \
18184 +})
18185 +
18186 +/* See comment before reiser4_readdir_common() for description. */
18187 +static inline struct list_head *get_readdir_list(const struct inode *inode)
18188 +{
18189 + return &reiser4_inode_data(inode)->lists.readdir_list;
18190 +}
18191 +
18192 +extern void init_inode_ordering(struct inode *inode,
18193 + reiser4_object_create_data * crd, int create);
18194 +
18195 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18196 +{
18197 + return &reiser4_inode_data(inode)->jnodes_tree;
18198 +}
18199 +
18200 +static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18201 + * r4_inode)
18202 +{
18203 + return &r4_inode->jnodes_tree;
18204 +}
18205 +
18206 +#if REISER4_DEBUG
18207 +extern void print_inode(const char *prefix, const struct inode *i);
18208 +#endif
18209 +
18210 +int is_dir_empty(const struct inode *);
18211 +
18212 +/* __REISER4_INODE_H__ */
18213 +#endif
18214 +
18215 +/* Make Linus happy.
18216 + Local variables:
18217 + c-indentation-style: "K&R"
18218 + mode-name: "LC"
18219 + c-basic-offset: 8
18220 + tab-width: 8
18221 + fill-column: 120
18222 + End:
18223 +*/
18224 diff -urN linux-2.6.20.orig/fs/reiser4/ioctl.h linux-2.6.20/fs/reiser4/ioctl.h
18225 --- linux-2.6.20.orig/fs/reiser4/ioctl.h 1970-01-01 03:00:00.000000000 +0300
18226 +++ linux-2.6.20/fs/reiser4/ioctl.h 2007-05-06 14:50:43.726984474 +0400
18227 @@ -0,0 +1,41 @@
18228 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18229 + * reiser4/README */
18230 +
18231 +#if !defined( __REISER4_IOCTL_H__ )
18232 +#define __REISER4_IOCTL_H__
18233 +
18234 +#include <linux/fs.h>
18235 +
18236 +/*
18237 + * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into
18238 + * extents and fix in this state. This is used by applications that rely on
18239 + *
18240 + * . files being block aligned, and
18241 + *
18242 + * . files never migrating on disk
18243 + *
18244 + * for example, boot loaders (LILO) need this.
18245 + *
18246 + * This ioctl should be used as
18247 + *
18248 + * result = ioctl(fd, REISER4_IOC_UNPACK);
18249 + *
18250 + * File behind fd descriptor will be converted to the extents (if necessary),
18251 + * and its stat-data will be updated so that it will never be converted back
18252 + * into tails again.
18253 + */
18254 +#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
18255 +
18256 +/* __REISER4_IOCTL_H__ */
18257 +#endif
18258 +
18259 +/* Make Linus happy.
18260 + Local variables:
18261 + c-indentation-style: "K&R"
18262 + mode-name: "LC"
18263 + c-basic-offset: 8
18264 + tab-width: 8
18265 + fill-column: 120
18266 + scroll-step: 1
18267 + End:
18268 +*/
18269 diff -urN linux-2.6.20.orig/fs/reiser4/jnode.c linux-2.6.20/fs/reiser4/jnode.c
18270 --- linux-2.6.20.orig/fs/reiser4/jnode.c 1970-01-01 03:00:00.000000000 +0300
18271 +++ linux-2.6.20/fs/reiser4/jnode.c 2007-05-06 14:50:43.730985723 +0400
18272 @@ -0,0 +1,1925 @@
18273 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18274 + * reiser4/README */
18275 +/* Jnode manipulation functions. */
18276 +/* Jnode is entity used to track blocks with data and meta-data in reiser4.
18277 +
18278 + In particular, jnodes are used to track transactional information
18279 + associated with each block. Each znode contains jnode as ->zjnode field.
18280 +
18281 + Jnode stands for either Josh or Journal node.
18282 +*/
18283 +
18284 +/*
18285 + * Taxonomy.
18286 + *
18287 + * Jnode represents block containing data or meta-data. There are jnodes
18288 + * for:
18289 + *
18290 + * unformatted blocks (jnodes proper). There are plans, however to
18291 + * have a handle per extent unit rather than per each unformatted
18292 + * block, because there are so many of them.
18293 + *
18294 + * For bitmaps. Each bitmap is actually represented by two jnodes--one
18295 + * for working and another for "commit" data, together forming bnode.
18296 + *
18297 + * For io-heads. These are used by log writer.
18298 + *
18299 + * For formatted nodes (znode). See comment at the top of znode.c for
18300 + * details specific to the formatted nodes (znodes).
18301 + *
18302 + * Node data.
18303 + *
18304 + * Jnode provides access to the data of node it represents. Data are
18305 + * stored in a page. Page is kept in a page cache. This means, that jnodes
18306 + * are highly interconnected with page cache and VM internals.
18307 + *
18308 + * jnode has a pointer to page (->pg) containing its data. Pointer to data
18309 + * themselves is cached in ->data field to avoid frequent calls to
18310 + * page_address().
18311 + *
18312 + * jnode and page are attached to each other by jnode_attach_page(). This
18313 + * function places pointer to jnode in set_page_private(), sets PG_private
18314 + * flag and increments page counter.
18315 + *
18316 + * Opposite operation is performed by page_clear_jnode().
18317 + *
18318 + * jnode->pg is protected by jnode spin lock, and page->private is
18319 + * protected by page lock. See comment at the top of page_cache.c for
18320 + * more.
18321 + *
18322 + * page can be detached from jnode for two reasons:
18323 + *
18324 + * . jnode is removed from a tree (file is truncated, of formatted
18325 + * node is removed by balancing).
18326 + *
18327 + * . during memory pressure, VM calls ->releasepage() method
18328 + * (reiser4_releasepage()) to evict page from memory.
18329 + *
18330 + * (there, of course, is also umount, but this is special case we are not
18331 + * concerned with here).
18332 + *
18333 + * To protect jnode page from eviction, one calls jload() function that
18334 + * "pins" page in memory (loading it if necessary), increments
18335 + * jnode->d_count, and kmap()s page. Page is unpinned through call to
18336 + * jrelse().
18337 + *
18338 + * Jnode life cycle.
18339 + *
18340 + * jnode is created, placed in hash table, and, optionally, in per-inode
18341 + * radix tree. Page can be attached to jnode, pinned, released, etc.
18342 + *
18343 + * When jnode is captured into atom its reference counter is
18344 + * increased. While being part of an atom, jnode can be "early
18345 + * flushed". This means that as part of flush procedure, jnode is placed
18346 + * into "relocate set", and its page is submitted to the disk. After io
18347 + * completes, page can be detached, then loaded again, re-dirtied, etc.
18348 + *
18349 + * A thread acquires a reference to a jnode by calling jref() and releases it by
18350 + * jput(). When last reference is removed, jnode is still retained in
18351 + * memory (cached) if it has page attached, _unless_ it is scheduled for
18352 + * destruction (has JNODE_HEARD_BANSHEE bit set).
18353 + *
18354 + * Tree read-write lock was used as "existential" lock for jnodes. That is,
18355 + * jnode->x_count could be changed from 0 to 1 only under tree write lock,
18356 + * that is, tree lock protected unreferenced jnodes stored in the hash
18357 + * table, from recycling.
18358 + *
18359 + * This resulted in high contention on tree lock, because jref()/jput() is
18360 + * frequent operation. To ameliorate this problem, RCU is used: when jput()
18361 + * is just about to release last reference on jnode it sets JNODE_RIP bit
18362 + * on it, and then proceed with jnode destruction (removing jnode from hash
18363 + * table, cbk_cache, detaching page, etc.). All places that change jnode
18364 + * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18365 + * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18366 + * jnode_rip_check() function), and pretend that nothing was found in hash
18367 + * table if bit is set.
18368 + *
18369 + * jput defers actual return of jnode into slab cache to some later time
18370 + * (by call_rcu()), this guarantees that other threads can safely continue
18371 + * working with JNODE_RIP-ped jnode.
18372 + *
18373 + */
18374 +
18375 +#include "reiser4.h"
18376 +#include "debug.h"
18377 +#include "dformat.h"
18378 +#include "jnode.h"
18379 +#include "plugin/plugin_header.h"
18380 +#include "plugin/plugin.h"
18381 +#include "txnmgr.h"
18382 +/*#include "jnode.h"*/
18383 +#include "znode.h"
18384 +#include "tree.h"
18385 +#include "tree_walk.h"
18386 +#include "super.h"
18387 +#include "inode.h"
18388 +#include "page_cache.h"
18389 +
18390 +#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18391 +#include <linux/types.h>
18392 +#include <linux/slab.h>
18393 +#include <linux/pagemap.h>
18394 +#include <linux/swap.h>
18395 +#include <linux/fs.h> /* for struct address_space */
18396 +#include <linux/writeback.h> /* for inode_lock */
18397 +
18398 +static struct kmem_cache *_jnode_slab = NULL;
18399 +
18400 +static void jnode_set_type(jnode * node, jnode_type type);
18401 +static int jdelete(jnode * node);
18402 +static int jnode_try_drop(jnode * node);
18403 +
18404 +#if REISER4_DEBUG
18405 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18406 +#endif
18407 +
18408 +/* true if valid page is attached to jnode */
18409 +static inline int jnode_is_parsed(jnode * node)
18410 +{
18411 + return JF_ISSET(node, JNODE_PARSED);
18412 +}
18413 +
18414 +/* hash table support */
18415 +
18416 +/* compare two jnode keys for equality. Used by hash-table macros */
18417 +static inline int jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2)
18418 +{
18419 + assert("nikita-2350", k1 != NULL);
18420 + assert("nikita-2351", k2 != NULL);
18421 +
18422 + return (k1->index == k2->index && k1->objectid == k2->objectid);
18423 +}
18424 +
18425 +/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18426 +static inline __u32
18427 +jnode_key_hashfn(j_hash_table * table, const jnode_key_t * key)
18428 +{
18429 + assert("nikita-2352", key != NULL);
18430 + assert("nikita-3346", IS_POW(table->_buckets));
18431 +
18432 + /* yes, this is a remarkably simple (if not stupid) hash function. */
18433 + return (key->objectid + key->index) & (table->_buckets - 1);
18434 +}
18435 +
18436 +/* The hash table definition */
18437 +#define KMALLOC(size) reiser4_vmalloc(size)
18438 +#define KFREE(ptr, size) vfree(ptr)
18439 +TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn,
18440 + jnode_key_eq);
18441 +#undef KFREE
18442 +#undef KMALLOC
18443 +
18444 +/* call this to initialise jnode hash table */
18445 +int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18446 +{
18447 + assert("nikita-2359", tree != NULL);
18448 + return j_hash_init(&tree->jhash_table, 16384);
18449 +}
18450 +
18451 +/* call this to destroy jnode hash table. This is called during umount. */
18452 +int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18453 +{
18454 + j_hash_table *jtable;
18455 + jnode *node;
18456 + jnode *next;
18457 +
18458 + assert("nikita-2360", tree != NULL);
18459 +
18460 + /*
18461 + * Scan hash table and free all jnodes.
18462 + */
18463 + jtable = &tree->jhash_table;
18464 + if (jtable->_table) {
18465 + for_all_in_htable(jtable, j, node, next) {
18466 + assert("nikita-2361", !atomic_read(&node->x_count));
18467 + jdrop(node);
18468 + }
18469 +
18470 + j_hash_done(&tree->jhash_table);
18471 + }
18472 + return 0;
18473 +}
18474 +
18475 +/**
18476 + * init_jnodes - create jnode cache
18477 + *
18478 + * Initializes slab cache jnodes. It is part of reiser4 module initialization.
18479 + */
18480 +int init_jnodes(void)
18481 +{
18482 + assert("umka-168", _jnode_slab == NULL);
18483 +
18484 + _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18485 + SLAB_HWCACHE_ALIGN |
18486 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
18487 + if (_jnode_slab == NULL)
18488 + return RETERR(-ENOMEM);
18489 +
18490 + return 0;
18491 +}
18492 +
18493 +/**
18494 + * done_jnodes - delete jnode cache
18495 + *
18496 + * This is called on reiser4 module unloading or system shutdown.
18497 + */
18498 +void done_jnodes(void)
18499 +{
18500 + destroy_reiser4_cache(&_jnode_slab);
18501 +}
18502 +
18503 +/* Initialize a jnode. */
18504 +void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18505 +{
18506 + assert("umka-175", node != NULL);
18507 +
18508 + memset(node, 0, sizeof(jnode));
18509 + ON_DEBUG(node->magic = JMAGIC);
18510 + jnode_set_type(node, type);
18511 + atomic_set(&node->d_count, 0);
18512 + atomic_set(&node->x_count, 0);
18513 + spin_lock_init(&node->guard);
18514 + spin_lock_init(&node->load);
18515 + node->atom = NULL;
18516 + node->tree = tree;
18517 + INIT_LIST_HEAD(&node->capture_link);
18518 +
18519 + ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18520 +
18521 + INIT_RCU_HEAD(&node->rcu);
18522 +
18523 +#if REISER4_DEBUG
18524 + {
18525 + reiser4_super_info_data *sbinfo;
18526 +
18527 + sbinfo = get_super_private(tree->super);
18528 + spin_lock_irq(&sbinfo->all_guard);
18529 + list_add(&node->jnodes, &sbinfo->all_jnodes);
18530 + spin_unlock_irq(&sbinfo->all_guard);
18531 + }
18532 +#endif
18533 +}
18534 +
18535 +#if REISER4_DEBUG
18536 +/*
18537 + * Remove jnode from ->all_jnodes list.
18538 + */
18539 +static void jnode_done(jnode * node, reiser4_tree * tree)
18540 +{
18541 + reiser4_super_info_data *sbinfo;
18542 +
18543 + sbinfo = get_super_private(tree->super);
18544 +
18545 + spin_lock_irq(&sbinfo->all_guard);
18546 + assert("nikita-2422", !list_empty(&node->jnodes));
18547 + list_del_init(&node->jnodes);
18548 + spin_unlock_irq(&sbinfo->all_guard);
18549 +}
18550 +#endif
18551 +
18552 +/* return already existing jnode of page */
18553 +jnode *jnode_by_page(struct page *pg)
18554 +{
18555 + assert("nikita-2066", pg != NULL);
18556 + assert("nikita-2400", PageLocked(pg));
18557 + assert("nikita-2068", PagePrivate(pg));
18558 + assert("nikita-2067", jprivate(pg) != NULL);
18559 + return jprivate(pg);
18560 +}
18561 +
18562 +/* exported functions to allocate/free jnode objects outside this file */
18563 +jnode *jalloc(void)
18564 +{
18565 + jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
18566 + return jal;
18567 +}
18568 +
18569 +/* return jnode back to the slab allocator */
18570 +inline void jfree(jnode * node)
18571 +{
18572 + assert("zam-449", node != NULL);
18573 +
18574 + assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18575 + NODE_LIST(node) == NOT_CAPTURED));
18576 + assert("nikita-3222", list_empty(&node->jnodes));
18577 + assert("nikita-3221", jnode_page(node) == NULL);
18578 +
18579 + /* not yet phash_jnode_destroy(node); */
18580 +
18581 + kmem_cache_free(_jnode_slab, node);
18582 +}
18583 +
18584 +/*
18585 + * This function is supplied as RCU callback. It actually frees jnode when
18586 + * last reference to it is gone.
18587 + */
18588 +static void jnode_free_actor(struct rcu_head *head)
18589 +{
18590 + jnode *node;
18591 + jnode_type jtype;
18592 +
18593 + node = container_of(head, jnode, rcu);
18594 + jtype = jnode_get_type(node);
18595 +
18596 + ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18597 +
18598 + switch (jtype) {
18599 + case JNODE_IO_HEAD:
18600 + case JNODE_BITMAP:
18601 + case JNODE_UNFORMATTED_BLOCK:
18602 + jfree(node);
18603 + break;
18604 + case JNODE_FORMATTED_BLOCK:
18605 + zfree(JZNODE(node));
18606 + break;
18607 + case JNODE_INODE:
18608 + default:
18609 + wrong_return_value("nikita-3197", "Wrong jnode type");
18610 + }
18611 +}
18612 +
18613 +/*
18614 + * Free a jnode. Post a callback to be executed later through RCU when all
18615 + * references to @node are released.
18616 + */
18617 +static inline void jnode_free(jnode * node, jnode_type jtype)
18618 +{
18619 + if (jtype != JNODE_INODE) {
18620 + /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18621 + call_rcu(&node->rcu, jnode_free_actor);
18622 + } else
18623 + jnode_list_remove(node);
18624 +}
18625 +
18626 +/* allocate new unformatted jnode */
18627 +static jnode *jnew_unformatted(void)
18628 +{
18629 + jnode *jal;
18630 +
18631 + jal = jalloc();
18632 + if (jal == NULL)
18633 + return NULL;
18634 +
18635 + jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18636 + jal->key.j.mapping = NULL;
18637 + jal->key.j.index = (unsigned long)-1;
18638 + jal->key.j.objectid = 0;
18639 + return jal;
18640 +}
18641 +
18642 +/* look for jnode with given mapping and offset within hash table */
18643 +jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18644 +{
18645 + jnode_key_t jkey;
18646 + jnode *node;
18647 +
18648 + assert("nikita-2353", tree != NULL);
18649 +
18650 + jkey.objectid = objectid;
18651 + jkey.index = index;
18652 +
18653 + /*
18654 + * hash table is _not_ protected by any lock during lookups. All we
18655 + * have to do is to disable preemption to keep RCU happy.
18656 + */
18657 +
18658 + rcu_read_lock();
18659 + node = j_hash_find(&tree->jhash_table, &jkey);
18660 + if (node != NULL) {
18661 + /* protect @node from recycling */
18662 + jref(node);
18663 + assert("nikita-2955", jnode_invariant(node, 0, 0));
18664 + node = jnode_rip_check(tree, node);
18665 + }
18666 + rcu_read_unlock();
18667 + return node;
18668 +}
18669 +
18670 +/* per inode radix tree of jnodes is protected by tree's read write spin lock */
18671 +static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
18672 +{
18673 + assert("vs-1694", mapping->host != NULL);
18674 +
18675 + return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
18676 +}
18677 +
18678 +jnode *jfind(struct address_space * mapping, unsigned long index)
18679 +{
18680 + reiser4_tree *tree;
18681 + jnode *node;
18682 +
18683 + assert("vs-1694", mapping->host != NULL);
18684 + tree = reiser4_tree_by_inode(mapping->host);
18685 +
18686 + read_lock_tree(tree);
18687 + node = jfind_nolock(mapping, index);
18688 + if (node != NULL)
18689 + jref(node);
18690 + read_unlock_tree(tree);
18691 + return node;
18692 +}
18693 +
18694 +static void inode_attach_jnode(jnode * node)
18695 +{
18696 + struct inode *inode;
18697 + reiser4_inode *info;
18698 + struct radix_tree_root *rtree;
18699 +
18700 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18701 + assert("zam-1043", node->key.j.mapping != NULL);
18702 + inode = node->key.j.mapping->host;
18703 + info = reiser4_inode_data(inode);
18704 + rtree = jnode_tree_by_reiser4_inode(info);
18705 + if (rtree->rnode == NULL) {
18706 + /* prevent inode from being pruned when it has jnodes attached
18707 + to it */
18708 + write_lock_irq(&inode->i_data.tree_lock);
18709 + inode->i_data.nrpages++;
18710 + write_unlock_irq(&inode->i_data.tree_lock);
18711 + }
18712 + assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
18713 + check_me("zam-1045",
18714 + !radix_tree_insert(rtree, node->key.j.index, node));
18715 + ON_DEBUG(info->nr_jnodes++);
18716 +}
18717 +
18718 +static void inode_detach_jnode(jnode * node)
18719 +{
18720 + struct inode *inode;
18721 + reiser4_inode *info;
18722 + struct radix_tree_root *rtree;
18723 +
18724 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18725 + assert("zam-1044", node->key.j.mapping != NULL);
18726 + inode = node->key.j.mapping->host;
18727 + info = reiser4_inode_data(inode);
18728 + rtree = jnode_tree_by_reiser4_inode(info);
18729 +
18730 + assert("zam-1051", info->nr_jnodes != 0);
18731 + assert("zam-1052", rtree->rnode != NULL);
18732 + ON_DEBUG(info->nr_jnodes--);
18733 +
18734 + /* delete jnode from inode's radix tree of jnodes */
18735 + check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
18736 + if (rtree->rnode == NULL) {
18737 + /* inode can be pruned now */
18738 + write_lock_irq(&inode->i_data.tree_lock);
18739 + inode->i_data.nrpages--;
18740 + write_unlock_irq(&inode->i_data.tree_lock);
18741 + }
18742 +}
18743 +
18744 +/* put jnode into hash table (where they can be found by flush who does not know
18745 + mapping) and to inode's tree of jnodes (where they can be found (hopefully
18746 + faster) in places where mapping is known). Currently it is used by
18747 + fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is
18748 + created */
18749 +static void
18750 +hash_unformatted_jnode(jnode * node, struct address_space *mapping,
18751 + unsigned long index)
18752 +{
18753 + j_hash_table *jtable;
18754 +
18755 + assert("vs-1446", jnode_is_unformatted(node));
18756 + assert("vs-1442", node->key.j.mapping == 0);
18757 + assert("vs-1443", node->key.j.objectid == 0);
18758 + assert("vs-1444", node->key.j.index == (unsigned long)-1);
18759 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18760 +
18761 + node->key.j.mapping = mapping;
18762 + node->key.j.objectid = get_inode_oid(mapping->host);
18763 + node->key.j.index = index;
18764 +
18765 + jtable = &jnode_get_tree(node)->jhash_table;
18766 +
18767 + /* race with some other thread inserting jnode into the hash table is
18768 + * impossible, because we keep the page lock. */
18769 + /*
18770 + * following assertion no longer holds because of RCU: it is possible
18771 + * jnode is in the hash table, but with JNODE_RIP bit set.
18772 + */
18773 + /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
18774 + j_hash_insert_rcu(jtable, node);
18775 + inode_attach_jnode(node);
18776 +}
18777 +
18778 +static void unhash_unformatted_node_nolock(jnode * node)
18779 +{
18780 + assert("vs-1683", node->key.j.mapping != NULL);
18781 + assert("vs-1684",
18782 + node->key.j.objectid ==
18783 + get_inode_oid(node->key.j.mapping->host));
18784 +
18785 + /* remove jnode from hash-table */
18786 + j_hash_remove_rcu(&node->tree->jhash_table, node);
18787 + inode_detach_jnode(node);
18788 + node->key.j.mapping = NULL;
18789 + node->key.j.index = (unsigned long)-1;
18790 + node->key.j.objectid = 0;
18791 +
18792 +}
18793 +
18794 +/* remove jnode from hash table and from inode's tree of jnodes. This is used in
18795 + reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
18796 + reiser4_uncapture_jnode */
18797 +void unhash_unformatted_jnode(jnode * node)
18798 +{
18799 + assert("vs-1445", jnode_is_unformatted(node));
18800 +
18801 + write_lock_tree(node->tree);
18802 + unhash_unformatted_node_nolock(node);
18803 + write_unlock_tree(node->tree);
18804 +}
18805 +
18806 +/*
18807 + * search hash table for a jnode with given oid and index. If not found,
18808 + * allocate new jnode, insert it, and also insert into radix tree for the
18809 + * given inode/mapping.
18810 + */
18811 +static jnode *find_get_jnode(reiser4_tree * tree,
18812 + struct address_space *mapping,
18813 + oid_t oid, unsigned long index)
18814 +{
18815 + jnode *result;
18816 + jnode *shadow;
18817 + int preload;
18818 +
18819 + result = jnew_unformatted();
18820 +
18821 + if (unlikely(result == NULL))
18822 + return ERR_PTR(RETERR(-ENOMEM));
18823 +
18824 + preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
18825 + if (preload != 0)
18826 + return ERR_PTR(preload);
18827 +
18828 + write_lock_tree(tree);
18829 + shadow = jfind_nolock(mapping, index);
18830 + if (likely(shadow == NULL)) {
18831 + /* add new jnode to hash table and inode's radix tree of jnodes */
18832 + jref(result);
18833 + hash_unformatted_jnode(result, mapping, index);
18834 + } else {
18835 + /* jnode is found in inode's radix tree of jnodes */
18836 + jref(shadow);
18837 + jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18838 + assert("vs-1498", shadow->key.j.mapping == mapping);
18839 + result = shadow;
18840 + }
18841 + write_unlock_tree(tree);
18842 +
18843 + assert("nikita-2955",
18844 + ergo(result != NULL, jnode_invariant(result, 0, 0)));
18845 + radix_tree_preload_end();
18846 + return result;
18847 +}
18848 +
18849 +/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
18850 + creates) jnode corresponding to page @pg. jnode is attached to page and
18851 + inserted into jnode hash-table. */
18852 +static jnode *do_jget(reiser4_tree * tree, struct page *pg)
18853 +{
18854 + /*
18855 + * There are two ways to create jnode: starting with pre-existing page
18856 + * and without page.
18857 + *
18858 + * When page already exists, jnode is created
18859 + * (jnode_of_page()->do_jget()) under page lock. This is done in
18860 + * ->writepage(), or when capturing anonymous page dirtied through
18861 + * mmap.
18862 + *
18863 + * Jnode without page is created by index_extent_jnode().
18864 + *
18865 + */
18866 +
18867 + jnode *result;
18868 + oid_t oid = get_inode_oid(pg->mapping->host);
18869 +
18870 + assert("umka-176", pg != NULL);
18871 + assert("nikita-2394", PageLocked(pg));
18872 +
18873 + result = jprivate(pg);
18874 + if (likely(result != NULL))
18875 + return jref(result);
18876 +
18877 + tree = reiser4_tree_by_page(pg);
18878 +
18879 + /* check hash-table first */
18880 + result = jfind(pg->mapping, pg->index);
18881 + if (unlikely(result != NULL)) {
18882 + spin_lock_jnode(result);
18883 + jnode_attach_page(result, pg);
18884 + spin_unlock_jnode(result);
18885 + result->key.j.mapping = pg->mapping;
18886 + return result;
18887 + }
18888 +
18889 + /* since page is locked, jnode should be allocated with GFP_NOFS flag */
18890 + reiser4_ctx_gfp_mask_force(GFP_NOFS);
18891 + result = find_get_jnode(tree, pg->mapping, oid, pg->index);
18892 + if (unlikely(IS_ERR(result)))
18893 + return result;
18894 + /* attach jnode to page */
18895 + spin_lock_jnode(result);
18896 + jnode_attach_page(result, pg);
18897 + spin_unlock_jnode(result);
18898 + return result;
18899 +}
18900 +
18901 +/*
18902 + * return jnode for @pg, creating it if necessary.
18903 + */
18904 +jnode *jnode_of_page(struct page * pg)
18905 +{
18906 + jnode *result;
18907 +
18908 + assert("umka-176", pg != NULL);
18909 + assert("nikita-2394", PageLocked(pg));
18910 +
18911 + result = do_jget(reiser4_tree_by_page(pg), pg);
18912 +
18913 + if (REISER4_DEBUG && !IS_ERR(result)) {
18914 + assert("nikita-3210", result == jprivate(pg));
18915 + assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
18916 + if (jnode_is_unformatted(jprivate(pg))) {
18917 + assert("nikita-2364",
18918 + jprivate(pg)->key.j.index == pg->index);
18919 + assert("nikita-2367",
18920 + jprivate(pg)->key.j.mapping == pg->mapping);
18921 + assert("nikita-2365",
18922 + jprivate(pg)->key.j.objectid ==
18923 + get_inode_oid(pg->mapping->host));
18924 + assert("vs-1200",
18925 + jprivate(pg)->key.j.objectid ==
18926 + pg->mapping->host->i_ino);
18927 + assert("nikita-2356",
18928 + jnode_is_unformatted(jnode_by_page(pg)));
18929 + }
18930 + assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
18931 + }
18932 + return result;
18933 +}
18934 +
18935 +/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
18936 + * page.*/
18937 +void jnode_attach_page(jnode * node, struct page *pg)
18938 +{
18939 + assert("nikita-2060", node != NULL);
18940 + assert("nikita-2061", pg != NULL);
18941 +
18942 + assert("nikita-2050", jprivate(pg) == 0ul);
18943 + assert("nikita-2393", !PagePrivate(pg));
18944 + assert("vs-1741", node->pg == NULL);
18945 +
18946 + assert("nikita-2396", PageLocked(pg));
18947 + assert_spin_locked(&(node->guard));
18948 +
18949 + page_cache_get(pg);
18950 + set_page_private(pg, (unsigned long)node);
18951 + node->pg = pg;
18952 + SetPagePrivate(pg);
18953 +}
18954 +
18955 +/* Dual to jnode_attach_page: break a binding between page and jnode */
18956 +void page_clear_jnode(struct page *page, jnode * node)
18957 +{
18958 + assert("nikita-2424", page != NULL);
18959 + assert("nikita-2425", PageLocked(page));
18960 + assert("nikita-2426", node != NULL);
18961 + assert_spin_locked(&(node->guard));
18962 + assert("nikita-2428", PagePrivate(page));
18963 +
18964 + assert("nikita-3551", !PageWriteback(page));
18965 +
18966 + JF_CLR(node, JNODE_PARSED);
18967 + set_page_private(page, 0ul);
18968 + ClearPagePrivate(page);
18969 + node->pg = NULL;
18970 + page_cache_release(page);
18971 +}
18972 +
18973 +#if 0
18974 +/* it is only used in one place to handle error */
18975 +void
18976 +page_detach_jnode(struct page *page, struct address_space *mapping,
18977 + unsigned long index)
18978 +{
18979 + assert("nikita-2395", page != NULL);
18980 +
18981 + lock_page(page);
18982 + if ((page->mapping == mapping) && (page->index == index)
18983 + && PagePrivate(page)) {
18984 + jnode *node;
18985 +
18986 + node = jprivate(page);
18987 + spin_lock_jnode(node);
18988 + page_clear_jnode(page, node);
18989 + spin_unlock_jnode(node);
18990 + }
18991 + unlock_page(page);
18992 +}
18993 +#endif /* 0 */
18994 +
18995 +/* return @node page locked.
18996 +
18997 + Locking ordering requires that one first takes page lock and afterwards
18998 + spin lock on node attached to this page. Sometimes it is necessary to go in
18999 + the opposite direction. This is done through standard trylock-and-release
19000 + loop.
19001 +*/
19002 +static struct page *jnode_lock_page(jnode * node)
19003 +{
19004 + struct page *page;
19005 +
19006 + assert("nikita-2052", node != NULL);
19007 + assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19008 +
19009 + while (1) {
19010 +
19011 + spin_lock_jnode(node);
19012 + page = jnode_page(node);
19013 + if (page == NULL) {
19014 + break;
19015 + }
19016 +
19017 + /* no need to page_cache_get( page ) here, because page cannot
19018 + be evicted from memory without detaching it from jnode and
19019 + this requires spin lock on jnode that we already hold.
19020 + */
19021 + if (!TestSetPageLocked(page)) {
19022 + /* We won a lock on jnode page, proceed. */
19023 + break;
19024 + }
19025 +
19026 + /* Page is locked by someone else. */
19027 + page_cache_get(page);
19028 + spin_unlock_jnode(node);
19029 + wait_on_page_locked(page);
19030 + /* it is possible that page was detached from jnode and
19031 + returned to the free pool, or re-assigned while we were
19032 + waiting on locked bit. This will be rechecked on the next
19033 + loop iteration.
19034 + */
19035 + page_cache_release(page);
19036 +
19037 + /* try again */
19038 + }
19039 + return page;
19040 +}
19041 +
19042 +/*
19043 + * is JNODE_PARSED bit is not set, call ->parse() method of jnode, to verify
19044 + * validness of jnode content.
19045 + */
19046 +static inline int jparse(jnode * node)
19047 +{
19048 + int result;
19049 +
19050 + assert("nikita-2466", node != NULL);
19051 +
19052 + spin_lock_jnode(node);
19053 + if (likely(!jnode_is_parsed(node))) {
19054 + result = jnode_ops(node)->parse(node);
19055 + if (likely(result == 0))
19056 + JF_SET(node, JNODE_PARSED);
19057 + } else
19058 + result = 0;
19059 + spin_unlock_jnode(node);
19060 + return result;
19061 +}
19062 +
19063 +/* Lock a page attached to jnode, create and attach page to jnode if it had no
19064 + * one. */
19065 +static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19066 +{
19067 + struct page *page;
19068 +
19069 + spin_lock_jnode(node);
19070 + page = jnode_page(node);
19071 +
19072 + if (page == NULL) {
19073 + spin_unlock_jnode(node);
19074 + page = find_or_create_page(jnode_get_mapping(node),
19075 + jnode_get_index(node), gfp_flags);
19076 + if (page == NULL)
19077 + return ERR_PTR(RETERR(-ENOMEM));
19078 + } else {
19079 + if (!TestSetPageLocked(page)) {
19080 + spin_unlock_jnode(node);
19081 + return page;
19082 + }
19083 + page_cache_get(page);
19084 + spin_unlock_jnode(node);
19085 + lock_page(page);
19086 + assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19087 + }
19088 +
19089 + spin_lock_jnode(node);
19090 + if (!jnode_page(node))
19091 + jnode_attach_page(node, page);
19092 + spin_unlock_jnode(node);
19093 +
19094 + page_cache_release(page);
19095 + assert("zam-894", jnode_page(node) == page);
19096 + return page;
19097 +}
19098 +
19099 +/* Start read operation for jnode's page if page is not up-to-date. */
19100 +static int jnode_start_read(jnode * node, struct page *page)
19101 +{
19102 + assert("zam-893", PageLocked(page));
19103 +
19104 + if (PageUptodate(page)) {
19105 + unlock_page(page);
19106 + return 0;
19107 + }
19108 + return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19109 +}
19110 +
19111 +#if REISER4_DEBUG
19112 +static void check_jload(jnode * node, struct page *page)
19113 +{
19114 + if (jnode_is_znode(node)) {
19115 + node40_header *nh;
19116 + znode *z;
19117 +
19118 + z = JZNODE(node);
19119 + if (znode_is_any_locked(z)) {
19120 + nh = (node40_header *) kmap(page);
19121 + /* this only works for node40-only file systems. For
19122 + * debugging. */
19123 + assert("nikita-3253",
19124 + z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19125 + kunmap(page);
19126 + }
19127 + assert("nikita-3565", znode_invariant(z));
19128 + }
19129 +}
19130 +#else
19131 +#define check_jload(node, page) noop
19132 +#endif
19133 +
19134 +/* prefetch jnode to speed up next call to jload. Call this when you are going
19135 + * to call jload() shortly. This will bring appropriate portion of jnode into
19136 + * CPU cache. */
19137 +void jload_prefetch(jnode * node)
19138 +{
19139 + prefetchw(&node->x_count);
19140 +}
19141 +
19142 +/* load jnode's data into memory */
19143 +int jload_gfp(jnode * node /* node to load */ ,
19144 + gfp_t gfp_flags /* allocation flags */ ,
19145 + int do_kmap /* true if page should be kmapped */ )
19146 +{
19147 + struct page *page;
19148 + int result = 0;
19149 + int parsed;
19150 +
19151 + assert("nikita-3010", reiser4_schedulable());
19152 +
19153 + prefetchw(&node->pg);
19154 +
19155 + /* taking d-reference implies taking x-reference. */
19156 + jref(node);
19157 +
19158 + /*
19159 + * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19160 + * should be atomic, otherwise there is a race against
19161 + * reiser4_releasepage().
19162 + */
19163 + spin_lock(&(node->load));
19164 + add_d_ref(node);
19165 + parsed = jnode_is_parsed(node);
19166 + spin_unlock(&(node->load));
19167 +
19168 + if (unlikely(!parsed)) {
19169 + page = jnode_get_page_locked(node, gfp_flags);
19170 + if (unlikely(IS_ERR(page))) {
19171 + result = PTR_ERR(page);
19172 + goto failed;
19173 + }
19174 +
19175 + result = jnode_start_read(node, page);
19176 + if (unlikely(result != 0))
19177 + goto failed;
19178 +
19179 + wait_on_page_locked(page);
19180 + if (unlikely(!PageUptodate(page))) {
19181 + result = RETERR(-EIO);
19182 + goto failed;
19183 + }
19184 +
19185 + if (do_kmap)
19186 + node->data = kmap(page);
19187 +
19188 + result = jparse(node);
19189 + if (unlikely(result != 0)) {
19190 + if (do_kmap)
19191 + kunmap(page);
19192 + goto failed;
19193 + }
19194 + check_jload(node, page);
19195 + } else {
19196 + page = jnode_page(node);
19197 + check_jload(node, page);
19198 + if (do_kmap)
19199 + node->data = kmap(page);
19200 + }
19201 +
19202 + if (!is_writeout_mode())
19203 + /* We do not mark pages active if jload is called as a part of
19204 + * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
19205 + * and write_logs() add no value to cached data, there is no
19206 + * sense to mark pages as active when they go to disk, it just
19207 + * confuses vm scanning routines because clean page could be
19208 + * moved out from inactive list as a result of this
19209 + * mark_page_accessed() call. */
19210 + mark_page_accessed(page);
19211 +
19212 + return 0;
19213 +
19214 + failed:
19215 + jrelse_tail(node);
19216 + return result;
19217 +
19218 +}
19219 +
19220 +/* start asynchronous reading for given jnode's page. */
19221 +int jstartio(jnode * node)
19222 +{
19223 + struct page *page;
19224 +
19225 + page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19226 + if (IS_ERR(page))
19227 + return PTR_ERR(page);
19228 +
19229 + return jnode_start_read(node, page);
19230 +}
19231 +
19232 +/* Initialize a node by calling appropriate plugin instead of reading
19233 + * node from disk as in jload(). */
19234 +int jinit_new(jnode * node, gfp_t gfp_flags)
19235 +{
19236 + struct page *page;
19237 + int result;
19238 +
19239 + jref(node);
19240 + add_d_ref(node);
19241 +
19242 + page = jnode_get_page_locked(node, gfp_flags);
19243 + if (IS_ERR(page)) {
19244 + result = PTR_ERR(page);
19245 + goto failed;
19246 + }
19247 +
19248 + SetPageUptodate(page);
19249 + unlock_page(page);
19250 +
19251 + node->data = kmap(page);
19252 +
19253 + if (!jnode_is_parsed(node)) {
19254 + jnode_plugin *jplug = jnode_ops(node);
19255 + spin_lock_jnode(node);
19256 + result = jplug->init(node);
19257 + spin_unlock_jnode(node);
19258 + if (result) {
19259 + kunmap(page);
19260 + goto failed;
19261 + }
19262 + JF_SET(node, JNODE_PARSED);
19263 + }
19264 +
19265 + return 0;
19266 +
19267 + failed:
19268 + jrelse(node);
19269 + return result;
19270 +}
19271 +
19272 +/* release a reference to jnode acquired by jload(), decrement ->d_count */
19273 +void jrelse_tail(jnode * node /* jnode to release references to */ )
19274 +{
19275 + assert("nikita-489", atomic_read(&node->d_count) > 0);
19276 + atomic_dec(&node->d_count);
19277 + /* release reference acquired in jload_gfp() or jinit_new() */
19278 + jput(node);
19279 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
19280 + LOCK_CNT_DEC(d_refs);
19281 +}
19282 +
19283 +/* drop reference to node data. When last reference is dropped, data are
19284 + unloaded. */
19285 +void jrelse(jnode * node /* jnode to release references to */ )
19286 +{
19287 + struct page *page;
19288 +
19289 + assert("nikita-487", node != NULL);
19290 + assert_spin_not_locked(&(node->guard));
19291 +
19292 + page = jnode_page(node);
19293 + if (likely(page != NULL)) {
19294 + /*
19295 + * it is safe not to lock jnode here, because at this point
19296 + * @node->d_count is greater than zero (if jrelse() is used
19297 + * correctly, that is). JNODE_PARSED may be not set yet, if,
19298 + * for example, we got here as a result of error handling path
19299 + * in jload(). Anyway, page cannot be detached by
19300 + * reiser4_releasepage(). truncate will invalidate page
19301 + * regardless, but this should not be a problem.
19302 + */
19303 + kunmap(page);
19304 + }
19305 + jrelse_tail(node);
19306 +}
19307 +
19308 +/* called from jput() to wait for io completion */
19309 +static void jnode_finish_io(jnode * node)
19310 +{
19311 + struct page *page;
19312 +
19313 + assert("nikita-2922", node != NULL);
19314 +
19315 + spin_lock_jnode(node);
19316 + page = jnode_page(node);
19317 + if (page != NULL) {
19318 + page_cache_get(page);
19319 + spin_unlock_jnode(node);
19320 + wait_on_page_writeback(page);
19321 + page_cache_release(page);
19322 + } else
19323 + spin_unlock_jnode(node);
19324 +}
19325 +
19326 +/*
19327 + * This is called by jput() when last reference to jnode is released. This is
19328 + * separate function, because we want fast path of jput() to be inline and,
19329 + * therefore, small.
19330 + */
19331 +void jput_final(jnode * node)
19332 +{
19333 + int r_i_p;
19334 +
19335 + /* A fast check for keeping node in cache. We always keep node in cache
19336 + * if its page is present and node was not marked for deletion */
19337 + if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19338 + rcu_read_unlock();
19339 + return;
19340 + }
19341 + assert("edward-1432", node->page_count == 0);
19342 +
19343 + r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19344 + /*
19345 + * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19346 + * this case it is safe to access node after unlock.
19347 + */
19348 + rcu_read_unlock();
19349 + if (r_i_p) {
19350 + jnode_finish_io(node);
19351 + if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19352 + /* node is removed from the tree. */
19353 + jdelete(node);
19354 + else
19355 + jnode_try_drop(node);
19356 + }
19357 + /* if !r_i_p some other thread is already killing it */
19358 +}
19359 +
19360 +int jwait_io(jnode * node, int rw)
19361 +{
19362 + struct page *page;
19363 + int result;
19364 +
19365 + assert("zam-447", node != NULL);
19366 + assert("zam-448", jnode_page(node) != NULL);
19367 +
19368 + page = jnode_page(node);
19369 +
19370 + result = 0;
19371 + if (rw == READ) {
19372 + wait_on_page_locked(page);
19373 + } else {
19374 + assert("nikita-2227", rw == WRITE);
19375 + wait_on_page_writeback(page);
19376 + }
19377 + if (PageError(page))
19378 + result = RETERR(-EIO);
19379 +
19380 + return result;
19381 +}
19382 +
19383 +/*
19384 + * jnode types and plugins.
19385 + *
19386 + * jnode by itself is a "base type". There are several different jnode
19387 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19388 + * has to do different things based on jnode type. In the standard reiser4 way
19389 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19390 + *
19391 + * Functions below deal with jnode types and define methods of jnode plugin.
19392 + *
19393 + */
19394 +
19395 +/* set jnode type. This is done during jnode initialization. */
19396 +static void jnode_set_type(jnode * node, jnode_type type)
19397 +{
19398 + static unsigned long type_to_mask[] = {
19399 + [JNODE_UNFORMATTED_BLOCK] = 1,
19400 + [JNODE_FORMATTED_BLOCK] = 0,
19401 + [JNODE_BITMAP] = 2,
19402 + [JNODE_IO_HEAD] = 6,
19403 + [JNODE_INODE] = 4
19404 + };
19405 +
19406 + assert("zam-647", type < LAST_JNODE_TYPE);
19407 + assert("nikita-2815", !jnode_is_loaded(node));
19408 + assert("nikita-3386", node->state == 0);
19409 +
19410 + node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19411 +}
19412 +
19413 +/* ->init() method of jnode plugin for jnodes that don't require plugin
19414 + * specific initialization. */
19415 +static int init_noinit(jnode * node UNUSED_ARG)
19416 +{
19417 + return 0;
19418 +}
19419 +
19420 +/* ->parse() method of jnode plugin for jnodes that don't require plugin
19421 + * specific pasring. */
19422 +static int parse_noparse(jnode * node UNUSED_ARG)
19423 +{
19424 + return 0;
19425 +}
19426 +
19427 +/* ->mapping() method for unformatted jnode */
19428 +struct address_space *mapping_jnode(const jnode * node)
19429 +{
19430 + struct address_space *map;
19431 +
19432 + assert("nikita-2713", node != NULL);
19433 +
19434 + /* mapping is stored in jnode */
19435 +
19436 + map = node->key.j.mapping;
19437 + assert("nikita-2714", map != NULL);
19438 + assert("nikita-2897", is_reiser4_inode(map->host));
19439 + assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19440 + return map;
19441 +}
19442 +
19443 +/* ->index() method for unformatted jnodes */
19444 +unsigned long index_jnode(const jnode * node)
19445 +{
19446 + /* index is stored in jnode */
19447 + return node->key.j.index;
19448 +}
19449 +
19450 +/* ->remove() method for unformatted jnodes */
19451 +static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19452 +{
19453 + /* remove jnode from hash table and radix tree */
19454 + if (node->key.j.mapping)
19455 + unhash_unformatted_node_nolock(node);
19456 +}
19457 +
19458 +/* ->mapping() method for znodes */
19459 +static struct address_space *mapping_znode(const jnode * node)
19460 +{
19461 + /* all znodes belong to fake inode */
19462 + return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19463 +}
19464 +
19465 +/* ->index() method for znodes */
19466 +static unsigned long index_znode(const jnode * node)
19467 +{
19468 + unsigned long addr;
19469 + assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19470 +
19471 + /* index of znode is just its address (shifted) */
19472 + addr = (unsigned long)node;
19473 + return (addr - PAGE_OFFSET) >> znode_shift_order;
19474 +}
19475 +
19476 +/* ->mapping() method for bitmap jnode */
19477 +static struct address_space *mapping_bitmap(const jnode * node)
19478 +{
19479 + /* all bitmap blocks belong to special bitmap inode */
19480 + return get_super_private(jnode_get_tree(node)->super)->bitmap->
19481 + i_mapping;
19482 +}
19483 +
19484 +/* ->index() method for jnodes that are indexed by address */
19485 +static unsigned long index_is_address(const jnode * node)
19486 +{
19487 + unsigned long ind;
19488 +
19489 + ind = (unsigned long)node;
19490 + return ind - PAGE_OFFSET;
19491 +}
19492 +
19493 +/* resolve race with jput */
19494 +jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19495 +{
19496 + /*
19497 + * This is used as part of RCU-based jnode handling.
19498 + *
19499 + * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19500 + * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19501 + * not protected during this, so concurrent thread may execute
19502 + * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
19503 + * freed in jput_final(). To avoid such races, jput_final() sets
19504 + * JNODE_RIP on jnode (under tree lock). All places that work with
19505 + * unreferenced jnodes call this function. It checks for JNODE_RIP bit
19506 + * (first without taking tree lock), and if this bit is set, released
19507 + * reference acquired by the current thread and returns NULL.
19508 + *
19509 + * As a result, if jnode is being concurrently freed, NULL is returned
19510 + * and caller should pretend that jnode wasn't found in the first
19511 + * place.
19512 + *
19513 + * Otherwise it's safe to release "rcu-read-lock" and continue with
19514 + * jnode.
19515 + */
19516 + if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19517 + read_lock_tree(tree);
19518 + if (JF_ISSET(node, JNODE_RIP)) {
19519 + dec_x_ref(node);
19520 + node = NULL;
19521 + }
19522 + read_unlock_tree(tree);
19523 + }
19524 + return node;
19525 +}
19526 +
19527 +reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19528 +{
19529 + struct inode *inode;
19530 + item_plugin *iplug;
19531 + loff_t off;
19532 +
19533 + assert("nikita-3092", node != NULL);
19534 + assert("nikita-3093", key != NULL);
19535 + assert("nikita-3094", jnode_is_unformatted(node));
19536 +
19537 + off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19538 + inode = mapping_jnode(node)->host;
19539 +
19540 + if (node->parent_item_id != 0)
19541 + iplug = item_plugin_by_id(node->parent_item_id);
19542 + else
19543 + iplug = NULL;
19544 +
19545 + if (iplug != NULL && iplug->f.key_by_offset)
19546 + iplug->f.key_by_offset(inode, off, key);
19547 + else {
19548 + file_plugin *fplug;
19549 +
19550 + fplug = inode_file_plugin(inode);
19551 + assert("zam-1007", fplug != NULL);
19552 + assert("zam-1008", fplug->key_by_inode != NULL);
19553 +
19554 + fplug->key_by_inode(inode, off, key);
19555 + }
19556 +
19557 + return key;
19558 +}
19559 +
19560 +/* ->parse() method for formatted nodes */
19561 +static int parse_znode(jnode * node)
19562 +{
19563 + return zparse(JZNODE(node));
19564 +}
19565 +
19566 +/* ->delete() method for formatted nodes */
19567 +static void delete_znode(jnode * node, reiser4_tree * tree)
19568 +{
19569 + znode *z;
19570 +
19571 + assert_rw_write_locked(&(tree->tree_lock));
19572 + assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19573 +
19574 + z = JZNODE(node);
19575 + assert("vs-899", z->c_count == 0);
19576 +
19577 + /* delete znode from sibling list. */
19578 + sibling_list_remove(z);
19579 +
19580 + znode_remove(z, tree);
19581 +}
19582 +
19583 +/* ->remove() method for formatted nodes */
19584 +static int remove_znode(jnode * node, reiser4_tree * tree)
19585 +{
19586 + znode *z;
19587 +
19588 + assert_rw_write_locked(&(tree->tree_lock));
19589 + z = JZNODE(node);
19590 +
19591 + if (z->c_count == 0) {
19592 + /* detach znode from sibling list. */
19593 + sibling_list_drop(z);
19594 + /* this is called with tree spin-lock held, so call
19595 + znode_remove() directly (rather than znode_lock_remove()). */
19596 + znode_remove(z, tree);
19597 + return 0;
19598 + }
19599 + return RETERR(-EBUSY);
19600 +}
19601 +
19602 +/* ->init() method for formatted nodes */
19603 +static int init_znode(jnode * node)
19604 +{
19605 + znode *z;
19606 +
19607 + z = JZNODE(node);
19608 + /* call node plugin to do actual initialization */
19609 + return z->nplug->init(z);
19610 +}
19611 +
19612 +/* ->clone() method for formatted nodes */
19613 +static jnode *clone_formatted(jnode * node)
19614 +{
19615 + znode *clone;
19616 +
19617 + assert("vs-1430", jnode_is_znode(node));
19618 + clone = zalloc(reiser4_ctx_gfp_mask_get());
19619 + if (clone == NULL)
19620 + return ERR_PTR(RETERR(-ENOMEM));
19621 + zinit(clone, NULL, current_tree);
19622 + jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19623 + /* ZJNODE(clone)->key.z is not initialized */
19624 + clone->level = JZNODE(node)->level;
19625 +
19626 + return ZJNODE(clone);
19627 +}
19628 +
19629 +/* jplug->clone for unformatted nodes */
19630 +static jnode *clone_unformatted(jnode * node)
19631 +{
19632 + jnode *clone;
19633 +
19634 + assert("vs-1431", jnode_is_unformatted(node));
19635 + clone = jalloc();
19636 + if (clone == NULL)
19637 + return ERR_PTR(RETERR(-ENOMEM));
19638 +
19639 + jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19640 + jnode_set_block(clone, jnode_get_block(node));
19641 +
19642 + return clone;
19643 +
19644 +}
19645 +
19646 +/*
19647 + * Setup jnode plugin methods for various jnode types.
19648 + */
19649 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19650 + [JNODE_UNFORMATTED_BLOCK] = {
19651 + .h = {
19652 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19653 + .id = JNODE_UNFORMATTED_BLOCK,
19654 + .pops = NULL,
19655 + .label = "unformatted",
19656 + .desc = "unformatted node",
19657 + .linkage = {NULL, NULL}
19658 + },
19659 + .init = init_noinit,
19660 + .parse = parse_noparse,
19661 + .mapping = mapping_jnode,
19662 + .index = index_jnode,
19663 + .clone = clone_unformatted
19664 + },
19665 + [JNODE_FORMATTED_BLOCK] = {
19666 + .h = {
19667 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19668 + .id = JNODE_FORMATTED_BLOCK,
19669 + .pops = NULL,
19670 + .label = "formatted",
19671 + .desc = "formatted tree node",
19672 + .linkage = {NULL, NULL}
19673 + },
19674 + .init = init_znode,
19675 + .parse = parse_znode,
19676 + .mapping = mapping_znode,
19677 + .index = index_znode,
19678 + .clone = clone_formatted
19679 + },
19680 + [JNODE_BITMAP] = {
19681 + .h = {
19682 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19683 + .id = JNODE_BITMAP,
19684 + .pops = NULL,
19685 + .label = "bitmap",
19686 + .desc = "bitmap node",
19687 + .linkage = {NULL, NULL}
19688 + },
19689 + .init = init_noinit,
19690 + .parse = parse_noparse,
19691 + .mapping = mapping_bitmap,
19692 + .index = index_is_address,
19693 + .clone = NULL
19694 + },
19695 + [JNODE_IO_HEAD] = {
19696 + .h = {
19697 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19698 + .id = JNODE_IO_HEAD,
19699 + .pops = NULL,
19700 + .label = "io head",
19701 + .desc = "io head",
19702 + .linkage = {NULL, NULL}
19703 + },
19704 + .init = init_noinit,
19705 + .parse = parse_noparse,
19706 + .mapping = mapping_bitmap,
19707 + .index = index_is_address,
19708 + .clone = NULL
19709 + },
19710 + [JNODE_INODE] = {
19711 + .h = {
19712 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19713 + .id = JNODE_INODE,
19714 + .pops = NULL,
19715 + .label = "inode",
19716 + .desc = "inode's builtin jnode",
19717 + .linkage = {NULL, NULL}
19718 + },
19719 + .init = NULL,
19720 + .parse = NULL,
19721 + .mapping = NULL,
19722 + .index = NULL,
19723 + .clone = NULL
19724 + }
19725 +};
19726 +
19727 +/*
19728 + * jnode destruction.
19729 + *
19730 + * Thread may use a jnode after it acquired a reference to it. References are
19731 + * counted in ->x_count field. Reference protects jnode from being
19732 + * recycled. This is different from protecting jnode data (that are stored in
19733 + * jnode page) from being evicted from memory. Data are protected by jload()
19734 + * and released by jrelse().
19735 + *
19736 + * If thread already possesses a reference to the jnode it can acquire another
19737 + * one through jref(). Initial reference is obtained (usually) by locating
19738 + * jnode in some indexing structure that depends on jnode type: formatted
19739 + * nodes are kept in global hash table, where they are indexed by block
19740 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
19741 + * table, which is indexed by oid and offset within file, and in per-inode
19742 + * radix tree.
19743 + *
19744 + * Reference to jnode is released by jput(). If last reference is released,
19745 + * jput_final() is called. This function determines whether jnode has to be
19746 + * deleted (this happens when corresponding node is removed from the file
19747 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
19748 + * should be just "removed" (deleted from memory).
19749 + *
19750 + * Jnode destruction is signally delicate dance because of locking and RCU.
19751 + */
19752 +
19753 +/*
19754 + * Returns true if jnode cannot be removed right now. This check is called
19755 + * under tree lock. If it returns true, jnode is irrevocably committed to be
19756 + * deleted/removed.
19757 + */
19758 +static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
19759 +{
19760 + /* if other thread managed to acquire a reference to this jnode, don't
19761 + * free it. */
19762 + if (atomic_read(&node->x_count) > 0)
19763 + return 1;
19764 + /* also, don't free znode that has children in memory */
19765 + if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
19766 + return 1;
19767 + return 0;
19768 +}
19769 +
19770 +/*
19771 + * this is called as part of removing jnode. Based on jnode type, call
19772 + * corresponding function that removes jnode from indices and returns it back
19773 + * to the appropriate slab (through RCU).
19774 + */
19775 +static inline void
19776 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
19777 +{
19778 + switch (jtype) {
19779 + case JNODE_UNFORMATTED_BLOCK:
19780 + remove_jnode(node, tree);
19781 + break;
19782 + case JNODE_IO_HEAD:
19783 + case JNODE_BITMAP:
19784 + break;
19785 + case JNODE_INODE:
19786 + break;
19787 + case JNODE_FORMATTED_BLOCK:
19788 + remove_znode(node, tree);
19789 + break;
19790 + default:
19791 + wrong_return_value("nikita-3196", "Wrong jnode type");
19792 + }
19793 +}
19794 +
19795 +/*
19796 + * this is called as part of deleting jnode. Based on jnode type, call
19797 + * corresponding function that removes jnode from indices and returns it back
19798 + * to the appropriate slab (through RCU).
19799 + *
19800 + * This differs from jnode_remove() only for formatted nodes---for them
19801 + * sibling list handling is different for removal and deletion.
19802 + */
19803 +static inline void
19804 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
19805 +{
19806 + switch (jtype) {
19807 + case JNODE_UNFORMATTED_BLOCK:
19808 + remove_jnode(node, tree);
19809 + break;
19810 + case JNODE_IO_HEAD:
19811 + case JNODE_BITMAP:
19812 + break;
19813 + case JNODE_FORMATTED_BLOCK:
19814 + delete_znode(node, tree);
19815 + break;
19816 + case JNODE_INODE:
19817 + default:
19818 + wrong_return_value("nikita-3195", "Wrong jnode type");
19819 + }
19820 +}
19821 +
19822 +#if REISER4_DEBUG
19823 +/*
19824 + * remove jnode from the debugging list of all jnodes hanging off super-block.
19825 + */
19826 +void jnode_list_remove(jnode * node)
19827 +{
19828 + reiser4_super_info_data *sbinfo;
19829 +
19830 + sbinfo = get_super_private(jnode_get_tree(node)->super);
19831 +
19832 + spin_lock_irq(&sbinfo->all_guard);
19833 + assert("nikita-2422", !list_empty(&node->jnodes));
19834 + list_del_init(&node->jnodes);
19835 + spin_unlock_irq(&sbinfo->all_guard);
19836 +}
19837 +#endif
19838 +
19839 +/*
19840 + * this is called by jput_final() to remove jnode when last reference to it is
19841 + * released.
19842 + */
19843 +static int jnode_try_drop(jnode * node)
19844 +{
19845 + int result;
19846 + reiser4_tree *tree;
19847 + jnode_type jtype;
19848 +
19849 + assert("nikita-2491", node != NULL);
19850 + assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
19851 +
19852 + tree = jnode_get_tree(node);
19853 + jtype = jnode_get_type(node);
19854 +
19855 + spin_lock_jnode(node);
19856 + write_lock_tree(tree);
19857 + /*
19858 + * if jnode has a page---leave it alone. Memory pressure will
19859 + * eventually kill page and jnode.
19860 + */
19861 + if (jnode_page(node) != NULL) {
19862 + write_unlock_tree(tree);
19863 + spin_unlock_jnode(node);
19864 + JF_CLR(node, JNODE_RIP);
19865 + return RETERR(-EBUSY);
19866 + }
19867 +
19868 + /* re-check ->x_count under tree lock. */
19869 + result = jnode_is_busy(node, jtype);
19870 + if (result == 0) {
19871 + assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19872 + assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
19873 +
19874 + spin_unlock_jnode(node);
19875 + /* no page and no references---despatch him. */
19876 + jnode_remove(node, jtype, tree);
19877 + write_unlock_tree(tree);
19878 + jnode_free(node, jtype);
19879 + } else {
19880 + /* busy check failed: reference was acquired by concurrent
19881 + * thread. */
19882 + write_unlock_tree(tree);
19883 + spin_unlock_jnode(node);
19884 + JF_CLR(node, JNODE_RIP);
19885 + }
19886 + return result;
19887 +}
19888 +
19889 +/* jdelete() -- Delete jnode from the tree and file system */
19890 +static int jdelete(jnode * node /* jnode to finish with */ )
19891 +{
19892 + struct page *page;
19893 + int result;
19894 + reiser4_tree *tree;
19895 + jnode_type jtype;
19896 +
19897 + assert("nikita-467", node != NULL);
19898 + assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
19899 +
19900 + jtype = jnode_get_type(node);
19901 +
19902 + page = jnode_lock_page(node);
19903 + assert_spin_locked(&(node->guard));
19904 +
19905 + tree = jnode_get_tree(node);
19906 +
19907 + write_lock_tree(tree);
19908 + /* re-check ->x_count under tree lock. */
19909 + result = jnode_is_busy(node, jtype);
19910 + if (likely(!result)) {
19911 + assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19912 + assert("jmacd-511", atomic_read(&node->d_count) == 0);
19913 +
19914 + /* detach page */
19915 + if (page != NULL) {
19916 + /*
19917 + * FIXME this is racy against jnode_extent_write().
19918 + */
19919 + page_clear_jnode(page, node);
19920 + }
19921 + spin_unlock_jnode(node);
19922 + /* goodbye */
19923 + jnode_delete(node, jtype, tree);
19924 + write_unlock_tree(tree);
19925 + jnode_free(node, jtype);
19926 + /* @node is no longer valid pointer */
19927 + if (page != NULL)
19928 + reiser4_drop_page(page);
19929 + } else {
19930 + /* busy check failed: reference was acquired by concurrent
19931 + * thread. */
19932 + JF_CLR(node, JNODE_RIP);
19933 + write_unlock_tree(tree);
19934 + spin_unlock_jnode(node);
19935 + if (page != NULL)
19936 + unlock_page(page);
19937 + }
19938 + return result;
19939 +}
19940 +
19941 +/* drop jnode on the floor.
19942 +
19943 + Return value:
19944 +
19945 + -EBUSY: failed to drop jnode, because there are still references to it
19946 +
19947 + 0: successfully dropped jnode
19948 +
19949 +*/
19950 +static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
19951 +{
19952 + struct page *page;
19953 + jnode_type jtype;
19954 + int result;
19955 +
19956 + assert("zam-602", node != NULL);
19957 + assert_rw_not_read_locked(&(tree->tree_lock));
19958 + assert_rw_not_write_locked(&(tree->tree_lock));
19959 + assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19960 +
19961 + jtype = jnode_get_type(node);
19962 +
19963 + page = jnode_lock_page(node);
19964 + assert_spin_locked(&(node->guard));
19965 +
19966 + write_lock_tree(tree);
19967 +
19968 + /* re-check ->x_count under tree lock. */
19969 + result = jnode_is_busy(node, jtype);
19970 + if (!result) {
19971 + assert("nikita-2488", page == jnode_page(node));
19972 + assert("nikita-2533", atomic_read(&node->d_count) == 0);
19973 + if (page != NULL) {
19974 + assert("nikita-2126", !PageDirty(page));
19975 + assert("nikita-2127", PageUptodate(page));
19976 + assert("nikita-2181", PageLocked(page));
19977 + page_clear_jnode(page, node);
19978 + }
19979 + spin_unlock_jnode(node);
19980 + jnode_remove(node, jtype, tree);
19981 + write_unlock_tree(tree);
19982 + jnode_free(node, jtype);
19983 + if (page != NULL) {
19984 + reiser4_drop_page(page);
19985 + }
19986 + } else {
19987 + /* busy check failed: reference was acquired by concurrent
19988 + * thread. */
19989 + JF_CLR(node, JNODE_RIP);
19990 + write_unlock_tree(tree);
19991 + spin_unlock_jnode(node);
19992 + if (page != NULL)
19993 + unlock_page(page);
19994 + }
19995 + return result;
19996 +}
19997 +
19998 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
19999 + be 0 (where applicable). */
20000 +void jdrop(jnode * node)
20001 +{
20002 + jdrop_in_tree(node, jnode_get_tree(node));
20003 +}
20004 +
20005 +/* IO head jnode implementation; The io heads are simple j-nodes with limited
20006 + functionality (these j-nodes are not in any hash table) just for reading
20007 + from and writing to disk. */
20008 +
20009 +jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20010 +{
20011 + jnode *jal = jalloc();
20012 +
20013 + if (jal != NULL) {
20014 + jnode_init(jal, current_tree, JNODE_IO_HEAD);
20015 + jnode_set_block(jal, block);
20016 + }
20017 +
20018 + jref(jal);
20019 +
20020 + return jal;
20021 +}
20022 +
20023 +void reiser4_drop_io_head(jnode * node)
20024 +{
20025 + assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20026 +
20027 + jput(node);
20028 + jdrop(node);
20029 +}
20030 +
20031 +/* protect keep jnode data from reiser4_releasepage() */
20032 +void pin_jnode_data(jnode * node)
20033 +{
20034 + assert("zam-671", jnode_page(node) != NULL);
20035 + page_cache_get(jnode_page(node));
20036 +}
20037 +
20038 +/* make jnode data free-able again */
20039 +void unpin_jnode_data(jnode * node)
20040 +{
20041 + assert("zam-672", jnode_page(node) != NULL);
20042 + page_cache_release(jnode_page(node));
20043 +}
20044 +
20045 +struct address_space *jnode_get_mapping(const jnode * node)
20046 +{
20047 + assert("nikita-3162", node != NULL);
20048 + return jnode_ops(node)->mapping(node);
20049 +}
20050 +
20051 +#if REISER4_DEBUG
20052 +/* debugging aid: jnode invariant */
20053 +int jnode_invariant_f(const jnode * node, char const **msg)
20054 +{
20055 +#define _ergo(ant, con) \
20056 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20057 +#define _check(exp) ((*msg) = #exp, (exp))
20058 +
20059 + return _check(node != NULL) &&
20060 + /* [jnode-queued] */
20061 + /* only relocated node can be queued, except that when znode
20062 + * is being deleted, its JNODE_RELOC bit is cleared */
20063 + _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20064 + JF_ISSET(node, JNODE_RELOC) ||
20065 + JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20066 + _check(node->jnodes.prev != NULL) &&
20067 + _check(node->jnodes.next != NULL) &&
20068 + /* [jnode-dirty] invariant */
20069 + /* dirty inode is part of atom */
20070 + _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20071 + /* [jnode-oid] invariant */
20072 + /* for unformatted node ->objectid and ->mapping fields are
20073 + * consistent */
20074 + _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20075 + node->key.j.objectid ==
20076 + get_inode_oid(node->key.j.mapping->host)) &&
20077 + /* [jnode-atom-valid] invariant */
20078 + /* node atom has valid state */
20079 + _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20080 + /* [jnode-page-binding] invariant */
20081 + /* if node points to page, it points back to node */
20082 + _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20083 + /* [jnode-refs] invariant */
20084 + /* only referenced jnode can be loaded */
20085 + _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20086 +
20087 +}
20088 +
20089 +static const char *jnode_type_name(jnode_type type)
20090 +{
20091 + switch (type) {
20092 + case JNODE_UNFORMATTED_BLOCK:
20093 + return "unformatted";
20094 + case JNODE_FORMATTED_BLOCK:
20095 + return "formatted";
20096 + case JNODE_BITMAP:
20097 + return "bitmap";
20098 + case JNODE_IO_HEAD:
20099 + return "io head";
20100 + case JNODE_INODE:
20101 + return "inode";
20102 + case LAST_JNODE_TYPE:
20103 + return "last";
20104 + default:{
20105 + static char unknown[30];
20106 +
20107 + sprintf(unknown, "unknown %i", type);
20108 + return unknown;
20109 + }
20110 + }
20111 +}
20112 +
20113 +#define jnode_state_name( node, flag ) \
20114 + ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20115 +
20116 +/* debugging aid: output human readable information about @node */
20117 +static void info_jnode(const char *prefix /* prefix to print */ ,
20118 + const jnode * node /* node to print */ )
20119 +{
20120 + assert("umka-068", prefix != NULL);
20121 +
20122 + if (node == NULL) {
20123 + printk("%s: null\n", prefix);
20124 + return;
20125 + }
20126 +
20127 + printk
20128 + ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20129 + " block: %s, d_count: %d, x_count: %d, "
20130 + "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20131 + node->state,
20132 + jnode_state_name(node, JNODE_PARSED),
20133 + jnode_state_name(node, JNODE_HEARD_BANSHEE),
20134 + jnode_state_name(node, JNODE_LEFT_CONNECTED),
20135 + jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20136 + jnode_state_name(node, JNODE_ORPHAN),
20137 + jnode_state_name(node, JNODE_CREATED),
20138 + jnode_state_name(node, JNODE_RELOC),
20139 + jnode_state_name(node, JNODE_OVRWR),
20140 + jnode_state_name(node, JNODE_DIRTY),
20141 + jnode_state_name(node, JNODE_IS_DYING),
20142 + jnode_state_name(node, JNODE_RIP),
20143 + jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20144 + jnode_state_name(node, JNODE_WRITEBACK),
20145 + jnode_state_name(node, JNODE_NEW),
20146 + jnode_state_name(node, JNODE_DKSET),
20147 + jnode_state_name(node, JNODE_REPACK),
20148 + jnode_state_name(node, JNODE_CLUSTER_PAGE),
20149 + jnode_get_level(node), sprint_address(jnode_get_block(node)),
20150 + atomic_read(&node->d_count), atomic_read(&node->x_count),
20151 + jnode_page(node), node->atom, 0, 0,
20152 + jnode_type_name(jnode_get_type(node)));
20153 + if (jnode_is_unformatted(node)) {
20154 + printk("inode: %llu, index: %lu, ",
20155 + node->key.j.objectid, node->key.j.index);
20156 + }
20157 +}
20158 +
20159 +/* debugging aid: check znode invariant and panic if it doesn't hold */
20160 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20161 +{
20162 + char const *failed_msg;
20163 + int result;
20164 + reiser4_tree *tree;
20165 +
20166 + tree = jnode_get_tree(node);
20167 +
20168 + assert("umka-063312", node != NULL);
20169 + assert("umka-064321", tree != NULL);
20170 +
20171 + if (!jlocked && !tlocked)
20172 + spin_lock_jnode((jnode *) node);
20173 + if (!tlocked)
20174 + read_lock_tree(jnode_get_tree(node));
20175 + result = jnode_invariant_f(node, &failed_msg);
20176 + if (!result) {
20177 + info_jnode("corrupted node", node);
20178 + warning("jmacd-555", "Condition %s failed", failed_msg);
20179 + }
20180 + if (!tlocked)
20181 + read_unlock_tree(jnode_get_tree(node));
20182 + if (!jlocked && !tlocked)
20183 + spin_unlock_jnode((jnode *) node);
20184 + return result;
20185 +}
20186 +
20187 +#endif /* REISER4_DEBUG */
20188 +
20189 +/* Make Linus happy.
20190 + Local variables:
20191 + c-indentation-style: "K&R"
20192 + mode-name: "LC"
20193 + c-basic-offset: 8
20194 + tab-width: 8
20195 + fill-column: 80
20196 + End:
20197 +*/
20198 diff -urN linux-2.6.20.orig/fs/reiser4/jnode.h linux-2.6.20/fs/reiser4/jnode.h
20199 --- linux-2.6.20.orig/fs/reiser4/jnode.h 1970-01-01 03:00:00.000000000 +0300
20200 +++ linux-2.6.20/fs/reiser4/jnode.h 2007-05-06 14:50:43.734986973 +0400
20201 @@ -0,0 +1,705 @@
20202 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20203 + * reiser4/README */
20204 +
20205 +/* Declaration of jnode. See jnode.c for details. */
20206 +
20207 +#ifndef __JNODE_H__
20208 +#define __JNODE_H__
20209 +
20210 +#include "forward.h"
20211 +#include "type_safe_hash.h"
20212 +#include "txnmgr.h"
20213 +#include "key.h"
20214 +#include "debug.h"
20215 +#include "dformat.h"
20216 +#include "page_cache.h"
20217 +#include "context.h"
20218 +
20219 +#include "plugin/plugin.h"
20220 +
20221 +#include <linux/fs.h>
20222 +#include <linux/mm.h>
20223 +#include <linux/spinlock.h>
20224 +#include <asm/atomic.h>
20225 +#include <asm/bitops.h>
20226 +#include <linux/list.h>
20227 +#include <linux/rcupdate.h>
20228 +
20229 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
20230 + nodes) */
20231 +TYPE_SAFE_HASH_DECLARE(j, jnode);
20232 +
20233 +/* declare hash table of znodes */
20234 +TYPE_SAFE_HASH_DECLARE(z, znode);
20235 +
20236 +typedef struct {
20237 + __u64 objectid;
20238 + unsigned long index;
20239 + struct address_space *mapping;
20240 +} jnode_key_t;
20241 +
20242 +/*
20243 + Jnode is the "base class" of other nodes in reiser4. It is also happens to
20244 + be exactly the node we use for unformatted tree nodes.
20245 +
20246 + Jnode provides following basic functionality:
20247 +
20248 + . reference counting and indexing.
20249 +
20250 + . integration with page cache. Jnode has ->pg reference to which page can
20251 + be attached.
20252 +
20253 + . interface to transaction manager. It is jnode that is kept in transaction
20254 + manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20255 + means, there should be special type of jnode for inode.)
20256 +
20257 + Locking:
20258 +
20259 + Spin lock: the following fields are protected by the per-jnode spin lock:
20260 +
20261 + ->state
20262 + ->atom
20263 + ->capture_link
20264 +
20265 + Following fields are protected by the global tree lock:
20266 +
20267 + ->link
20268 + ->key.z (content of ->key.z is only changed in znode_rehash())
20269 + ->key.j
20270 +
20271 + Atomic counters
20272 +
20273 + ->x_count
20274 + ->d_count
20275 +
20276 + ->pg, and ->data are protected by spin lock for unused jnode and are
20277 + immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20278 + is false).
20279 +
20280 + ->tree is immutable after creation
20281 +
20282 + Unclear
20283 +
20284 + ->blocknr: should be under jnode spin-lock, but current interface is based
20285 + on passing of block address.
20286 +
20287 + If you ever need to spin lock two nodes at once, do this in "natural"
20288 + memory order: lock znode with lower address first. (See lock_two_nodes().)
20289 +
20290 + Invariants involving this data-type:
20291 +
20292 + [jnode-dirty]
20293 + [jnode-refs]
20294 + [jnode-oid]
20295 + [jnode-queued]
20296 + [jnode-atom-valid]
20297 + [jnode-page-binding]
20298 +*/
20299 +
20300 +struct jnode {
20301 +#if REISER4_DEBUG
20302 +#define JMAGIC 0x52654973 /* "ReIs" */
20303 + int magic;
20304 +#endif
20305 + /* FIRST CACHE LINE (16 bytes): data used by jload */
20306 +
20307 + /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20308 + /* 0 */ unsigned long state;
20309 +
20310 + /* lock, protecting jnode's fields. */
20311 + /* 4 */ spinlock_t load;
20312 +
20313 + /* counter of references to jnode itself. Increased on jref().
20314 + Decreased on jput().
20315 + */
20316 + /* 8 */ atomic_t x_count;
20317 +
20318 + /* counter of references to jnode's data. Pin data page(s) in
20319 + memory while this is greater than 0. Increased on jload().
20320 + Decreased on jrelse().
20321 + */
20322 + /* 12 */ atomic_t d_count;
20323 +
20324 + /* SECOND CACHE LINE: data used by hash table lookups */
20325 +
20326 + /* 16 */ union {
20327 + /* znodes are hashed by block number */
20328 + reiser4_block_nr z;
20329 + /* unformatted nodes are hashed by mapping plus offset */
20330 + jnode_key_t j;
20331 + } key;
20332 +
20333 + /* THIRD CACHE LINE */
20334 +
20335 + /* 32 */ union {
20336 + /* pointers to maintain hash-table */
20337 + z_hash_link z;
20338 + j_hash_link j;
20339 + } link;
20340 +
20341 + /* pointer to jnode page. */
20342 + /* 36 */ struct page *pg;
20343 + /* pointer to node itself. This is page_address(node->pg) when page is
20344 + attached to the jnode
20345 + */
20346 + /* 40 */ void *data;
20347 +
20348 + /* 44 */ reiser4_tree *tree;
20349 +
20350 + /* FOURTH CACHE LINE: atom related fields */
20351 +
20352 + /* 48 */ spinlock_t guard;
20353 +
20354 + /* atom the block is in, if any */
20355 + /* 52 */ txn_atom *atom;
20356 +
20357 + /* capture list */
20358 + /* 56 */ struct list_head capture_link;
20359 +
20360 + /* FIFTH CACHE LINE */
20361 +
20362 + /* 64 */ struct rcu_head rcu;
20363 + /* crosses cache line */
20364 +
20365 + /* SIXTH CACHE LINE */
20366 +
20367 + /* the real blocknr (where io is going to/from) */
20368 + /* 80 */ reiser4_block_nr blocknr;
20369 + /* Parent item type, unformatted and CRC need it for offset => key conversion. */
20370 + /* NOTE: this parent_item_id looks like jnode type. */
20371 + /* 88 */ reiser4_plugin_id parent_item_id;
20372 + /* 92 */
20373 +#if REISER4_DEBUG
20374 + /* number of pages referenced by the jnode (meaningful while capturing of
20375 + page clusters) */
20376 + int page_count;
20377 + /* list of all jnodes for debugging purposes. */
20378 + struct list_head jnodes;
20379 + /* how many times this jnode was written in one transaction */
20380 + int written;
20381 + /* this indicates which atom's list the jnode is on */
20382 + atom_list list;
20383 +#endif
20384 +} __attribute__ ((aligned(16)));
20385 +
20386 +/*
20387 + * jnode types. Enumeration of existing jnode types.
20388 + */
20389 +typedef enum {
20390 + JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20391 + JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20392 + JNODE_BITMAP, /* bitmap */
20393 + JNODE_IO_HEAD, /* jnode representing a block in the
20394 + * wandering log */
20395 + JNODE_INODE, /* jnode embedded into inode */
20396 + LAST_JNODE_TYPE
20397 +} jnode_type;
20398 +
20399 +/* jnode states */
20400 +typedef enum {
20401 + /* jnode's page is loaded and data checked */
20402 + JNODE_PARSED = 0,
20403 + /* node was deleted, not all locks on it were released. This
20404 + node is empty and is going to be removed from the tree
20405 + shortly. */
20406 + JNODE_HEARD_BANSHEE = 1,
20407 + /* left sibling pointer is valid */
20408 + JNODE_LEFT_CONNECTED = 2,
20409 + /* right sibling pointer is valid */
20410 + JNODE_RIGHT_CONNECTED = 3,
20411 +
20412 + /* znode was just created and doesn't yet have a pointer from
20413 + its parent */
20414 + JNODE_ORPHAN = 4,
20415 +
20416 + /* this node was created by its transaction and has not been assigned
20417 + a block address. */
20418 + JNODE_CREATED = 5,
20419 +
20420 + /* this node is currently relocated */
20421 + JNODE_RELOC = 6,
20422 + /* this node is currently wandered */
20423 + JNODE_OVRWR = 7,
20424 +
20425 + /* this znode has been modified */
20426 + JNODE_DIRTY = 8,
20427 +
20428 + /* znode lock is being invalidated */
20429 + JNODE_IS_DYING = 9,
20430 +
20431 + /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20432 +
20433 + /* jnode is queued for flushing. */
20434 + JNODE_FLUSH_QUEUED = 12,
20435 +
20436 + /* In the following bits jnode type is encoded. */
20437 + JNODE_TYPE_1 = 13,
20438 + JNODE_TYPE_2 = 14,
20439 + JNODE_TYPE_3 = 15,
20440 +
20441 + /* jnode is being destroyed */
20442 + JNODE_RIP = 16,
20443 +
20444 + /* znode was not captured during locking (it might so be because
20445 + ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20446 + JNODE_MISSED_IN_CAPTURE = 17,
20447 +
20448 + /* write is in progress */
20449 + JNODE_WRITEBACK = 18,
20450 +
20451 + /* FIXME: now it is used by crypto-compress plugin only */
20452 + JNODE_NEW = 19,
20453 +
20454 + /* delimiting keys are already set for this znode. */
20455 + JNODE_DKSET = 20,
20456 +
20457 + /* when this bit is set page and jnode can not be disconnected */
20458 + JNODE_WRITE_PREPARED = 21,
20459 +
20460 + JNODE_CLUSTER_PAGE = 22,
20461 + /* Jnode is marked for repacking, that means the reiser4 flush and the
20462 + * block allocator should process this node special way */
20463 + JNODE_REPACK = 23,
20464 + /* node should be converted by flush in squalloc phase */
20465 + JNODE_CONVERTIBLE = 24,
20466 + /*
20467 + * When jnode is dirtied for the first time in given transaction,
20468 + * do_jnode_make_dirty() checks whether this jnode can possible became
20469 + * member of overwrite set. If so, this bit is set, and one block is
20470 + * reserved in the ->flush_reserved space of atom.
20471 + *
20472 + * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20473 + *
20474 + * (1) flush decides that we want this block to go into relocate
20475 + * set after all.
20476 + *
20477 + * (2) wandering log is allocated (by log writer)
20478 + *
20479 + * (3) extent is allocated
20480 + *
20481 + */
20482 + JNODE_FLUSH_RESERVED = 29
20483 +} reiser4_jnode_state;
20484 +
20485 +/* Macros for accessing the jnode state. */
20486 +
20487 +static inline void JF_CLR(jnode * j, int f)
20488 +{
20489 + assert("unknown-1", j->magic == JMAGIC);
20490 + clear_bit(f, &j->state);
20491 +}
20492 +static inline int JF_ISSET(const jnode * j, int f)
20493 +{
20494 + assert("unknown-2", j->magic == JMAGIC);
20495 + return test_bit(f, &((jnode *) j)->state);
20496 +}
20497 +static inline void JF_SET(jnode * j, int f)
20498 +{
20499 + assert("unknown-3", j->magic == JMAGIC);
20500 + set_bit(f, &j->state);
20501 +}
20502 +
20503 +static inline int JF_TEST_AND_SET(jnode * j, int f)
20504 +{
20505 + assert("unknown-4", j->magic == JMAGIC);
20506 + return test_and_set_bit(f, &j->state);
20507 +}
20508 +
20509 +static inline void spin_lock_jnode(jnode *node)
20510 +{
20511 + /* check that spinlocks of lower priorities are not held */
20512 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20513 + LOCK_CNT_NIL(spin_locked_txnh) &&
20514 + LOCK_CNT_NIL(spin_locked_zlock) &&
20515 + LOCK_CNT_NIL(rw_locked_dk) &&
20516 + LOCK_CNT_LT(spin_locked_jnode, 2)));
20517 +
20518 + spin_lock(&(node->guard));
20519 +
20520 + LOCK_CNT_INC(spin_locked_jnode);
20521 + LOCK_CNT_INC(spin_locked);
20522 +}
20523 +
20524 +static inline void spin_unlock_jnode(jnode *node)
20525 +{
20526 + assert_spin_locked(&(node->guard));
20527 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20528 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20529 +
20530 + LOCK_CNT_DEC(spin_locked_jnode);
20531 + LOCK_CNT_DEC(spin_locked);
20532 +
20533 + spin_unlock(&(node->guard));
20534 +}
20535 +
20536 +static inline int jnode_is_in_deleteset(const jnode * node)
20537 +{
20538 + return JF_ISSET(node, JNODE_RELOC);
20539 +}
20540 +
20541 +extern int init_jnodes(void);
20542 +extern void done_jnodes(void);
20543 +
20544 +/* Jnode routines */
20545 +extern jnode *jalloc(void);
20546 +extern void jfree(jnode * node) NONNULL;
20547 +extern jnode *jclone(jnode *);
20548 +extern jnode *jlookup(reiser4_tree * tree,
20549 + oid_t objectid, unsigned long ind) NONNULL;
20550 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20551 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
20552 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
20553 +void jnode_attach_page(jnode * node, struct page *pg);
20554 +
20555 +void unhash_unformatted_jnode(jnode *);
20556 +extern jnode *page_next_jnode(jnode * node) NONNULL;
20557 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20558 +extern void jnode_make_dirty(jnode * node) NONNULL;
20559 +extern void jnode_make_clean(jnode * node) NONNULL;
20560 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20561 +extern void jnode_make_wander(jnode *) NONNULL;
20562 +extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
20563 +extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20564 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20565 +
20566 +/**
20567 + * jnode_get_block
20568 + * @node: jnode to query
20569 + *
20570 + */
20571 +static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20572 +{
20573 + assert("nikita-528", node != NULL);
20574 +
20575 + return &node->blocknr;
20576 +}
20577 +
20578 +/**
20579 + * jnode_set_block
20580 + * @node: jnode to update
20581 + * @blocknr: new block nr
20582 + */
20583 +static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20584 +{
20585 + assert("nikita-2020", node != NULL);
20586 + assert("umka-055", blocknr != NULL);
20587 + node->blocknr = *blocknr;
20588 +}
20589 +
20590 +
20591 +/* block number for IO. Usually this is the same as jnode_get_block(), unless
20592 + * jnode was emergency flushed---then block number chosen by eflush is
20593 + * used. */
20594 +static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20595 +{
20596 + assert("nikita-2768", node != NULL);
20597 + assert_spin_locked(&(node->guard));
20598 +
20599 + return jnode_get_block(node);
20600 +}
20601 +
20602 +/* Jnode flush interface. */
20603 +extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos);
20604 +extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos);
20605 +
20606 +/* FIXME-VS: these are used in plugin/item/extent.c */
20607 +
20608 +/* does extent_get_block have to be called */
20609 +#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
20610 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20611 +
20612 +/* the node should be converted during flush squalloc phase */
20613 +#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
20614 +#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
20615 +
20616 +/* Macros to convert from jnode to znode, znode to jnode. These are macros
20617 + because C doesn't allow overloading of const prototypes. */
20618 +#define ZJNODE(x) (& (x) -> zjnode)
20619 +#define JZNODE(x) \
20620 +({ \
20621 + typeof (x) __tmp_x; \
20622 + \
20623 + __tmp_x = (x); \
20624 + assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
20625 + (znode*) __tmp_x; \
20626 +})
20627 +
20628 +extern int jnodes_tree_init(reiser4_tree * tree);
20629 +extern int jnodes_tree_done(reiser4_tree * tree);
20630 +
20631 +#if REISER4_DEBUG
20632 +
20633 +extern int znode_is_any_locked(const znode * node);
20634 +extern void jnode_list_remove(jnode * node);
20635 +
20636 +#else
20637 +
20638 +#define jnode_list_remove(node) noop
20639 +
20640 +#endif
20641 +
20642 +int znode_is_root(const znode * node) NONNULL;
20643 +
20644 +/* bump reference counter on @node */
20645 +static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
20646 +{
20647 + assert("nikita-1911", node != NULL);
20648 +
20649 + atomic_inc(&node->x_count);
20650 + LOCK_CNT_INC(x_refs);
20651 +}
20652 +
20653 +static inline void dec_x_ref(jnode * node)
20654 +{
20655 + assert("nikita-3215", node != NULL);
20656 + assert("nikita-3216", atomic_read(&node->x_count) > 0);
20657 +
20658 + atomic_dec(&node->x_count);
20659 + assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20660 + LOCK_CNT_DEC(x_refs);
20661 +}
20662 +
20663 +/* jref() - increase counter of references to jnode/znode (x_count) */
20664 +static inline jnode *jref(jnode * node)
20665 +{
20666 + assert("jmacd-508", (node != NULL) && !IS_ERR(node));
20667 + add_x_ref(node);
20668 + return node;
20669 +}
20670 +
20671 +/* get the page of jnode */
20672 +static inline struct page *jnode_page(const jnode * node)
20673 +{
20674 + return node->pg;
20675 +}
20676 +
20677 +/* return pointer to jnode data */
20678 +static inline char *jdata(const jnode * node)
20679 +{
20680 + assert("nikita-1415", node != NULL);
20681 + assert("nikita-3198", jnode_page(node) != NULL);
20682 + return node->data;
20683 +}
20684 +
20685 +static inline int jnode_is_loaded(const jnode * node)
20686 +{
20687 + assert("zam-506", node != NULL);
20688 + return atomic_read(&node->d_count) > 0;
20689 +}
20690 +
20691 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
20692 +
20693 +static inline void jnode_set_reloc(jnode * node)
20694 +{
20695 + assert("nikita-2431", node != NULL);
20696 + assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
20697 + JF_SET(node, JNODE_RELOC);
20698 +}
20699 +
20700 +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
20701 +
20702 +extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
20703 +
20704 +static inline int jload(jnode *node)
20705 +{
20706 + return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
20707 +}
20708 +
20709 +extern int jinit_new(jnode *, gfp_t) NONNULL;
20710 +extern int jstartio(jnode *) NONNULL;
20711 +
20712 +extern void jdrop(jnode *) NONNULL;
20713 +extern int jwait_io(jnode *, int rw) NONNULL;
20714 +
20715 +void jload_prefetch(jnode *);
20716 +
20717 +extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
20718 +extern void reiser4_drop_io_head(jnode * node) NONNULL;
20719 +
20720 +static inline reiser4_tree *jnode_get_tree(const jnode * node)
20721 +{
20722 + assert("nikita-2691", node != NULL);
20723 + return node->tree;
20724 +}
20725 +
20726 +extern void pin_jnode_data(jnode *);
20727 +extern void unpin_jnode_data(jnode *);
20728 +
20729 +static inline jnode_type jnode_get_type(const jnode * node)
20730 +{
20731 + static const unsigned long state_mask =
20732 + (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
20733 +
20734 + static jnode_type mask_to_type[] = {
20735 + /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
20736 +
20737 + /* 000 */
20738 + [0] = JNODE_FORMATTED_BLOCK,
20739 + /* 001 */
20740 + [1] = JNODE_UNFORMATTED_BLOCK,
20741 + /* 010 */
20742 + [2] = JNODE_BITMAP,
20743 + /* 011 */
20744 + [3] = LAST_JNODE_TYPE, /*invalid */
20745 + /* 100 */
20746 + [4] = JNODE_INODE,
20747 + /* 101 */
20748 + [5] = LAST_JNODE_TYPE,
20749 + /* 110 */
20750 + [6] = JNODE_IO_HEAD,
20751 + /* 111 */
20752 + [7] = LAST_JNODE_TYPE, /* invalid */
20753 + };
20754 +
20755 + return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
20756 +}
20757 +
20758 +/* returns true if node is a znode */
20759 +static inline int jnode_is_znode(const jnode * node)
20760 +{
20761 + return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
20762 +}
20763 +
20764 +static inline int jnode_is_flushprepped(jnode * node)
20765 +{
20766 + assert("jmacd-78212", node != NULL);
20767 + assert_spin_locked(&(node->guard));
20768 + return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
20769 + JF_ISSET(node, JNODE_OVRWR);
20770 +}
20771 +
20772 +/* Return true if @node has already been processed by the squeeze and allocate
20773 + process. This implies the block address has been finalized for the
20774 + duration of this atom (or it is clean and will remain in place). If this
20775 + returns true you may use the block number as a hint. */
20776 +static inline int jnode_check_flushprepped(jnode * node)
20777 +{
20778 + int result;
20779 +
20780 + /* It must be clean or relocated or wandered. New allocations are set to relocate. */
20781 + spin_lock_jnode(node);
20782 + result = jnode_is_flushprepped(node);
20783 + spin_unlock_jnode(node);
20784 + return result;
20785 +}
20786 +
20787 +/* returns true if node is unformatted */
20788 +static inline int jnode_is_unformatted(const jnode * node)
20789 +{
20790 + assert("jmacd-0123", node != NULL);
20791 + return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
20792 +}
20793 +
20794 +/* returns true if node represents a cluster cache page */
20795 +static inline int jnode_is_cluster_page(const jnode * node)
20796 +{
20797 + assert("edward-50", node != NULL);
20798 + return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
20799 +}
20800 +
20801 +/* returns true is node is builtin inode's jnode */
20802 +static inline int jnode_is_inode(const jnode * node)
20803 +{
20804 + assert("vs-1240", node != NULL);
20805 + return jnode_get_type(node) == JNODE_INODE;
20806 +}
20807 +
20808 +static inline jnode_plugin *jnode_ops_of(const jnode_type type)
20809 +{
20810 + assert("nikita-2367", type < LAST_JNODE_TYPE);
20811 + return jnode_plugin_by_id((reiser4_plugin_id) type);
20812 +}
20813 +
20814 +static inline jnode_plugin *jnode_ops(const jnode * node)
20815 +{
20816 + assert("nikita-2366", node != NULL);
20817 +
20818 + return jnode_ops_of(jnode_get_type(node));
20819 +}
20820 +
20821 +/* Get the index of a block. */
20822 +static inline unsigned long jnode_get_index(jnode * node)
20823 +{
20824 + return jnode_ops(node)->index(node);
20825 +}
20826 +
20827 +/* return true if "node" is the root */
20828 +static inline int jnode_is_root(const jnode * node)
20829 +{
20830 + return jnode_is_znode(node) && znode_is_root(JZNODE(node));
20831 +}
20832 +
20833 +extern struct address_space *mapping_jnode(const jnode * node);
20834 +extern unsigned long index_jnode(const jnode * node);
20835 +
20836 +static inline void jput(jnode * node);
20837 +extern void jput_final(jnode * node);
20838 +
20839 +/* bump data counter on @node */
20840 +static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
20841 +{
20842 + assert("nikita-1962", node != NULL);
20843 +
20844 + atomic_inc(&node->d_count);
20845 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
20846 + LOCK_CNT_INC(d_refs);
20847 +}
20848 +
20849 +/* jput() - decrement x_count reference counter on znode.
20850 +
20851 + Count may drop to 0, jnode stays in cache until memory pressure causes the
20852 + eviction of its page. The c_count variable also ensures that children are
20853 + pressured out of memory before the parent. The jnode remains hashed as
20854 + long as the VM allows its page to stay in memory.
20855 +*/
20856 +static inline void jput(jnode * node)
20857 +{
20858 + assert("jmacd-509", node != NULL);
20859 + assert("jmacd-510", atomic_read(&node->x_count) > 0);
20860 + assert("zam-926", reiser4_schedulable());
20861 + LOCK_CNT_DEC(x_refs);
20862 +
20863 + rcu_read_lock();
20864 + /*
20865 + * we don't need any kind of lock here--jput_final() uses RCU.
20866 + */
20867 + if (unlikely(atomic_dec_and_test(&node->x_count))) {
20868 + jput_final(node);
20869 + } else
20870 + rcu_read_unlock();
20871 + assert("nikita-3473", reiser4_schedulable());
20872 +}
20873 +
20874 +extern void jrelse(jnode * node);
20875 +extern void jrelse_tail(jnode * node);
20876 +
20877 +extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
20878 +
20879 +/* resolve race with jput */
20880 +static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
20881 +{
20882 + if (unlikely(JF_ISSET(node, JNODE_RIP)))
20883 + node = jnode_rip_sync(tree, node);
20884 + return node;
20885 +}
20886 +
20887 +extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
20888 +
20889 +#if REISER4_DEBUG
20890 +extern int jnode_invariant_f(const jnode *node, char const **msg);
20891 +#endif
20892 +
20893 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
20894 +
20895 +/* __JNODE_H__ */
20896 +#endif
20897 +
20898 +/* Make Linus happy.
20899 + Local variables:
20900 + c-indentation-style: "K&R"
20901 + mode-name: "LC"
20902 + c-basic-offset: 8
20903 + tab-width: 8
20904 + fill-column: 120
20905 + End:
20906 +*/
20907 diff -urN linux-2.6.20.orig/fs/reiser4/kassign.c linux-2.6.20/fs/reiser4/kassign.c
20908 --- linux-2.6.20.orig/fs/reiser4/kassign.c 1970-01-01 03:00:00.000000000 +0300
20909 +++ linux-2.6.20/fs/reiser4/kassign.c 2007-05-06 14:50:43.734986973 +0400
20910 @@ -0,0 +1,661 @@
20911 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20912 + * reiser4/README */
20913 +
20914 +/* Key assignment policy implementation */
20915 +
20916 +/*
20917 + * In reiser4 every piece of file system data and meta-data has a key. Keys
20918 + * are used to store information in and retrieve it from reiser4 internal
20919 + * tree. In addition to this, keys define _ordering_ of all file system
20920 + * information: things having close keys are placed into the same or
20921 + * neighboring (in the tree order) nodes of the tree. As our block allocator
20922 + * tries to respect tree order (see flush.c), keys also define order in which
20923 + * things are laid out on the disk, and hence, affect performance directly.
20924 + *
20925 + * Obviously, assignment of keys to data and meta-data should be consistent
20926 + * across whole file system. Algorithm that calculates a key for a given piece
20927 + * of data or meta-data is referred to as "key assignment".
20928 + *
20929 + * Key assignment is too expensive to be implemented as a plugin (that is,
20930 + * with an ability to support different key assignment schemas in the same
20931 + * compiled kernel image). As a compromise, all key-assignment functions and
20932 + * data-structures are collected in this single file, so that modifications to
20933 + * key assignment algorithm can be localized. Additional changes may be
20934 + * required in key.[ch].
20935 + *
20936 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
20937 + * may guess, there is "Plan B" too.
20938 + *
20939 + */
20940 +
20941 +/*
20942 + * Additional complication with key assignment implementation is a requirement
20943 + * to support different key length.
20944 + */
20945 +
20946 +/*
20947 + * KEY ASSIGNMENT: PLAN A, LONG KEYS.
20948 + *
20949 + * DIRECTORY ITEMS
20950 + *
20951 + * | 60 | 4 | 7 |1| 56 | 64 | 64 |
20952 + * +--------------+---+---+-+-------------+------------------+-----------------+
20953 + * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
20954 + * +--------------+---+---+-+-------------+------------------+-----------------+
20955 + * | | | | |
20956 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
20957 + *
20958 + * dirid objectid of directory this item is for
20959 + *
20960 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
20961 + *
20962 + * H 1 if last 8 bytes of the key contain hash,
20963 + * 0 if last 8 bytes of the key contain prefix-3
20964 + *
20965 + * prefix-1 first 7 characters of file name.
20966 + * Padded by zeroes if name is not long enough.
20967 + *
20968 + * prefix-2 next 8 characters of the file name.
20969 + *
20970 + * prefix-3 next 8 characters of the file name.
20971 + *
20972 + * hash hash of the rest of file name (i.e., portion of file
20973 + * name not included into prefix-1 and prefix-2).
20974 + *
20975 + * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
20976 + * in the key. Such file names are called "short". They are distinguished by H
20977 + * bit set 0 in the key.
20978 + *
20979 + * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
20980 + * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
20981 + * key. Last 8 bytes of the key are occupied by hash of the remaining
20982 + * characters of the name.
20983 + *
20984 + * This key assignment reaches following important goals:
20985 + *
20986 + * (1) directory entries are sorted in approximately lexicographical
20987 + * order.
20988 + *
20989 + * (2) collisions (when multiple directory items have the same key), while
20990 + * principally unavoidable in a tree with fixed length keys, are rare.
20991 + *
20992 + * STAT DATA
20993 + *
20994 + * | 60 | 4 | 64 | 4 | 60 | 64 |
20995 + * +--------------+---+-----------------+---+--------------+-----------------+
20996 + * | locality id | 1 | ordering | 0 | objectid | 0 |
20997 + * +--------------+---+-----------------+---+--------------+-----------------+
20998 + * | | | | |
20999 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21000 + *
21001 + * locality id object id of a directory where first name was created for
21002 + * the object
21003 + *
21004 + * ordering copy of second 8-byte portion of the key of directory
21005 + * entry for the first name of this object. Ordering has a form
21006 + * {
21007 + * fibration :7;
21008 + * h :1;
21009 + * prefix1 :56;
21010 + * }
21011 + * see description of key for directory entry above.
21012 + *
21013 + * objectid object id for this object
21014 + *
21015 + * This key assignment policy is designed to keep stat-data in the same order
21016 + * as corresponding directory items, thus speeding up readdir/stat types of
21017 + * workload.
21018 + *
21019 + * FILE BODY
21020 + *
21021 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21022 + * +--------------+---+-----------------+---+--------------+-----------------+
21023 + * | locality id | 4 | ordering | 0 | objectid | offset |
21024 + * +--------------+---+-----------------+---+--------------+-----------------+
21025 + * | | | | |
21026 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21027 + *
21028 + * locality id object id of a directory where first name was created for
21029 + * the object
21030 + *
21031 + * ordering the same as in the key of stat-data for this object
21032 + *
21033 + * objectid object id for this object
21034 + *
21035 + * offset logical offset from the beginning of this file.
21036 + * Measured in bytes.
21037 + *
21038 + *
21039 + * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21040 + *
21041 + * DIRECTORY ITEMS
21042 + *
21043 + * | 60 | 4 | 7 |1| 56 | 64 |
21044 + * +--------------+---+---+-+-------------+-----------------+
21045 + * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21046 + * +--------------+---+---+-+-------------+-----------------+
21047 + * | | | |
21048 + * | 8 bytes | 8 bytes | 8 bytes |
21049 + *
21050 + * dirid objectid of directory this item is for
21051 + *
21052 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21053 + *
21054 + * H 1 if last 8 bytes of the key contain hash,
21055 + * 0 if last 8 bytes of the key contain prefix-2
21056 + *
21057 + * prefix-1 first 7 characters of file name.
21058 + * Padded by zeroes if name is not long enough.
21059 + *
21060 + * prefix-2 next 8 characters of the file name.
21061 + *
21062 + * hash hash of the rest of file name (i.e., portion of file
21063 + * name not included into prefix-1).
21064 + *
21065 + * File names shorter than 15 (== 7 + 8) characters are completely encoded in
21066 + * the key. Such file names are called "short". They are distinguished by H
21067 + * bit set to 0 in the key.
21068 + *
21069 + * Other file names are "long". For long name, H bit is 1, and first 7
21070 + * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21071 + * key are occupied by hash of the remaining characters of the name.
21072 + *
21073 + * STAT DATA
21074 + *
21075 + * | 60 | 4 | 4 | 60 | 64 |
21076 + * +--------------+---+---+--------------+-----------------+
21077 + * | locality id | 1 | 0 | objectid | 0 |
21078 + * +--------------+---+---+--------------+-----------------+
21079 + * | | | |
21080 + * | 8 bytes | 8 bytes | 8 bytes |
21081 + *
21082 + * locality id object id of a directory where first name was created for
21083 + * the object
21084 + *
21085 + * objectid object id for this object
21086 + *
21087 + * FILE BODY
21088 + *
21089 + * | 60 | 4 | 4 | 60 | 64 |
21090 + * +--------------+---+---+--------------+-----------------+
21091 + * | locality id | 4 | 0 | objectid | offset |
21092 + * +--------------+---+---+--------------+-----------------+
21093 + * | | | |
21094 + * | 8 bytes | 8 bytes | 8 bytes |
21095 + *
21096 + * locality id object id of a directory where first name was created for
21097 + * the object
21098 + *
21099 + * objectid object id for this object
21100 + *
21101 + * offset logical offset from the beginning of this file.
21102 + * Measured in bytes.
21103 + *
21104 + *
21105 + */
21106 +
21107 +#include "debug.h"
21108 +#include "key.h"
21109 +#include "kassign.h"
21110 +#include "vfs_ops.h"
21111 +#include "inode.h"
21112 +#include "super.h"
21113 +#include "dscale.h"
21114 +
21115 +#include <linux/types.h> /* for __u?? */
21116 +#include <linux/fs.h> /* for struct super_block, etc */
21117 +
21118 +/* bitmask for H bit (see comment at the beginning of this file) */
21119 +static const __u64 longname_mark = 0x0100000000000000ull;
21120 +/* bitmask for F and H portions of the key. */
21121 +static const __u64 fibration_mask = 0xff00000000000000ull;
21122 +
21123 +/* return true if name is not completely encoded in @key */
21124 +int is_longname_key(const reiser4_key * key)
21125 +{
21126 + __u64 highpart;
21127 +
21128 + assert("nikita-2863", key != NULL);
21129 + if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21130 + reiser4_print_key("oops", key);
21131 + assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21132 +
21133 + if (REISER4_LARGE_KEY)
21134 + highpart = get_key_ordering(key);
21135 + else
21136 + highpart = get_key_objectid(key);
21137 +
21138 + return (highpart & longname_mark) ? 1 : 0;
21139 +}
21140 +
21141 +/* return true if @name is too long to be completely encoded in the key */
21142 +int is_longname(const char *name UNUSED_ARG, int len)
21143 +{
21144 + if (REISER4_LARGE_KEY)
21145 + return len > 23;
21146 + else
21147 + return len > 15;
21148 +}
21149 +
21150 +/* code ascii string into __u64.
21151 +
21152 + Put characters of @name into result (@str) one after another starting
21153 + from @start_idx-th highest (arithmetically) byte. This produces
21154 + endian-safe encoding. memcpy(2) will not do.
21155 +
21156 +*/
21157 +static __u64 pack_string(const char *name /* string to encode */ ,
21158 + int start_idx /* highest byte in result from
21159 + * which to start encoding */ )
21160 +{
21161 + unsigned i;
21162 + __u64 str;
21163 +
21164 + str = 0;
21165 + for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21166 + str <<= 8;
21167 + str |= (unsigned char)name[i];
21168 + }
21169 + str <<= (sizeof str - i - start_idx) << 3;
21170 + return str;
21171 +}
21172 +
21173 +/* opposite to pack_string(). Takes value produced by pack_string(), restores
21174 + * string encoded in it and stores result in @buf */
21175 +char * reiser4_unpack_string(__u64 value, char *buf)
21176 +{
21177 + do {
21178 + *buf = value >> (64 - 8);
21179 + if (*buf)
21180 + ++buf;
21181 + value <<= 8;
21182 + } while (value != 0);
21183 + *buf = 0;
21184 + return buf;
21185 +}
21186 +
21187 +/* obtain name encoded in @key and store it in @buf */
21188 +char *extract_name_from_key(const reiser4_key * key, char *buf)
21189 +{
21190 + char *c;
21191 +
21192 + assert("nikita-2868", !is_longname_key(key));
21193 +
21194 + c = buf;
21195 + if (REISER4_LARGE_KEY) {
21196 + c = reiser4_unpack_string(get_key_ordering(key) &
21197 + ~fibration_mask, c);
21198 + c = reiser4_unpack_string(get_key_fulloid(key), c);
21199 + } else
21200 + c = reiser4_unpack_string(get_key_fulloid(key) &
21201 + ~fibration_mask, c);
21202 + reiser4_unpack_string(get_key_offset(key), c);
21203 + return buf;
21204 +}
21205 +
21206 +/**
21207 + * complete_entry_key - calculate entry key by name
21208 + * @dir: directory where entry is (or will be) in
21209 + * @name: name to calculate key of
21210 + * @len: length of name
21211 + * @result: place to store result in
21212 + *
21213 + * Sets fields of entry key @result which depend on file name.
21214 + * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21215 + * objectid and offset. Otherwise, objectid and offset are set.
21216 + */
21217 +void complete_entry_key(const struct inode *dir, const char *name,
21218 + int len, reiser4_key *result)
21219 +{
21220 +#if REISER4_LARGE_KEY
21221 + __u64 ordering;
21222 + __u64 objectid;
21223 + __u64 offset;
21224 +
21225 + assert("nikita-1139", dir != NULL);
21226 + assert("nikita-1142", result != NULL);
21227 + assert("nikita-2867", strlen(name) == len);
21228 +
21229 + /*
21230 + * key allocation algorithm for directory entries in case of large
21231 + * keys:
21232 + *
21233 + * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
21234 + * characters into ordering field of key, next 8 characters (if any)
21235 + * into objectid field of key and next 8 ones (if any) into offset
21236 + * field of key
21237 + *
21238 + * If file name is longer than 23 characters, put first 7 characters
21239 + * into key's ordering, next 8 to objectid and hash of remaining
21240 + * characters into offset field.
21241 + *
21242 + * To distinguish the above cases, in the latter set up the unused high bit in
21243 + * ordering field.
21244 + */
21245 +
21246 + /* [0-6] characters to ordering */
21247 + ordering = pack_string(name, 1);
21248 + if (len > 7) {
21249 + /* [7-14] characters to objectid */
21250 + objectid = pack_string(name + 7, 0);
21251 + if (len > 15) {
21252 + if (len <= 23) {
21253 + /* [15-23] characters to offset */
21254 + offset = pack_string(name + 15, 0);
21255 + } else {
21256 + /* note in a key the fact that offset contains hash. */
21257 + ordering |= longname_mark;
21258 +
21259 + /* offset is the hash of the file name's tail. */
21260 + offset = inode_hash_plugin(dir)->hash(name + 15,
21261 + len - 15);
21262 + }
21263 + } else {
21264 + offset = 0ull;
21265 + }
21266 + } else {
21267 + objectid = 0ull;
21268 + offset = 0ull;
21269 + }
21270 +
21271 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21272 + ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21273 +
21274 + set_key_ordering(result, ordering);
21275 + set_key_fulloid(result, objectid);
21276 + set_key_offset(result, offset);
21277 + return;
21278 +
21279 +#else
21280 + __u64 objectid;
21281 + __u64 offset;
21282 +
21283 + assert("nikita-1139", dir != NULL);
21284 + assert("nikita-1142", result != NULL);
21285 + assert("nikita-2867", strlen(name) == len);
21286 +
21287 + /*
21288 + * key allocation algorithm for directory entries in case of not large
21289 + * keys:
21290 + *
21291 + * If name is not longer than 7 + 8 = 15 characters, put first 7
21292 + * characters into objectid field of key, next 8 characters (if any)
21293 + * into offset field of key
21294 + *
21295 + * If file name is longer than 15 characters, put first 7 characters
21296 + * into key's objectid, and hash of remaining characters into offset
21297 + * field.
21298 + *
21299 + * To distinguish the above cases, in the latter set up the unused high bit in
21300 + * objectid field.
21301 + */
21302 +
21303 + /* [0-6] characters to objectid */
21304 + objectid = pack_string(name, 1);
21305 + if (len > 7) {
21306 + if (len <= 15) {
21307 + /* [7-14] characters to offset */
21308 + offset = pack_string(name + 7, 0);
21309 + } else {
21310 + /* note in a key the fact that offset contains hash. */
21311 + objectid |= longname_mark;
21312 +
21313 + /* offset is the hash of the file name. */
21314 + offset = inode_hash_plugin(dir)->hash(name + 7,
21315 + len - 7);
21316 + }
21317 + } else
21318 + offset = 0ull;
21319 +
21320 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21321 + objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21322 +
21323 + set_key_fulloid(result, objectid);
21324 + set_key_offset(result, offset);
21325 + return;
21326 +#endif /* ! REISER4_LARGE_KEY */
21327 +}
21328 +
21329 +/* true, if @key is the key of "." */
21330 +int is_dot_key(const reiser4_key * key /* key to check */ )
21331 +{
21332 + assert("nikita-1717", key != NULL);
21333 + assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21334 + return
21335 + (get_key_ordering(key) == 0ull) &&
21336 + (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21337 +}
21338 +
21339 +/* build key for stat-data.
21340 +
21341 + return key of stat-data of this object. This should became sd plugin
21342 + method in the future. For now, let it be here.
21343 +
21344 +*/
21345 +reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21346 + reiser4_key * result /* resulting key of @target
21347 + stat-data */ )
21348 +{
21349 + assert("nikita-261", result != NULL);
21350 +
21351 + reiser4_key_init(result);
21352 + set_key_locality(result, reiser4_inode_data(target)->locality_id);
21353 + set_key_ordering(result, get_inode_ordering(target));
21354 + set_key_objectid(result, get_inode_oid(target));
21355 + set_key_type(result, KEY_SD_MINOR);
21356 + set_key_offset(result, (__u64) 0);
21357 + return result;
21358 +}
21359 +
21360 +/* encode part of key into &obj_key_id
21361 +
21362 + This encodes into @id part of @key sufficient to restore @key later,
21363 + given that latter is key of object (key of stat-data).
21364 +
21365 + See &obj_key_id
21366 +*/
21367 +int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21368 + obj_key_id * id /* id where key is encoded in */ )
21369 +{
21370 + assert("nikita-1151", key != NULL);
21371 + assert("nikita-1152", id != NULL);
21372 +
21373 + memcpy(id, key, sizeof *id);
21374 + return 0;
21375 +}
21376 +
21377 +/* encode reference to @obj in @id.
21378 +
21379 + This is like build_obj_key_id() above, but takes inode as parameter. */
21380 +int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21381 + obj_key_id * id /* result */ )
21382 +{
21383 + reiser4_key sdkey;
21384 +
21385 + assert("nikita-1166", obj != NULL);
21386 + assert("nikita-1167", id != NULL);
21387 +
21388 + build_sd_key(obj, &sdkey);
21389 + build_obj_key_id(&sdkey, id);
21390 + return 0;
21391 +}
21392 +
21393 +/* decode @id back into @key
21394 +
21395 + Restore key of object stat-data from @id. This is dual to
21396 + build_obj_key_id() above.
21397 +*/
21398 +int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21399 + * from */ ,
21400 + reiser4_key * key /* result */ )
21401 +{
21402 + assert("nikita-1153", id != NULL);
21403 + assert("nikita-1154", key != NULL);
21404 +
21405 + reiser4_key_init(key);
21406 + memcpy(key, id, sizeof *id);
21407 + return 0;
21408 +}
21409 +
21410 +/* extract objectid of directory from key of directory entry within said
21411 + directory.
21412 + */
21413 +oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21414 + * directory
21415 + * entry */ )
21416 +{
21417 + assert("nikita-1314", de_key != NULL);
21418 + return get_key_locality(de_key);
21419 +}
21420 +
21421 +/* encode into @id key of directory entry.
21422 +
21423 + Encode into @id information sufficient to later distinguish directory
21424 + entries within the same directory. This is not whole key, because all
21425 + directory entries within directory item share locality which is equal
21426 + to objectid of their directory.
21427 +
21428 +*/
21429 +int build_de_id(const struct inode *dir /* inode of directory */ ,
21430 + const struct qstr *name /* name to be given to @obj by
21431 + * directory entry being
21432 + * constructed */ ,
21433 + de_id * id /* short key of directory entry */ )
21434 +{
21435 + reiser4_key key;
21436 +
21437 + assert("nikita-1290", dir != NULL);
21438 + assert("nikita-1292", id != NULL);
21439 +
21440 + /* NOTE-NIKITA this is suboptimal. */
21441 + inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21442 + return build_de_id_by_key(&key, id);
21443 +}
21444 +
21445 +/* encode into @id key of directory entry.
21446 +
21447 + Encode into @id information sufficient to later distinguish directory
21448 + entries within the same directory. This is not whole key, because all
21449 + directory entries within directory item share locality which is equal
21450 + to objectid of their directory.
21451 +
21452 +*/
21453 +int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21454 + * entry */ ,
21455 + de_id * id /* short key of directory entry */ )
21456 +{
21457 + memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21458 + return 0;
21459 +}
21460 +
21461 +/* restore from @id key of directory entry.
21462 +
21463 + Function dual to build_de_id(): given @id and locality, build full
21464 + key of directory entry within directory item.
21465 +
21466 +*/
21467 +int extract_key_from_de_id(const oid_t locality /* locality of directory
21468 + * entry */ ,
21469 + const de_id * id /* directory entry id */ ,
21470 + reiser4_key * key /* result */ )
21471 +{
21472 + /* no need to initialise key here: all fields are overwritten */
21473 + memcpy(((__u64 *) key) + 1, id, sizeof *id);
21474 + set_key_locality(key, locality);
21475 + set_key_type(key, KEY_FILE_NAME_MINOR);
21476 + return 0;
21477 +}
21478 +
21479 +/* compare two &de_id's */
21480 +cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21481 + const de_id * id2 /* second &de_id to compare */ )
21482 +{
21483 + /* NOTE-NIKITA ugly implementation */
21484 + reiser4_key k1;
21485 + reiser4_key k2;
21486 +
21487 + extract_key_from_de_id((oid_t) 0, id1, &k1);
21488 + extract_key_from_de_id((oid_t) 0, id2, &k2);
21489 + return keycmp(&k1, &k2);
21490 +}
21491 +
21492 +/* compare &de_id with key */
21493 +cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21494 + const reiser4_key * key /* key to compare */ )
21495 +{
21496 + cmp_t result;
21497 + reiser4_key *k1;
21498 +
21499 + k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21500 + result = KEY_DIFF_EL(k1, key, 1);
21501 + if (result == EQUAL_TO) {
21502 + result = KEY_DIFF_EL(k1, key, 2);
21503 + if (REISER4_LARGE_KEY && result == EQUAL_TO) {
21504 + result = KEY_DIFF_EL(k1, key, 3);
21505 + }
21506 + }
21507 + return result;
21508 +}
21509 +
21510 +/*
21511 + * return number of bytes necessary to encode @inode identity.
21512 + */
21513 +int inode_onwire_size(const struct inode *inode)
21514 +{
21515 + int result;
21516 +
21517 + result = dscale_bytes(get_inode_oid(inode));
21518 + result += dscale_bytes(get_inode_locality(inode));
21519 +
21520 + /*
21521 + * ordering is large (it usually has highest bits set), so it makes
21522 + * little sense to dscale it.
21523 + */
21524 + if (REISER4_LARGE_KEY)
21525 + result += sizeof(get_inode_ordering(inode));
21526 + return result;
21527 +}
21528 +
21529 +/*
21530 + * encode @inode identity at @start
21531 + */
21532 +char *build_inode_onwire(const struct inode *inode, char *start)
21533 +{
21534 + start += dscale_write(start, get_inode_locality(inode));
21535 + start += dscale_write(start, get_inode_oid(inode));
21536 +
21537 + if (REISER4_LARGE_KEY) {
21538 + put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21539 + start += sizeof(get_inode_ordering(inode));
21540 + }
21541 + return start;
21542 +}
21543 +
21544 +/*
21545 + * extract key that was previously encoded by build_inode_onwire() at @addr
21546 + */
21547 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21548 +{
21549 + __u64 val;
21550 +
21551 + addr += dscale_read(addr, &val);
21552 + val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21553 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21554 + addr += dscale_read(addr, &val);
21555 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21556 +#if REISER4_LARGE_KEY
21557 + memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21558 + addr += sizeof key_id->ordering;
21559 +#endif
21560 + return addr;
21561 +}
21562 +
21563 +/* Make Linus happy.
21564 + Local variables:
21565 + c-indentation-style: "K&R"
21566 + mode-name: "LC"
21567 + c-basic-offset: 8
21568 + tab-width: 8
21569 + fill-column: 120
21570 + End:
21571 +*/
21572 diff -urN linux-2.6.20.orig/fs/reiser4/kassign.h linux-2.6.20/fs/reiser4/kassign.h
21573 --- linux-2.6.20.orig/fs/reiser4/kassign.h 1970-01-01 03:00:00.000000000 +0300
21574 +++ linux-2.6.20/fs/reiser4/kassign.h 2007-05-06 14:50:43.734986973 +0400
21575 @@ -0,0 +1,110 @@
21576 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21577 + * reiser4/README */
21578 +
21579 +/* Key assignment policy interface. See kassign.c for details. */
21580 +
21581 +#if !defined( __KASSIGN_H__ )
21582 +#define __KASSIGN_H__
21583 +
21584 +#include "forward.h"
21585 +#include "key.h"
21586 +#include "dformat.h"
21587 +
21588 +#include <linux/types.h> /* for __u?? */
21589 +#include <linux/fs.h> /* for struct super_block, etc */
21590 +#include <linux/dcache.h> /* for struct qstr */
21591 +
21592 +/* key assignment functions */
21593 +
21594 +/* Information from which key of file stat-data can be uniquely
21595 + restored. This depends on key assignment policy for
21596 + stat-data. Currently it's enough to store object id and locality id
21597 + (60+60==120) bits, because minor packing locality and offset of
21598 + stat-data key are always known constants: KEY_SD_MINOR and 0
21599 + respectively. For simplicity 4 bits are wasted in each id, and just
21600 + two 64 bit integers are stored.
21601 +
21602 + This field has to be byte-aligned, because we don't want to waste
21603 + space in directory entries. There is another side of a coin of
21604 + * course: we waste CPU and bus bandwidth instead, by copying data back
21605 + and forth.
21606 +
21607 + Next optimization: &obj_key_id is mainly used to address stat data from
21608 + directory entries. Under the assumption that majority of files only have
21609 + only name (one hard link) from *the* parent directory it seems reasonable
21610 + to only store objectid of stat data and take its locality from key of
21611 + directory item.
21612 +
21613 + This requires some flag to be added to the &obj_key_id to distinguish
21614 + between these two cases. Remaining bits in flag byte are then asking to be
21615 + used to store file type.
21616 +
21617 + This optimization requires changes in directory item handling code.
21618 +
21619 +*/
21620 +typedef struct obj_key_id {
21621 + d8 locality[sizeof(__u64)];
21622 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21623 + )
21624 + d8 objectid[sizeof(__u64)];
21625 +}
21626 +obj_key_id;
21627 +
21628 +/* Information sufficient to uniquely identify directory entry within
21629 + compressed directory item.
21630 +
21631 + For alignment issues see &obj_key_id above.
21632 +*/
21633 +typedef struct de_id {
21634 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21635 + d8 objectid[sizeof(__u64)];
21636 + d8 offset[sizeof(__u64)];
21637 +}
21638 +de_id;
21639 +
21640 +extern int inode_onwire_size(const struct inode *obj);
21641 +extern char *build_inode_onwire(const struct inode *obj, char *area);
21642 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21643 +
21644 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21645 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21646 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21647 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21648 +extern int build_de_id(const struct inode *dir, const struct qstr *name,
21649 + de_id * id);
21650 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21651 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21652 + reiser4_key * key);
21653 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21654 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21655 +
21656 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21657 +extern void build_entry_key_common(const struct inode *dir,
21658 + const struct qstr *name,
21659 + reiser4_key * result);
21660 +extern void build_entry_key_stable_entry(const struct inode *dir,
21661 + const struct qstr *name,
21662 + reiser4_key * result);
21663 +extern int is_dot_key(const reiser4_key * key);
21664 +extern reiser4_key *build_sd_key(const struct inode *target,
21665 + reiser4_key * result);
21666 +
21667 +extern int is_longname_key(const reiser4_key * key);
21668 +extern int is_longname(const char *name, int len);
21669 +extern char *extract_name_from_key(const reiser4_key * key, char *buf);
21670 +extern char *reiser4_unpack_string(__u64 value, char *buf);
21671 +extern void complete_entry_key(const struct inode *dir, const char *name,
21672 + int len, reiser4_key *result);
21673 +
21674 +/* __KASSIGN_H__ */
21675 +#endif
21676 +
21677 +/* Make Linus happy.
21678 + Local variables:
21679 + c-indentation-style: "K&R"
21680 + mode-name: "LC"
21681 + c-basic-offset: 8
21682 + tab-width: 8
21683 + fill-column: 120
21684 + End:
21685 +*/
21686 diff -urN linux-2.6.20.orig/fs/reiser4/Kconfig linux-2.6.20/fs/reiser4/Kconfig
21687 --- linux-2.6.20.orig/fs/reiser4/Kconfig 1970-01-01 03:00:00.000000000 +0300
21688 +++ linux-2.6.20/fs/reiser4/Kconfig 2007-05-06 14:50:43.734986973 +0400
21689 @@ -0,0 +1,32 @@
21690 +config REISER4_FS
21691 + tristate "Reiser4 (EXPERIMENTAL)"
21692 + depends on EXPERIMENTAL
21693 + select ZLIB_INFLATE
21694 + select ZLIB_DEFLATE
21695 + select CRYPTO
21696 + help
21697 + Reiser4 is a filesystem that performs all filesystem operations
21698 + as atomic transactions, which means that it either performs a
21699 + write, or it does not, and in the event of a crash it does not
21700 + partially perform it or corrupt it.
21701 +
21702 + It stores files in dancing trees, which are like balanced trees but
21703 + faster. It packs small files together so that they share blocks
21704 + without wasting space. This means you can use it to store really
21705 + small files. It also means that it saves you disk space. It avoids
21706 + hassling you with anachronisms like having a maximum number of
21707 + inodes, and wasting space if you use less than that number.
21708 +
21709 + Reiser4 is a distinct filesystem type from reiserfs (V3).
21710 + It's therefore not possible to use reiserfs file systems
21711 + with reiser4.
21712 +
21713 + To learn more about reiser4, go to http://www.namesys.com
21714 +
21715 +config REISER4_DEBUG
21716 + bool "Enable reiser4 debug mode"
21717 + depends on REISER4_FS
21718 + help
21719 + Don't use this unless you are debugging reiser4.
21720 +
21721 + If unsure, say N.
21722 diff -urN linux-2.6.20.orig/fs/reiser4/key.c linux-2.6.20/fs/reiser4/key.c
21723 --- linux-2.6.20.orig/fs/reiser4/key.c 1970-01-01 03:00:00.000000000 +0300
21724 +++ linux-2.6.20/fs/reiser4/key.c 2007-05-06 14:50:43.734986973 +0400
21725 @@ -0,0 +1,137 @@
21726 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21727 +
21728 +/* Key manipulations. */
21729 +
21730 +#include "debug.h"
21731 +#include "key.h"
21732 +#include "super.h"
21733 +#include "reiser4.h"
21734 +
21735 +#include <linux/types.h> /* for __u?? */
21736 +
21737 +/* Minimal possible key: all components are zero. It is presumed that this is
21738 + independent of key scheme. */
21739 +static const reiser4_key MINIMAL_KEY = {
21740 + .el = {
21741 + 0ull,
21742 + ON_LARGE_KEY(0ull,)
21743 + 0ull,
21744 + 0ull
21745 + }
21746 +};
21747 +
21748 +/* Maximal possible key: all components are ~0. It is presumed that this is
21749 + independent of key scheme. */
21750 +static const reiser4_key MAXIMAL_KEY = {
21751 + .el = {
21752 + __constant_cpu_to_le64(~0ull),
21753 + ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
21754 + __constant_cpu_to_le64(~0ull),
21755 + __constant_cpu_to_le64(~0ull)
21756 + }
21757 +};
21758 +
21759 +/* Initialize key. */
21760 +void reiser4_key_init(reiser4_key * key /* key to init */ )
21761 +{
21762 + assert("nikita-1169", key != NULL);
21763 + memset(key, 0, sizeof *key);
21764 +}
21765 +
21766 +/* minimal possible key in the tree. Return pointer to the static storage. */
21767 +const reiser4_key *reiser4_min_key(void)
21768 +{
21769 + return &MINIMAL_KEY;
21770 +}
21771 +
21772 +/* maximum possible key in the tree. Return pointer to the static storage. */
21773 +const reiser4_key *reiser4_max_key(void)
21774 +{
21775 + return &MAXIMAL_KEY;
21776 +}
21777 +
21778 +#if REISER4_DEBUG
21779 +/* debugging aid: print symbolic name of key type */
21780 +static const char *type_name(unsigned int key_type /* key type */ )
21781 +{
21782 + switch (key_type) {
21783 + case KEY_FILE_NAME_MINOR:
21784 + return "file name";
21785 + case KEY_SD_MINOR:
21786 + return "stat data";
21787 + case KEY_ATTR_NAME_MINOR:
21788 + return "attr name";
21789 + case KEY_ATTR_BODY_MINOR:
21790 + return "attr body";
21791 + case KEY_BODY_MINOR:
21792 + return "file body";
21793 + default:
21794 + return "unknown";
21795 + }
21796 +}
21797 +
21798 +/* debugging aid: print human readable information about key */
21799 +void reiser4_print_key(const char *prefix /* prefix to print */ ,
21800 + const reiser4_key * key /* key to print */ )
21801 +{
21802 + /* turn bold on */
21803 + /* printf ("\033[1m"); */
21804 + if (key == NULL)
21805 + printk("%s: null key\n", prefix);
21806 + else {
21807 + if (REISER4_LARGE_KEY)
21808 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
21809 + get_key_locality(key),
21810 + get_key_type(key),
21811 + get_key_ordering(key),
21812 + get_key_band(key),
21813 + get_key_objectid(key), get_key_offset(key));
21814 + else
21815 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
21816 + get_key_locality(key),
21817 + get_key_type(key),
21818 + get_key_band(key),
21819 + get_key_objectid(key), get_key_offset(key));
21820 + /*
21821 + * if this is a key of directory entry, try to decode part of
21822 + * a name stored in the key, and output it.
21823 + */
21824 + if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
21825 + char buf[DE_NAME_BUF_LEN];
21826 + char *c;
21827 +
21828 + c = buf;
21829 + c = reiser4_unpack_string(get_key_ordering(key), c);
21830 + reiser4_unpack_string(get_key_fulloid(key), c);
21831 + printk("[%s", buf);
21832 + if (is_longname_key(key))
21833 + /*
21834 + * only part of the name is stored in the key.
21835 + */
21836 + printk("...]\n");
21837 + else {
21838 + /*
21839 + * whole name is stored in the key.
21840 + */
21841 + reiser4_unpack_string(get_key_offset(key), buf);
21842 + printk("%s]\n", buf);
21843 + }
21844 + } else {
21845 + printk("[%s]\n", type_name(get_key_type(key)));
21846 + }
21847 + }
21848 + /* turn bold off */
21849 + /* printf ("\033[m\017"); */
21850 +}
21851 +
21852 +#endif
21853 +
21854 +/* Make Linus happy.
21855 + Local variables:
21856 + c-indentation-style: "K&R"
21857 + mode-name: "LC"
21858 + c-basic-offset: 8
21859 + tab-width: 8
21860 + fill-column: 120
21861 + End:
21862 +*/
21863 diff -urN linux-2.6.20.orig/fs/reiser4/key.h linux-2.6.20/fs/reiser4/key.h
21864 --- linux-2.6.20.orig/fs/reiser4/key.h 1970-01-01 03:00:00.000000000 +0300
21865 +++ linux-2.6.20/fs/reiser4/key.h 2007-05-06 14:50:43.738988223 +0400
21866 @@ -0,0 +1,384 @@
21867 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21868 +
21869 +/* Declarations of key-related data-structures and operations on keys. */
21870 +
21871 +#if !defined( __REISER4_KEY_H__ )
21872 +#define __REISER4_KEY_H__
21873 +
21874 +#include "dformat.h"
21875 +#include "forward.h"
21876 +#include "debug.h"
21877 +
21878 +#include <linux/types.h> /* for __u?? */
21879 +
21880 +/* Operations on keys in reiser4 tree */
21881 +
21882 +/* No access to any of these fields shall be done except via a
21883 + wrapping macro/function, and that wrapping macro/function shall
21884 + convert to little endian order. Compare keys will consider cpu byte order. */
21885 +
21886 +/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
21887 + which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
21888 + within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
21889 + approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
21890 + right one. */
21891 +
21892 +/* possible values for minor packing locality (4 bits required) */
21893 +typedef enum {
21894 + /* file name */
21895 + KEY_FILE_NAME_MINOR = 0,
21896 + /* stat-data */
21897 + KEY_SD_MINOR = 1,
21898 + /* file attribute name */
21899 + KEY_ATTR_NAME_MINOR = 2,
21900 + /* file attribute value */
21901 + KEY_ATTR_BODY_MINOR = 3,
21902 + /* file body (tail or extent) */
21903 + KEY_BODY_MINOR = 4,
21904 +} key_minor_locality;
21905 +
21906 +/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
21907 + Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
21908 + and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
21909 + segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
21910 + block_alloc.c to check the node type when deciding where to allocate the node.
21911 +
21912 + The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
21913 + should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
21914 + current implementation tails have a different minor packing locality from extents, and no files have both extents and
21915 + tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
21916 +*/
21917 +
21918 +/* Arbitrary major packing localities can be assigned to objects using
21919 + the reiser4(filenameA/..packing<=some_number) system call.
21920 +
21921 + In reiser4, the creat() syscall creates a directory
21922 +
21923 + whose default flow (that which is referred to if the directory is
21924 + read as a file) is the traditional unix file body.
21925 +
21926 + whose directory plugin is the 'filedir'
21927 +
21928 + whose major packing locality is that of the parent of the object created.
21929 +
21930 + The static_stat item is a particular commonly used directory
21931 + compression (the one for normal unix files).
21932 +
21933 + The filedir plugin checks to see if the static_stat item exists.
21934 + There is a unique key for static_stat. If yes, then it uses the
21935 + static_stat item for all of the values that it contains. The
21936 + static_stat item contains a flag for each stat it contains which
21937 + indicates whether one should look outside the static_stat item for its
21938 + contents.
21939 +*/
21940 +
21941 +/* offset of fields in reiser4_key. Value of each element of this enum
21942 + is index within key (thought as array of __u64's) where this field
21943 + is. */
21944 +typedef enum {
21945 + /* major "locale", aka dirid. Sits in 1st element */
21946 + KEY_LOCALITY_INDEX = 0,
21947 + /* minor "locale", aka item type. Sits in 1st element */
21948 + KEY_TYPE_INDEX = 0,
21949 + ON_LARGE_KEY(KEY_ORDERING_INDEX,)
21950 + /* "object band". Sits in 2nd element */
21951 + KEY_BAND_INDEX,
21952 + /* objectid. Sits in 2nd element */
21953 + KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
21954 + /* full objectid. Sits in 2nd element */
21955 + KEY_FULLOID_INDEX = KEY_BAND_INDEX,
21956 + /* Offset. Sits in 3rd element */
21957 + KEY_OFFSET_INDEX,
21958 + /* Name hash. Sits in 3rd element */
21959 + KEY_HASH_INDEX = KEY_OFFSET_INDEX,
21960 + KEY_CACHELINE_END = KEY_OFFSET_INDEX,
21961 + KEY_LAST_INDEX
21962 +} reiser4_key_field_index;
21963 +
21964 +/* key in reiser4 internal "balanced" tree. It is just array of three
21965 + 64bit integers in disk byte order (little-endian by default). This
21966 + array is actually indexed by reiser4_key_field. Each __u64 within
21967 + this array is called "element". Logical key component encoded within
21968 + elements are called "fields".
21969 +
21970 + We declare this as union with second component dummy to suppress
21971 + inconvenient array<->pointer casts implied in C. */
21972 +union reiser4_key {
21973 + __le64 el[KEY_LAST_INDEX];
21974 + int pad;
21975 +};
21976 +
21977 +/* bitmasks showing where within reiser4_key particular key is stored. */
21978 +/* major locality occupies higher 60 bits of the first element */
21979 +#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
21980 +
21981 +/* minor locality occupies lower 4 bits of the first element */
21982 +#define KEY_TYPE_MASK 0xfull
21983 +
21984 +/* controversial band occupies higher 4 bits of the 2nd element */
21985 +#define KEY_BAND_MASK 0xf000000000000000ull
21986 +
21987 +/* objectid occupies lower 60 bits of the 2nd element */
21988 +#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
21989 +
21990 +/* full 64bit objectid*/
21991 +#define KEY_FULLOID_MASK 0xffffffffffffffffull
21992 +
21993 +/* offset is just 3rd L.M.Nt itself */
21994 +#define KEY_OFFSET_MASK 0xffffffffffffffffull
21995 +
21996 +/* ordering is whole second element */
21997 +#define KEY_ORDERING_MASK 0xffffffffffffffffull
21998 +
21999 +/* how many bits key element should be shifted to left to get particular field */
22000 +typedef enum {
22001 + KEY_LOCALITY_SHIFT = 4,
22002 + KEY_TYPE_SHIFT = 0,
22003 + KEY_BAND_SHIFT = 60,
22004 + KEY_OBJECTID_SHIFT = 0,
22005 + KEY_FULLOID_SHIFT = 0,
22006 + KEY_OFFSET_SHIFT = 0,
22007 + KEY_ORDERING_SHIFT = 0,
22008 +} reiser4_key_field_shift;
22009 +
22010 +static inline __u64
22011 +get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22012 +{
22013 + assert("nikita-753", key != NULL);
22014 + assert("nikita-754", off < KEY_LAST_INDEX);
22015 + return le64_to_cpu(get_unaligned(&key->el[off]));
22016 +}
22017 +
22018 +static inline void
22019 +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22020 +{
22021 + assert("nikita-755", key != NULL);
22022 + assert("nikita-756", off < KEY_LAST_INDEX);
22023 + put_unaligned(cpu_to_le64(value), &key->el[off]);
22024 +}
22025 +
22026 +/* macro to define getter and setter functions for field F with type T */
22027 +#define DEFINE_KEY_FIELD( L, U, T ) \
22028 +static inline T get_key_ ## L ( const reiser4_key *key ) \
22029 +{ \
22030 + assert( "nikita-750", key != NULL ); \
22031 + return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22032 + KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22033 +} \
22034 + \
22035 +static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22036 +{ \
22037 + __u64 el; \
22038 + \
22039 + assert( "nikita-752", key != NULL ); \
22040 + \
22041 + el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22042 + /* clear field bits in the key */ \
22043 + el &= ~KEY_ ## U ## _MASK; \
22044 + /* actually it should be \
22045 + \
22046 + el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22047 + \
22048 + but we trust user to never pass values that wouldn't fit \
22049 + into field. Clearing extra bits is one operation, but this \
22050 + function is time-critical. \
22051 + But check this in assertion. */ \
22052 + assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22053 + ~KEY_ ## U ## _MASK ) == 0 ); \
22054 + el |= ( loc << KEY_ ## U ## _SHIFT ); \
22055 + set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22056 +}
22057 +
22058 +typedef __u64 oid_t;
22059 +
22060 +/* define get_key_locality(), set_key_locality() */
22061 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22062 +/* define get_key_type(), set_key_type() */
22063 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22064 +/* define get_key_band(), set_key_band() */
22065 +DEFINE_KEY_FIELD(band, BAND, __u64);
22066 +/* define get_key_objectid(), set_key_objectid() */
22067 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22068 +/* define get_key_fulloid(), set_key_fulloid() */
22069 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22070 +/* define get_key_offset(), set_key_offset() */
22071 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22072 +#if (REISER4_LARGE_KEY)
22073 +/* define get_key_ordering(), set_key_ordering() */
22074 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22075 +#else
22076 +static inline __u64 get_key_ordering(const reiser4_key * key)
22077 +{
22078 + return 0;
22079 +}
22080 +
22081 +static inline void set_key_ordering(reiser4_key * key, __u64 val)
22082 +{
22083 +}
22084 +#endif
22085 +
22086 +/* key comparison result */
22087 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
22088 + EQUAL_TO = 0, /* if keys are equal */
22089 + GREATER_THAN = +1 /* if first key is greater than second */
22090 +} cmp_t;
22091 +
22092 +void reiser4_key_init(reiser4_key * key);
22093 +
22094 +/* minimal possible key in the tree. Return pointer to the static storage. */
22095 +extern const reiser4_key *reiser4_min_key(void);
22096 +extern const reiser4_key *reiser4_max_key(void);
22097 +
22098 +/* helper macro for keycmp() */
22099 +#define KEY_DIFF(k1, k2, field) \
22100 +({ \
22101 + typeof (get_key_ ## field (k1)) f1; \
22102 + typeof (get_key_ ## field (k2)) f2; \
22103 + \
22104 + f1 = get_key_ ## field (k1); \
22105 + f2 = get_key_ ## field (k2); \
22106 + \
22107 + (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22108 +})
22109 +
22110 +/* helper macro for keycmp() */
22111 +#define KEY_DIFF_EL(k1, k2, off) \
22112 +({ \
22113 + __u64 e1; \
22114 + __u64 e2; \
22115 + \
22116 + e1 = get_key_el(k1, off); \
22117 + e2 = get_key_el(k2, off); \
22118 + \
22119 + (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22120 +})
22121 +
22122 +/* compare `k1' and `k2'. This function is a heart of "key allocation
22123 + policy". All you need to implement new policy is to add yet another
22124 + clause here. */
22125 +static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22126 + const reiser4_key * k2 /* second key to compare */ )
22127 +{
22128 + cmp_t result;
22129 +
22130 + /*
22131 + * This function is the heart of reiser4 tree-routines. Key comparison
22132 + * is among most heavily used operations in the file system.
22133 + */
22134 +
22135 + assert("nikita-439", k1 != NULL);
22136 + assert("nikita-440", k2 != NULL);
22137 +
22138 + /* there is no actual branch here: condition is compile time constant
22139 + * and constant folding and propagation ensures that only one branch
22140 + * is actually compiled in. */
22141 +
22142 + if (REISER4_PLANA_KEY_ALLOCATION) {
22143 + /* if physical order of fields in a key is identical
22144 + with logical order, we can implement key comparison
22145 + as three 64bit comparisons. */
22146 + /* logical order of fields in plan-a:
22147 + locality->type->objectid->offset. */
22148 + /* compare locality and type at once */
22149 + result = KEY_DIFF_EL(k1, k2, 0);
22150 + if (result == EQUAL_TO) {
22151 + /* compare objectid (and band if it's there) */
22152 + result = KEY_DIFF_EL(k1, k2, 1);
22153 + /* compare offset */
22154 + if (result == EQUAL_TO) {
22155 + result = KEY_DIFF_EL(k1, k2, 2);
22156 + if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22157 + result = KEY_DIFF_EL(k1, k2, 3);
22158 + }
22159 + }
22160 + }
22161 + } else if (REISER4_3_5_KEY_ALLOCATION) {
22162 + result = KEY_DIFF(k1, k2, locality);
22163 + if (result == EQUAL_TO) {
22164 + result = KEY_DIFF(k1, k2, objectid);
22165 + if (result == EQUAL_TO) {
22166 + result = KEY_DIFF(k1, k2, type);
22167 + if (result == EQUAL_TO)
22168 + result = KEY_DIFF(k1, k2, offset);
22169 + }
22170 + }
22171 + } else
22172 + impossible("nikita-441", "Unknown key allocation scheme!");
22173 + return result;
22174 +}
22175 +
22176 +/* true if @k1 equals @k2 */
22177 +static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22178 + const reiser4_key * k2 /* second key to compare */ )
22179 +{
22180 + assert("nikita-1879", k1 != NULL);
22181 + assert("nikita-1880", k2 != NULL);
22182 + return !memcmp(k1, k2, sizeof *k1);
22183 +}
22184 +
22185 +/* true if @k1 is less than @k2 */
22186 +static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22187 + const reiser4_key * k2 /* second key to compare */ )
22188 +{
22189 + assert("nikita-1952", k1 != NULL);
22190 + assert("nikita-1953", k2 != NULL);
22191 + return keycmp(k1, k2) == LESS_THAN;
22192 +}
22193 +
22194 +/* true if @k1 is less than or equal to @k2 */
22195 +static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22196 + const reiser4_key * k2 /* second key to compare */ )
22197 +{
22198 + assert("nikita-1954", k1 != NULL);
22199 + assert("nikita-1955", k2 != NULL);
22200 + return keycmp(k1, k2) != GREATER_THAN;
22201 +}
22202 +
22203 +/* true if @k1 is greater than @k2 */
22204 +static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22205 + const reiser4_key * k2 /* second key to compare */ )
22206 +{
22207 + assert("nikita-1959", k1 != NULL);
22208 + assert("nikita-1960", k2 != NULL);
22209 + return keycmp(k1, k2) == GREATER_THAN;
22210 +}
22211 +
22212 +/* true if @k1 is greater than or equal to @k2 */
22213 +static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22214 + const reiser4_key * k2 /* second key to compare */ )
22215 +{
22216 + assert("nikita-1956", k1 != NULL);
22217 + assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22218 + * November 3: Laika */
22219 + return keycmp(k1, k2) != LESS_THAN;
22220 +}
22221 +
22222 +static inline void prefetchkey(reiser4_key * key)
22223 +{
22224 + prefetch(key);
22225 + prefetch(&key->el[KEY_CACHELINE_END]);
22226 +}
22227 +
22228 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22229 + 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22230 +/* size of a buffer suitable to hold human readable key representation */
22231 +#define KEY_BUF_LEN (80)
22232 +
22233 +#if REISER4_DEBUG
22234 +extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22235 +#else
22236 +#define reiser4_print_key(p,k) noop
22237 +#endif
22238 +
22239 +/* __FS_REISERFS_KEY_H__ */
22240 +#endif
22241 +
22242 +/* Make Linus happy.
22243 + Local variables:
22244 + c-indentation-style: "K&R"
22245 + mode-name: "LC"
22246 + c-basic-offset: 8
22247 + tab-width: 8
22248 + fill-column: 120
22249 + End:
22250 +*/
22251 diff -urN linux-2.6.20.orig/fs/reiser4/ktxnmgrd.c linux-2.6.20/fs/reiser4/ktxnmgrd.c
22252 --- linux-2.6.20.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 03:00:00.000000000 +0300
22253 +++ linux-2.6.20/fs/reiser4/ktxnmgrd.c 2007-05-06 14:50:43.738988223 +0400
22254 @@ -0,0 +1,215 @@
22255 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22256 +/* Transaction manager daemon. */
22257 +
22258 +/*
22259 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22260 + * needed/important for the following reasons:
22261 + *
22262 + * 1. in reiser4 atom is not committed immediately when last transaction
22263 + * handle closes, unless atom is either too old or too large (see
22264 + * atom_should_commit()). This is done to avoid committing too frequently.
22265 + * because:
22266 + *
22267 + * 2. sometimes we don't want to commit atom when closing last transaction
22268 + * handle even if it is old and fat enough. For example, because we are at
22269 + * this point under directory semaphore, and committing would stall all
22270 + * accesses to this directory.
22271 + *
22272 + * ktxnmgrd binds its time sleeping on condition variable. When is awakes
22273 + * either due to (tunable) timeout or because it was explicitly woken up by
22274 + * call to ktxnmgrd_kick(), it scans list of all atoms and commits ones
22275 + * eligible.
22276 + *
22277 + */
22278 +
22279 +#include "debug.h"
22280 +#include "txnmgr.h"
22281 +#include "tree.h"
22282 +#include "ktxnmgrd.h"
22283 +#include "super.h"
22284 +#include "reiser4.h"
22285 +
22286 +#include <linux/sched.h> /* for struct task_struct */
22287 +#include <linux/wait.h>
22288 +#include <linux/suspend.h>
22289 +#include <linux/kernel.h>
22290 +#include <linux/writeback.h>
22291 +#include <linux/kthread.h>
22292 +#include <linux/freezer.h>
22293 +
22294 +static int scan_mgr(struct super_block *);
22295 +
22296 +/*
22297 + * change current->comm so that ps, top, and friends will see changed
22298 + * state. This serves no useful purpose whatsoever, but also costs nothing. May
22299 + * be it will make lonely system administrator feeling less alone at 3 A.M.
22300 + */
22301 +#define set_comm( state ) \
22302 + snprintf( current -> comm, sizeof( current -> comm ), \
22303 + "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22304 +
22305 +/**
22306 + * ktxnmgrd - kernel txnmgr daemon
22307 + * @arg: pointer to super block
22308 + *
22309 + * The background transaction manager daemon, started as a kernel thread during
22310 + * reiser4 initialization.
22311 + */
22312 +static int ktxnmgrd(void *arg)
22313 +{
22314 + struct super_block *super;
22315 + ktxnmgrd_context *ctx;
22316 + txn_mgr *mgr;
22317 + int done = 0;
22318 +
22319 + super = arg;
22320 + mgr = &get_super_private(super)->tmgr;
22321 +
22322 + /*
22323 + * do_fork() just copies task_struct into the new thread. ->fs_context
22324 + * shouldn't be copied of course. This shouldn't be a problem for the
22325 + * rest of the code though.
22326 + */
22327 + current->journal_info = NULL;
22328 + ctx = mgr->daemon;
22329 + while (1) {
22330 + try_to_freeze();
22331 + set_comm("wait");
22332 + {
22333 + DEFINE_WAIT(__wait);
22334 +
22335 + prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22336 + if (kthread_should_stop()) {
22337 + done = 1;
22338 + } else
22339 + schedule_timeout(ctx->timeout);
22340 + finish_wait(&ctx->wait, &__wait);
22341 + }
22342 + if (done)
22343 + break;
22344 + set_comm("run");
22345 + spin_lock(&ctx->guard);
22346 + /*
22347 + * wait timed out or ktxnmgrd was woken up by explicit request
22348 + * to commit something. Scan list of atoms in txnmgr and look
22349 + * for too old atoms.
22350 + */
22351 + do {
22352 + ctx->rescan = 0;
22353 + scan_mgr(super);
22354 + spin_lock(&ctx->guard);
22355 + if (ctx->rescan) {
22356 + /*
22357 + * the list could be modified while ctx
22358 + * spinlock was released, we have to repeat
22359 + * scanning from the beginning
22360 + */
22361 + break;
22362 + }
22363 + } while (ctx->rescan);
22364 + spin_unlock(&ctx->guard);
22365 + }
22366 + return 0;
22367 +}
22368 +
22369 +#undef set_comm
22370 +
22371 +/**
22372 + * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22373 + * @super: pointer to super block
22374 + *
22375 + * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22376 + * manager. Starts kernel txnmgr daemon. This is called on mount.
22377 + */
22378 +int reiser4_init_ktxnmgrd(struct super_block *super)
22379 +{
22380 + txn_mgr *mgr;
22381 + ktxnmgrd_context *ctx;
22382 +
22383 + mgr = &get_super_private(super)->tmgr;
22384 +
22385 + assert("zam-1014", mgr->daemon == NULL);
22386 +
22387 + ctx = kmalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22388 + if (ctx == NULL)
22389 + return RETERR(-ENOMEM);
22390 +
22391 + assert("nikita-2442", ctx != NULL);
22392 +
22393 + memset(ctx, 0, sizeof *ctx);
22394 + init_waitqueue_head(&ctx->wait);
22395 +
22396 + /*kcond_init(&ctx->startup);*/
22397 + spin_lock_init(&ctx->guard);
22398 + ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22399 + ctx->rescan = 1;
22400 + mgr->daemon = ctx;
22401 +
22402 + ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22403 + if (IS_ERR(ctx->tsk)) {
22404 + int ret = PTR_ERR(ctx->tsk);
22405 + mgr->daemon = NULL;
22406 + kfree(ctx);
22407 + return RETERR(ret);
22408 + }
22409 + return 0;
22410 +}
22411 +
22412 +void ktxnmgrd_kick(txn_mgr *mgr)
22413 +{
22414 + assert("nikita-3234", mgr != NULL);
22415 + assert("nikita-3235", mgr->daemon != NULL);
22416 + wake_up(&mgr->daemon->wait);
22417 +}
22418 +
22419 +int is_current_ktxnmgrd(void)
22420 +{
22421 + return (get_current_super_private()->tmgr.daemon->tsk == current);
22422 +}
22423 +
22424 +/**
22425 + * scan_mgr - commit atoms which are to be committed
22426 + * @super: super block to commit atoms of
22427 + *
22428 + * Commits old atoms.
22429 + */
22430 +static int scan_mgr(struct super_block *super)
22431 +{
22432 + int ret;
22433 + reiser4_context ctx;
22434 +
22435 + init_stack_context(&ctx, super);
22436 +
22437 + ret = commit_some_atoms(&get_super_private(super)->tmgr);
22438 +
22439 + reiser4_exit_context(&ctx);
22440 + return ret;
22441 +}
22442 +
22443 +/**
22444 + * reiser4_done_ktxnmgrd - stop kernel thread and frees ktxnmgrd context
22445 + * @mgr:
22446 + *
22447 + * This is called on umount. Stops ktxnmgrd and free t
22448 + */
22449 +void reiser4_done_ktxnmgrd(struct super_block *super)
22450 +{
22451 + txn_mgr *mgr;
22452 +
22453 + mgr = &get_super_private(super)->tmgr;
22454 + assert("zam-1012", mgr->daemon != NULL);
22455 +
22456 + kthread_stop(mgr->daemon->tsk);
22457 + kfree(mgr->daemon);
22458 + mgr->daemon = NULL;
22459 +}
22460 +
22461 +/*
22462 + * Local variables:
22463 + * c-indentation-style: "K&R"
22464 + * mode-name: "LC"
22465 + * c-basic-offset: 8
22466 + * tab-width: 8
22467 + * fill-column: 120
22468 + * End:
22469 + */
22470 diff -urN linux-2.6.20.orig/fs/reiser4/ktxnmgrd.h linux-2.6.20/fs/reiser4/ktxnmgrd.h
22471 --- linux-2.6.20.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 03:00:00.000000000 +0300
22472 +++ linux-2.6.20/fs/reiser4/ktxnmgrd.h 2007-05-06 14:50:43.738988223 +0400
22473 @@ -0,0 +1,52 @@
22474 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22475 + * reiser4/README */
22476 +
22477 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22478 +
22479 +#ifndef __KTXNMGRD_H__
22480 +#define __KTXNMGRD_H__
22481 +
22482 +#include "txnmgr.h"
22483 +
22484 +#include <linux/fs.h>
22485 +#include <linux/wait.h>
22486 +#include <linux/completion.h>
22487 +#include <linux/spinlock.h>
22488 +#include <asm/atomic.h>
22489 +#include <linux/sched.h> /* for struct task_struct */
22490 +
22491 +/* in this structure all data necessary to start up, shut down and communicate
22492 + * with ktxnmgrd are kept. */
22493 +struct ktxnmgrd_context {
22494 + /* wait queue head on which ktxnmgrd sleeps */
22495 + wait_queue_head_t wait;
22496 + /* spin lock protecting all fields of this structure */
22497 + spinlock_t guard;
22498 + /* timeout of sleeping on ->wait */
22499 + signed long timeout;
22500 + /* kernel thread running ktxnmgrd */
22501 + struct task_struct *tsk;
22502 + /* list of all file systems served by this ktxnmgrd */
22503 + struct list_head queue;
22504 + /* should ktxnmgrd repeat scanning of atoms? */
22505 + unsigned int rescan:1;
22506 +};
22507 +
22508 +extern int reiser4_init_ktxnmgrd(struct super_block *);
22509 +extern void reiser4_done_ktxnmgrd(struct super_block *);
22510 +
22511 +extern void ktxnmgrd_kick(txn_mgr * mgr);
22512 +extern int is_current_ktxnmgrd(void);
22513 +
22514 +/* __KTXNMGRD_H__ */
22515 +#endif
22516 +
22517 +/* Make Linus happy.
22518 + Local variables:
22519 + c-indentation-style: "K&R"
22520 + mode-name: "LC"
22521 + c-basic-offset: 8
22522 + tab-width: 8
22523 + fill-column: 120
22524 + End:
22525 +*/
22526 diff -urN linux-2.6.20.orig/fs/reiser4/lock.c linux-2.6.20/fs/reiser4/lock.c
22527 --- linux-2.6.20.orig/fs/reiser4/lock.c 1970-01-01 03:00:00.000000000 +0300
22528 +++ linux-2.6.20/fs/reiser4/lock.c 2007-05-06 14:50:43.742989473 +0400
22529 @@ -0,0 +1,1232 @@
22530 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22531 + * reiser4/README */
22532 +
22533 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22534 + order. V4 balances the tree from the bottom up, and searches the tree from
22535 + the top down, and that is really the way we want it, so tradition won't work
22536 + for us.
22537 +
22538 + Instead we have two lock orderings, a high priority lock ordering, and a low
22539 + priority lock ordering. Each node in the tree has a lock in its znode.
22540 +
22541 + Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22542 + has a set (maybe empty) of already locked nodes ("process locked set"). Each
22543 + process may have a pending lock request to a node locked by another process.
22544 + Note: we lock and unlock, but do not transfer locks: it is possible
22545 + transferring locks instead would save some bus locking....
22546 +
22547 + Deadlock occurs when we have a loop constructed from process locked sets and
22548 + lock request vectors.
22549 +
22550 + NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22551 + memory is extended with "znodes" with which we connect nodes with their left
22552 + and right neighbors using sibling pointers stored in the znodes. When we
22553 + perform balancing operations we often go from left to right and from right to
22554 + left.
22555 +
22556 + +-P1-+ +-P3-+
22557 + |+--+| V1 |+--+|
22558 + ||N1|| -------> ||N3||
22559 + |+--+| |+--+|
22560 + +----+ +----+
22561 + ^ |
22562 + |V2 |V3
22563 + | v
22564 + +---------P2---------+
22565 + |+--+ +--+|
22566 + ||N2| -------- |N4||
22567 + |+--+ +--+|
22568 + +--------------------+
22569 +
22570 + We solve this by ensuring that only low priority processes lock in top to
22571 + bottom order and from right to left, and high priority processes lock from
22572 + bottom to top and left to right.
22573 +
22574 + ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22575 + kill those damn busy loops.
22576 + ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22577 + stage) cannot be ordered that way. There are no rules what nodes can belong
22578 + to the atom and what nodes cannot. We cannot define what is right or left
22579 + direction, what is top or bottom. We can take immediate parent or side
22580 + neighbor of one node, but nobody guarantees that, say, left neighbor node is
22581 + not a far right neighbor for other nodes from the same atom. It breaks
22582 + deadlock avoidance rules and hi-low priority locking cannot be applied for
22583 + atom locks.
22584 +
22585 + How does it help to avoid deadlocks ?
22586 +
22587 + Suppose we have a deadlock with n processes. Processes from one priority
22588 + class never deadlock because they take locks in one consistent
22589 + order.
22590 +
22591 + So, any possible deadlock loop must have low priority as well as high
22592 + priority processes. There are no other lock priority levels except low and
22593 + high. We know that any deadlock loop contains at least one node locked by a
22594 + low priority process and requested by a high priority process. If this
22595 + situation is caught and resolved it is sufficient to avoid deadlocks.
22596 +
22597 + V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22598 +
22599 + The deadlock prevention algorithm is based on comparing
22600 + priorities of node owners (processes which keep znode locked) and
22601 + requesters (processes which want to acquire a lock on znode). We
22602 + implement a scheme where low-priority owners yield locks to
22603 + high-priority requesters. We created a signal passing system that
22604 + is used to ask low-priority processes to yield one or more locked
22605 + znodes.
22606 +
22607 + The condition when a znode needs to change its owners is described by the
22608 + following formula:
22609 +
22610 + #############################################
22611 + # #
22612 + # (number of high-priority requesters) > 0 #
22613 + # AND #
22614 + # (numbers of high-priority owners) == 0 #
22615 + # #
22616 + #############################################
22617 +
22618 + Note that a low-priority process delays node releasing if another
22619 + high-priority process owns this node. So, slightly more strictly speaking,
22620 + to have a deadlock capable cycle you must have a loop in which a high
22621 + priority process is waiting on a low priority process to yield a node, which
22622 + is slightly different from saying a high priority process is waiting on a
22623 + node owned by a low priority process.
22624 +
22625 + It is enough to avoid deadlocks if we prevent any low-priority process from
22626 + falling asleep if its locked set contains a node which satisfies the
22627 + deadlock condition.
22628 +
22629 + That condition is implicitly or explicitly checked in all places where new
22630 + high-priority requests may be added or removed from node request queue or
22631 + high-priority process takes or releases a lock on node. The main
22632 + goal of these checks is to never lose the moment when node becomes "has
22633 + wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
22634 + at that time.
22635 +
22636 + The information about received signals is stored in the per-process
22637 + structure (lock stack) and analyzed before a low-priority process goes to
22638 + sleep but after a "fast" attempt to lock a node fails. Any signal wakes
22639 + sleeping process up and forces him to re-check lock status and received
22640 + signal info. If "must-yield-this-lock" signals were received the locking
22641 + primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
22642 +
22643 + V4 LOCKING DRAWBACKS
22644 +
22645 + If we have already balanced on one level, and we are propagating our changes
22646 + upward to a higher level, it could be very messy to surrender all locks on
22647 + the lower level because we put so much computational work into it, and
22648 + reverting them to their state before they were locked might be very complex.
22649 + We also don't want to acquire all locks before performing balancing because
22650 + that would either be almost as much work as the balancing, or it would be
22651 + too conservative and lock too much. We want balancing to be done only at
22652 + high priority. Yet, we might want to go to the left one node and use some
22653 + of its empty space... So we make one attempt at getting the node to the left
22654 + using try_lock, and if it fails we do without it, because we didn't really
22655 + need it, it was only a nice to have.
22656 +
22657 + LOCK STRUCTURES DESCRIPTION
22658 +
22659 + The following data structures are used in the reiser4 locking
22660 + implementation:
22661 +
22662 + All fields related to long-term locking are stored in znode->lock.
22663 +
22664 + The lock stack is a per thread object. It owns all znodes locked by the
22665 + thread. One znode may be locked by several threads in case of read lock or
22666 + one znode may be write locked by one thread several times. The special link
22667 + objects (lock handles) support n<->m relation between znodes and lock
22668 + owners.
22669 +
22670 + <Thread 1> <Thread 2>
22671 +
22672 + +---------+ +---------+
22673 + | LS1 | | LS2 |
22674 + +---------+ +---------+
22675 + ^ ^
22676 + |---------------+ +----------+
22677 + v v v v
22678 + +---------+ +---------+ +---------+ +---------+
22679 + | LH1 | | LH2 | | LH3 | | LH4 |
22680 + +---------+ +---------+ +---------+ +---------+
22681 + ^ ^ ^ ^
22682 + | +------------+ |
22683 + v v v
22684 + +---------+ +---------+ +---------+
22685 + | Z1 | | Z2 | | Z3 |
22686 + +---------+ +---------+ +---------+
22687 +
22688 + Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
22689 + picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
22690 + LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
22691 + Z1 is locked by only one thread, znode has only one lock handle LH1 on its
22692 + list, similar situation is for Z3 which is locked by the thread 2 only. Z2
22693 + is locked (for read) twice by different threads and two lock handles are on
22694 + its list. Each lock handle represents a single relation of a locking of a
22695 + znode by a thread. Locking of a znode is an establishing of a locking
22696 + relation between the lock stack and the znode by adding of a new lock handle
22697 + to a list of lock handles, the lock stack. The lock stack links all lock
22698 + handles for all znodes locked by the lock stack. The znode list groups all
22699 + lock handles for all locks stacks which locked the znode.
22700 +
22701 + Yet another relation may exist between znode and lock owners. If lock
22702 + procedure cannot immediately take lock on an object it adds the lock owner
22703 + on special `requestors' list belongs to znode. That list represents a
22704 + queue of pending lock requests. Because one lock owner may request only
22705 + only one lock object at a time, it is a 1->n relation between lock objects
22706 + and a lock owner implemented as it is described above. Full information
22707 + (priority, pointers to lock and link objects) about each lock request is
22708 + stored in lock owner structure in `request' field.
22709 +
22710 + SHORT_TERM LOCKING
22711 +
22712 + This is a list of primitive operations over lock stacks / lock handles /
22713 + znodes and locking descriptions for them.
22714 +
22715 + 1. locking / unlocking which is done by two list insertion/deletion, one
22716 + to/from znode's list of lock handles, another one is to/from lock stack's
22717 + list of lock handles. The first insertion is protected by
22718 + znode->lock.guard spinlock. The list owned by the lock stack can be
22719 + modified only by thread who owns the lock stack and nobody else can
22720 + modify/read it. There is nothing to be protected by a spinlock or
22721 + something else.
22722 +
22723 + 2. adding/removing a lock request to/from znode requesters list. The rule is
22724 + that znode->lock.guard spinlock should be taken for this.
22725 +
22726 + 3. we can traverse list of lock handles and use references to lock stacks who
22727 + locked given znode if znode->lock.guard spinlock is taken.
22728 +
22729 + 4. If a lock stack is associated with a znode as a lock requestor or lock
22730 + owner its existence is guaranteed by znode->lock.guard spinlock. Some its
22731 + (lock stack's) fields should be protected from being accessed in parallel
22732 + by two or more threads. Please look at lock_stack structure definition
22733 + for the info how those fields are protected. */
22734 +
22735 +/* Znode lock and capturing intertwining. */
22736 +/* In current implementation we capture formatted nodes before locking
22737 + them. Take a look on longterm lock znode, reiser4_try_capture() request
22738 + precedes locking requests. The longterm_lock_znode function unconditionally
22739 + captures znode before even checking of locking conditions.
22740 +
22741 + Another variant is to capture znode after locking it. It was not tested, but
22742 + at least one deadlock condition is supposed to be there. One thread has
22743 + locked a znode (Node-1) and calls reiser4_try_capture() for it.
22744 + reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
22745 + Second thread is a flushing thread, its current atom is the atom Node-1
22746 + belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
22747 + is locked by the first thread. The described situation is a deadlock. */
22748 +
22749 +#include "debug.h"
22750 +#include "txnmgr.h"
22751 +#include "znode.h"
22752 +#include "jnode.h"
22753 +#include "tree.h"
22754 +#include "plugin/node/node.h"
22755 +#include "super.h"
22756 +
22757 +#include <linux/spinlock.h>
22758 +
22759 +#if REISER4_DEBUG
22760 +static int request_is_deadlock_safe(znode *, znode_lock_mode,
22761 + znode_lock_request);
22762 +#endif
22763 +
22764 +/* Returns a lock owner associated with current thread */
22765 +lock_stack *get_current_lock_stack(void)
22766 +{
22767 + return &get_current_context()->stack;
22768 +}
22769 +
22770 +/* Wakes up all low priority owners informing them about possible deadlock */
22771 +static void wake_up_all_lopri_owners(znode * node)
22772 +{
22773 + lock_handle *handle;
22774 +
22775 + assert_spin_locked(&(node->lock.guard));
22776 + list_for_each_entry(handle, &node->lock.owners, owners_link) {
22777 + assert("nikita-1832", handle->node == node);
22778 + /* count this signal in owner->nr_signaled */
22779 + if (!handle->signaled) {
22780 + handle->signaled = 1;
22781 + atomic_inc(&handle->owner->nr_signaled);
22782 + /* Wake up a single process */
22783 + reiser4_wake_up(handle->owner);
22784 + }
22785 + }
22786 +}
22787 +
22788 +/* Adds a lock to a lock owner, which means creating a link to the lock and
22789 + putting the link into the two lists all links are on (the doubly linked list
22790 + that forms the lock_stack, and the doubly linked list of links attached
22791 + to a lock.
22792 +*/
22793 +static inline void
22794 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
22795 +{
22796 + assert("jmacd-810", handle->owner == NULL);
22797 + assert_spin_locked(&(node->lock.guard));
22798 +
22799 + handle->owner = owner;
22800 + handle->node = node;
22801 +
22802 + assert("reiser4-4",
22803 + ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
22804 +
22805 + /* add lock handle to the end of lock_stack's list of locks */
22806 + list_add_tail(&handle->locks_link, &owner->locks);
22807 + ON_DEBUG(owner->nr_locks++);
22808 + reiser4_ctx_gfp_mask_set();
22809 +
22810 + /* add lock handle to the head of znode's list of owners */
22811 + list_add(&handle->owners_link, &node->lock.owners);
22812 + handle->signaled = 0;
22813 +}
22814 +
22815 +/* Breaks a relation between a lock and its owner */
22816 +static inline void unlink_object(lock_handle * handle)
22817 +{
22818 + assert("zam-354", handle->owner != NULL);
22819 + assert("nikita-1608", handle->node != NULL);
22820 + assert_spin_locked(&(handle->node->lock.guard));
22821 + assert("nikita-1829", handle->owner == get_current_lock_stack());
22822 + assert("reiser4-5", handle->owner->nr_locks > 0);
22823 +
22824 + /* remove lock handle from lock_stack's list of locks */
22825 + list_del(&handle->locks_link);
22826 + ON_DEBUG(handle->owner->nr_locks--);
22827 + reiser4_ctx_gfp_mask_set();
22828 + assert("reiser4-6",
22829 + ergo(list_empty_careful(&handle->owner->locks),
22830 + handle->owner->nr_locks == 0));
22831 + /* remove lock handle from znode's list of owners */
22832 + list_del(&handle->owners_link);
22833 + /* indicates that lock handle is free now */
22834 + handle->node = NULL;
22835 +#if REISER4_DEBUG
22836 + INIT_LIST_HEAD(&handle->locks_link);
22837 + INIT_LIST_HEAD(&handle->owners_link);
22838 + handle->owner = NULL;
22839 +#endif
22840 +}
22841 +
22842 +/* Actually locks an object knowing that we are able to do this */
22843 +static void lock_object(lock_stack * owner)
22844 +{
22845 + lock_request *request;
22846 + znode *node;
22847 +
22848 + request = &owner->request;
22849 + node = request->node;
22850 + assert_spin_locked(&(node->lock.guard));
22851 + if (request->mode == ZNODE_READ_LOCK) {
22852 + node->lock.nr_readers++;
22853 + } else {
22854 + /* check that we don't switched from read to write lock */
22855 + assert("nikita-1840", node->lock.nr_readers <= 0);
22856 + /* We allow recursive locking; a node can be locked several
22857 + times for write by same process */
22858 + node->lock.nr_readers--;
22859 + }
22860 +
22861 + link_object(request->handle, owner, node);
22862 +
22863 + if (owner->curpri) {
22864 + node->lock.nr_hipri_owners++;
22865 + }
22866 +}
22867 +
22868 +/* Check for recursive write locking */
22869 +static int recursive(lock_stack * owner)
22870 +{
22871 + int ret;
22872 + znode *node;
22873 + lock_handle *lh;
22874 +
22875 + node = owner->request.node;
22876 +
22877 + /* Owners list is not empty for a locked node */
22878 + assert("zam-314", !list_empty_careful(&node->lock.owners));
22879 + assert("nikita-1841", owner == get_current_lock_stack());
22880 + assert_spin_locked(&(node->lock.guard));
22881 +
22882 + lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
22883 + ret = (lh->owner == owner);
22884 +
22885 + /* Recursive read locking should be done usual way */
22886 + assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
22887 + /* mixing of read/write locks is not allowed */
22888 + assert("zam-341", !ret || znode_is_wlocked(node));
22889 +
22890 + return ret;
22891 +}
22892 +
22893 +#if REISER4_DEBUG
22894 +/* Returns true if the lock is held by the calling thread. */
22895 +int znode_is_any_locked(const znode * node)
22896 +{
22897 + lock_handle *handle;
22898 + lock_stack *stack;
22899 + int ret;
22900 +
22901 + if (!znode_is_locked(node)) {
22902 + return 0;
22903 + }
22904 +
22905 + stack = get_current_lock_stack();
22906 +
22907 + spin_lock_stack(stack);
22908 +
22909 + ret = 0;
22910 +
22911 + list_for_each_entry(handle, &stack->locks, locks_link) {
22912 + if (handle->node == node) {
22913 + ret = 1;
22914 + break;
22915 + }
22916 + }
22917 +
22918 + spin_unlock_stack(stack);
22919 +
22920 + return ret;
22921 +}
22922 +
22923 +#endif
22924 +
22925 +/* Returns true if a write lock is held by the calling thread. */
22926 +int znode_is_write_locked(const znode * node)
22927 +{
22928 + lock_stack *stack;
22929 + lock_handle *handle;
22930 +
22931 + assert("jmacd-8765", node != NULL);
22932 +
22933 + if (!znode_is_wlocked(node)) {
22934 + return 0;
22935 + }
22936 +
22937 + stack = get_current_lock_stack();
22938 +
22939 + /*
22940 + * When znode is write locked, all owner handles point to the same lock
22941 + * stack. Get pointer to lock stack from the first lock handle from
22942 + * znode's owner list
22943 + */
22944 + handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
22945 +
22946 + return (handle->owner == stack);
22947 +}
22948 +
22949 +/* This "deadlock" condition is the essential part of reiser4 locking
22950 + implementation. This condition is checked explicitly by calling
22951 + check_deadlock_condition() or implicitly in all places where znode lock
22952 + state (set of owners and request queue) is changed. Locking code is
22953 + designed to use this condition to trigger procedure of passing object from
22954 + low priority owner(s) to high priority one(s).
22955 +
22956 + The procedure results in passing an event (setting lock_handle->signaled
22957 + flag) and counting this event in nr_signaled field of owner's lock stack
22958 + object and wakeup owner's process.
22959 +*/
22960 +static inline int check_deadlock_condition(znode * node)
22961 +{
22962 + assert_spin_locked(&(node->lock.guard));
22963 + return node->lock.nr_hipri_requests > 0
22964 + && node->lock.nr_hipri_owners == 0;
22965 +}
22966 +
22967 +static int check_livelock_condition(znode * node, znode_lock_mode mode)
22968 +{
22969 + zlock * lock = &node->lock;
22970 +
22971 + return mode == ZNODE_READ_LOCK &&
22972 + lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
22973 +}
22974 +
22975 +/* checks lock/request compatibility */
22976 +static int can_lock_object(lock_stack * owner)
22977 +{
22978 + znode *node = owner->request.node;
22979 +
22980 + assert_spin_locked(&(node->lock.guard));
22981 +
22982 + /* See if the node is disconnected. */
22983 + if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
22984 + return RETERR(-EINVAL);
22985 +
22986 + /* Do not ever try to take a lock if we are going in low priority
22987 + direction and a node have a high priority request without high
22988 + priority owners. */
22989 + if (unlikely(!owner->curpri && check_deadlock_condition(node)))
22990 + return RETERR(-E_REPEAT);
22991 + if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
22992 + return RETERR(-E_REPEAT);
22993 + if (unlikely(!is_lock_compatible(node, owner->request.mode)))
22994 + return RETERR(-E_REPEAT);
22995 + return 0;
22996 +}
22997 +
22998 +/* Setting of a high priority to the process. It clears "signaled" flags
22999 + because znode locked by high-priority process can't satisfy our "deadlock
23000 + condition". */
23001 +static void set_high_priority(lock_stack * owner)
23002 +{
23003 + assert("nikita-1846", owner == get_current_lock_stack());
23004 + /* Do nothing if current priority is already high */
23005 + if (!owner->curpri) {
23006 + /* We don't need locking for owner->locks list, because, this
23007 + * function is only called with the lock stack of the current
23008 + * thread, and no other thread can play with owner->locks list
23009 + * and/or change ->node pointers of lock handles in this list.
23010 + *
23011 + * (Interrupts also are not involved.)
23012 + */
23013 + lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23014 + while (&owner->locks != &item->locks_link) {
23015 + znode *node = item->node;
23016 +
23017 + spin_lock_zlock(&node->lock);
23018 +
23019 + node->lock.nr_hipri_owners++;
23020 +
23021 + /* we can safely set signaled to zero, because
23022 + previous statement (nr_hipri_owners ++) guarantees
23023 + that signaled will be never set again. */
23024 + item->signaled = 0;
23025 + spin_unlock_zlock(&node->lock);
23026 +
23027 + item = list_entry(item->locks_link.next, lock_handle, locks_link);
23028 + }
23029 + owner->curpri = 1;
23030 + atomic_set(&owner->nr_signaled, 0);
23031 + }
23032 +}
23033 +
23034 +/* Sets a low priority to the process. */
23035 +static void set_low_priority(lock_stack * owner)
23036 +{
23037 + assert("nikita-3075", owner == get_current_lock_stack());
23038 + /* Do nothing if current priority is already low */
23039 + if (owner->curpri) {
23040 + /* scan all locks (lock handles) held by @owner, which is
23041 + actually current thread, and check whether we are reaching
23042 + deadlock possibility anywhere.
23043 + */
23044 + lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23045 + while (&owner->locks != &handle->locks_link) {
23046 + znode *node = handle->node;
23047 + spin_lock_zlock(&node->lock);
23048 + /* this thread just was hipri owner of @node, so
23049 + nr_hipri_owners has to be greater than zero. */
23050 + assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23051 + node->lock.nr_hipri_owners--;
23052 + /* If we have deadlock condition, adjust a nr_signaled
23053 + field. It is enough to set "signaled" flag only for
23054 + current process, other low-pri owners will be
23055 + signaled and waken up after current process unlocks
23056 + this object and any high-priority requestor takes
23057 + control. */
23058 + if (check_deadlock_condition(node)
23059 + && !handle->signaled) {
23060 + handle->signaled = 1;
23061 + atomic_inc(&owner->nr_signaled);
23062 + }
23063 + spin_unlock_zlock(&node->lock);
23064 + handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23065 + }
23066 + owner->curpri = 0;
23067 + }
23068 +}
23069 +
23070 +static void remove_lock_request(lock_stack * requestor)
23071 +{
23072 + zlock * lock = &requestor->request.node->lock;
23073 +
23074 + if (requestor->curpri) {
23075 + assert("nikita-1838", lock->nr_hipri_requests > 0);
23076 + lock->nr_hipri_requests--;
23077 + if (requestor->request.mode == ZNODE_WRITE_LOCK)
23078 + lock->nr_hipri_write_requests --;
23079 + }
23080 + list_del(&requestor->requestors_link);
23081 +}
23082 +
23083 +static void invalidate_all_lock_requests(znode * node)
23084 +{
23085 + lock_stack *requestor, *tmp;
23086 +
23087 + assert_spin_locked(&(node->lock.guard));
23088 +
23089 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23090 + remove_lock_request(requestor);
23091 + requestor->request.ret_code = -EINVAL;
23092 + reiser4_wake_up(requestor);
23093 + requestor->request.mode = ZNODE_NO_LOCK;
23094 + }
23095 +}
23096 +
23097 +static void dispatch_lock_requests(znode * node)
23098 +{
23099 + lock_stack *requestor, *tmp;
23100 +
23101 + assert_spin_locked(&(node->lock.guard));
23102 +
23103 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23104 + if (znode_is_write_locked(node))
23105 + break;
23106 + if (!can_lock_object(requestor)) {
23107 + lock_object(requestor);
23108 + remove_lock_request(requestor);
23109 + requestor->request.ret_code = 0;
23110 + reiser4_wake_up(requestor);
23111 + requestor->request.mode = ZNODE_NO_LOCK;
23112 + }
23113 + }
23114 +}
23115 +
23116 +/* release long-term lock, acquired by longterm_lock_znode() */
23117 +void longterm_unlock_znode(lock_handle * handle)
23118 +{
23119 + znode *node = handle->node;
23120 + lock_stack *oldowner = handle->owner;
23121 + int hipri;
23122 + int readers;
23123 + int rdelta;
23124 + int youdie;
23125 +
23126 + /*
23127 + * this is time-critical and highly optimized code. Modify carefully.
23128 + */
23129 +
23130 + assert("jmacd-1021", handle != NULL);
23131 + assert("jmacd-1022", handle->owner != NULL);
23132 + assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23133 +
23134 + assert("zam-130", oldowner == get_current_lock_stack());
23135 +
23136 + LOCK_CNT_DEC(long_term_locked_znode);
23137 +
23138 + /*
23139 + * to minimize amount of operations performed under lock, pre-compute
23140 + * all variables used within critical section. This makes code
23141 + * obscure.
23142 + */
23143 +
23144 + /* was this lock of hi or lo priority */
23145 + hipri = oldowner->curpri ? 1 : 0;
23146 + /* number of readers */
23147 + readers = node->lock.nr_readers;
23148 + /* +1 if write lock, -1 if read lock */
23149 + rdelta = (readers > 0) ? -1 : +1;
23150 + /* true if node is to die and write lock is released */
23151 + youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23152 +
23153 + spin_lock_zlock(&node->lock);
23154 +
23155 + assert("zam-101", znode_is_locked(node));
23156 +
23157 + /* Adjust a number of high priority owners of this lock */
23158 + assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23159 + node->lock.nr_hipri_owners -= hipri;
23160 +
23161 + /* Handle znode deallocation on last write-lock release. */
23162 + if (znode_is_wlocked_once(node)) {
23163 + if (youdie) {
23164 + forget_znode(handle);
23165 + assert("nikita-2191", znode_invariant(node));
23166 + zput(node);
23167 + return;
23168 + }
23169 + }
23170 +
23171 + if (handle->signaled)
23172 + atomic_dec(&oldowner->nr_signaled);
23173 +
23174 + /* Unlocking means owner<->object link deletion */
23175 + unlink_object(handle);
23176 +
23177 + /* This is enough to be sure whether an object is completely
23178 + unlocked. */
23179 + node->lock.nr_readers += rdelta;
23180 +
23181 + /* If the node is locked it must have an owners list. Likewise, if
23182 + the node is unlocked it must have an empty owners list. */
23183 + assert("zam-319", equi(znode_is_locked(node),
23184 + !list_empty_careful(&node->lock.owners)));
23185 +
23186 +#if REISER4_DEBUG
23187 + if (!znode_is_locked(node))
23188 + ++node->times_locked;
23189 +#endif
23190 +
23191 + /* If there are pending lock requests we wake up a requestor */
23192 + if (!znode_is_wlocked(node))
23193 + dispatch_lock_requests(node);
23194 + if (check_deadlock_condition(node))
23195 + wake_up_all_lopri_owners(node);
23196 + spin_unlock_zlock(&node->lock);
23197 +
23198 + /* minus one reference from handle->node */
23199 + assert("nikita-2190", znode_invariant(node));
23200 + ON_DEBUG(check_lock_data());
23201 + ON_DEBUG(check_lock_node_data(node));
23202 + zput(node);
23203 +}
23204 +
23205 +/* final portion of longterm-lock */
23206 +static int
23207 +lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23208 +{
23209 + znode *node = owner->request.node;
23210 +
23211 + assert_spin_locked(&(node->lock.guard));
23212 +
23213 + /* If we broke with (ok == 0) it means we can_lock, now do it. */
23214 + if (ok == 0) {
23215 + lock_object(owner);
23216 + owner->request.mode = 0;
23217 + /* count a reference from lockhandle->node
23218 +
23219 + znode was already referenced at the entry to this function,
23220 + hence taking spin-lock here is not necessary (see comment
23221 + in the zref()).
23222 + */
23223 + zref(node);
23224 +
23225 + LOCK_CNT_INC(long_term_locked_znode);
23226 + }
23227 + spin_unlock_zlock(&node->lock);
23228 + ON_DEBUG(check_lock_data());
23229 + ON_DEBUG(check_lock_node_data(node));
23230 + return ok;
23231 +}
23232 +
23233 +/*
23234 + * version of longterm_znode_lock() optimized for the most common case: read
23235 + * lock without any special flags. This is the kind of lock that any tree
23236 + * traversal takes on the root node of the tree, which is very frequent.
23237 + */
23238 +static int longterm_lock_tryfast(lock_stack * owner)
23239 +{
23240 + int result;
23241 + znode *node;
23242 + zlock *lock;
23243 +
23244 + node = owner->request.node;
23245 + lock = &node->lock;
23246 +
23247 + assert("nikita-3340", reiser4_schedulable());
23248 + assert("nikita-3341", request_is_deadlock_safe(node,
23249 + ZNODE_READ_LOCK,
23250 + ZNODE_LOCK_LOPRI));
23251 + spin_lock_zlock(lock);
23252 + result = can_lock_object(owner);
23253 + spin_unlock_zlock(lock);
23254 +
23255 + if (likely(result != -EINVAL)) {
23256 + spin_lock_znode(node);
23257 + result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23258 + spin_unlock_znode(node);
23259 + spin_lock_zlock(lock);
23260 + if (unlikely(result != 0)) {
23261 + owner->request.mode = 0;
23262 + } else {
23263 + result = can_lock_object(owner);
23264 + if (unlikely(result == -E_REPEAT)) {
23265 + /* fall back to longterm_lock_znode() */
23266 + spin_unlock_zlock(lock);
23267 + return 1;
23268 + }
23269 + }
23270 + return lock_tail(owner, result, ZNODE_READ_LOCK);
23271 + } else
23272 + return 1;
23273 +}
23274 +
23275 +/* locks given lock object */
23276 +int longterm_lock_znode(
23277 + /* local link object (allocated by lock owner thread, usually on its own
23278 + * stack) */
23279 + lock_handle * handle,
23280 + /* znode we want to lock. */
23281 + znode * node,
23282 + /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23283 + znode_lock_mode mode,
23284 + /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23285 + znode_lock_request request) {
23286 + int ret;
23287 + int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23288 + int non_blocking = 0;
23289 + int has_atom;
23290 + txn_capture cap_flags;
23291 + zlock *lock;
23292 + txn_handle *txnh;
23293 + tree_level level;
23294 +
23295 + /* Get current process context */
23296 + lock_stack *owner = get_current_lock_stack();
23297 +
23298 + /* Check that the lock handle is initialized and isn't already being
23299 + * used. */
23300 + assert("jmacd-808", handle->owner == NULL);
23301 + assert("nikita-3026", reiser4_schedulable());
23302 + assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23303 + assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23304 + /* long term locks are not allowed in the VM contexts (->writepage(),
23305 + * prune_{d,i}cache()).
23306 + *
23307 + * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23308 + * bug caused by d_splice_alias() only working for directories.
23309 + */
23310 + assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23311 + assert ("zam-1055", mode != ZNODE_NO_LOCK);
23312 +
23313 + cap_flags = 0;
23314 + if (request & ZNODE_LOCK_NONBLOCK) {
23315 + cap_flags |= TXN_CAPTURE_NONBLOCKING;
23316 + non_blocking = 1;
23317 + }
23318 +
23319 + if (request & ZNODE_LOCK_DONT_FUSE)
23320 + cap_flags |= TXN_CAPTURE_DONT_FUSE;
23321 +
23322 + /* If we are changing our process priority we must adjust a number
23323 + of high priority owners for each znode that we already lock */
23324 + if (hipri) {
23325 + set_high_priority(owner);
23326 + } else {
23327 + set_low_priority(owner);
23328 + }
23329 +
23330 + level = znode_get_level(node);
23331 +
23332 + /* Fill request structure with our values. */
23333 + owner->request.mode = mode;
23334 + owner->request.handle = handle;
23335 + owner->request.node = node;
23336 +
23337 + txnh = get_current_context()->trans;
23338 + lock = &node->lock;
23339 +
23340 + if (mode == ZNODE_READ_LOCK && request == 0) {
23341 + ret = longterm_lock_tryfast(owner);
23342 + if (ret <= 0)
23343 + return ret;
23344 + }
23345 +
23346 + has_atom = (txnh->atom != NULL);
23347 +
23348 + /* Synchronize on node's zlock guard lock. */
23349 + spin_lock_zlock(lock);
23350 +
23351 + if (znode_is_locked(node) &&
23352 + mode == ZNODE_WRITE_LOCK && recursive(owner))
23353 + return lock_tail(owner, 0, mode);
23354 +
23355 + for (;;) {
23356 + /* Check the lock's availability: if it is unavaiable we get
23357 + E_REPEAT, 0 indicates "can_lock", otherwise the node is
23358 + invalid. */
23359 + ret = can_lock_object(owner);
23360 +
23361 + if (unlikely(ret == -EINVAL)) {
23362 + /* @node is dying. Leave it alone. */
23363 + break;
23364 + }
23365 +
23366 + if (unlikely(ret == -E_REPEAT && non_blocking)) {
23367 + /* either locking of @node by the current thread will
23368 + * lead to the deadlock, or lock modes are
23369 + * incompatible. */
23370 + break;
23371 + }
23372 +
23373 + assert("nikita-1844", (ret == 0)
23374 + || ((ret == -E_REPEAT) && !non_blocking));
23375 + /* If we can get the lock... Try to capture first before
23376 + taking the lock. */
23377 +
23378 + /* first handle commonest case where node and txnh are already
23379 + * in the same atom. */
23380 + /* safe to do without taking locks, because:
23381 + *
23382 + * 1. read of aligned word is atomic with respect to writes to
23383 + * this word
23384 + *
23385 + * 2. false negatives are handled in reiser4_try_capture().
23386 + *
23387 + * 3. false positives are impossible.
23388 + *
23389 + * PROOF: left as an exercise to the curious reader.
23390 + *
23391 + * Just kidding. Here is one:
23392 + *
23393 + * At the time T0 txnh->atom is stored in txnh_atom.
23394 + *
23395 + * At the time T1 node->atom is stored in node_atom.
23396 + *
23397 + * At the time T2 we observe that
23398 + *
23399 + * txnh_atom != NULL && node_atom == txnh_atom.
23400 + *
23401 + * Imagine that at this moment we acquire node and txnh spin
23402 + * lock in this order. Suppose that under spin lock we have
23403 + *
23404 + * node->atom != txnh->atom, (S1)
23405 + *
23406 + * at the time T3.
23407 + *
23408 + * txnh->atom != NULL still, because txnh is open by the
23409 + * current thread.
23410 + *
23411 + * Suppose node->atom == NULL, that is, node was un-captured
23412 + * between T1, and T3. But un-capturing of formatted node is
23413 + * always preceded by the call to reiser4_invalidate_lock(),
23414 + * which marks znode as JNODE_IS_DYING under zlock spin
23415 + * lock. Contradiction, because can_lock_object() above checks
23416 + * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23417 + *
23418 + * Suppose that node->atom != node_atom, that is, atom, node
23419 + * belongs to was fused into another atom: node_atom was fused
23420 + * into node->atom. Atom of txnh was equal to node_atom at T2,
23421 + * which means that under spin lock, txnh->atom == node->atom,
23422 + * because txnh->atom can only follow fusion
23423 + * chain. Contradicts S1.
23424 + *
23425 + * The same for hypothesis txnh->atom != txnh_atom. Hence,
23426 + * node->atom == node_atom == txnh_atom == txnh->atom. Again
23427 + * contradicts S1. Hence S1 is false. QED.
23428 + *
23429 + */
23430 +
23431 + if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23432 + ;
23433 + } else {
23434 + /*
23435 + * unlock zlock spin lock here. It is possible for
23436 + * longterm_unlock_znode() to sneak in here, but there
23437 + * is no harm: reiser4_invalidate_lock() will mark znode
23438 + * as JNODE_IS_DYING and this will be noted by
23439 + * can_lock_object() below.
23440 + */
23441 + spin_unlock_zlock(lock);
23442 + spin_lock_znode(node);
23443 + ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags);
23444 + spin_unlock_znode(node);
23445 + spin_lock_zlock(lock);
23446 + if (unlikely(ret != 0)) {
23447 + /* In the failure case, the txnmgr releases
23448 + the znode's lock (or in some cases, it was
23449 + released a while ago). There's no need to
23450 + reacquire it so we should return here,
23451 + avoid releasing the lock. */
23452 + owner->request.mode = 0;
23453 + break;
23454 + }
23455 +
23456 + /* Check the lock's availability again -- this is
23457 + because under some circumstances the capture code
23458 + has to release and reacquire the znode spinlock. */
23459 + ret = can_lock_object(owner);
23460 + }
23461 +
23462 + /* This time, a return of (ret == 0) means we can lock, so we
23463 + should break out of the loop. */
23464 + if (likely(ret != -E_REPEAT || non_blocking))
23465 + break;
23466 +
23467 + /* Lock is unavailable, we have to wait. */
23468 + ret = reiser4_prepare_to_sleep(owner);
23469 + if (unlikely(ret != 0))
23470 + break;
23471 +
23472 + assert_spin_locked(&(node->lock.guard));
23473 + if (hipri) {
23474 + /* If we are going in high priority direction then
23475 + increase high priority requests counter for the
23476 + node */
23477 + lock->nr_hipri_requests++;
23478 + if (mode == ZNODE_WRITE_LOCK)
23479 + lock->nr_hipri_write_requests ++;
23480 + /* If there are no high priority owners for a node,
23481 + then immediately wake up low priority owners, so
23482 + they can detect possible deadlock */
23483 + if (lock->nr_hipri_owners == 0)
23484 + wake_up_all_lopri_owners(node);
23485 + }
23486 + list_add_tail(&owner->requestors_link, &lock->requestors);
23487 +
23488 + /* Ok, here we have prepared a lock request, so unlock
23489 + a znode ... */
23490 + spin_unlock_zlock(lock);
23491 + /* ... and sleep */
23492 + reiser4_go_to_sleep(owner);
23493 + if (owner->request.mode == ZNODE_NO_LOCK)
23494 + goto request_is_done;
23495 + spin_lock_zlock(lock);
23496 + if (owner->request.mode == ZNODE_NO_LOCK) {
23497 + spin_unlock_zlock(lock);
23498 + request_is_done:
23499 + if (owner->request.ret_code == 0) {
23500 + LOCK_CNT_INC(long_term_locked_znode);
23501 + zref(node);
23502 + }
23503 + return owner->request.ret_code;
23504 + }
23505 + remove_lock_request(owner);
23506 + }
23507 +
23508 + return lock_tail(owner, ret, mode);
23509 +}
23510 +
23511 +/* lock object invalidation means changing of lock object state to `INVALID'
23512 + and waiting for all other processes to cancel their lock requests. */
23513 +void reiser4_invalidate_lock(lock_handle * handle /* path to lock
23514 + * owner and lock
23515 + * object is being
23516 + * invalidated. */ )
23517 +{
23518 + znode *node = handle->node;
23519 + lock_stack *owner = handle->owner;
23520 +
23521 + assert("zam-325", owner == get_current_lock_stack());
23522 + assert("zam-103", znode_is_write_locked(node));
23523 + assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23524 + assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23525 + assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23526 + assert("nikita-3097", znode_is_wlocked_once(node));
23527 + assert_spin_locked(&(node->lock.guard));
23528 +
23529 + if (handle->signaled)
23530 + atomic_dec(&owner->nr_signaled);
23531 +
23532 + ZF_SET(node, JNODE_IS_DYING);
23533 + unlink_object(handle);
23534 + node->lock.nr_readers = 0;
23535 +
23536 + invalidate_all_lock_requests(node);
23537 + spin_unlock_zlock(&node->lock);
23538 +}
23539 +
23540 +/* Initializes lock_stack. */
23541 +void init_lock_stack(lock_stack * owner /* pointer to
23542 + * allocated
23543 + * structure. */ )
23544 +{
23545 + INIT_LIST_HEAD(&owner->locks);
23546 + INIT_LIST_HEAD(&owner->requestors_link);
23547 + spin_lock_init(&owner->sguard);
23548 + owner->curpri = 1;
23549 + init_waitqueue_head(&owner->wait);
23550 +}
23551 +
23552 +/* Initializes lock object. */
23553 +void reiser4_init_lock(zlock * lock /* pointer on allocated
23554 + * uninitialized lock object
23555 + * structure. */ )
23556 +{
23557 + memset(lock, 0, sizeof(zlock));
23558 + spin_lock_init(&lock->guard);
23559 + INIT_LIST_HEAD(&lock->requestors);
23560 + INIT_LIST_HEAD(&lock->owners);
23561 +}
23562 +
23563 +/* Transfer a lock handle (presumably so that variables can be moved between stack and
23564 + heap locations). */
23565 +static void
23566 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23567 +{
23568 + znode *node = old->node;
23569 + lock_stack *owner = old->owner;
23570 + int signaled;
23571 +
23572 + /* locks_list, modified by link_object() is not protected by
23573 + anything. This is valid because only current thread ever modifies
23574 + locks_list of its lock_stack.
23575 + */
23576 + assert("nikita-1827", owner == get_current_lock_stack());
23577 + assert("nikita-1831", new->owner == NULL);
23578 +
23579 + spin_lock_zlock(&node->lock);
23580 +
23581 + signaled = old->signaled;
23582 + if (unlink_old) {
23583 + unlink_object(old);
23584 + } else {
23585 + if (node->lock.nr_readers > 0) {
23586 + node->lock.nr_readers += 1;
23587 + } else {
23588 + node->lock.nr_readers -= 1;
23589 + }
23590 + if (signaled) {
23591 + atomic_inc(&owner->nr_signaled);
23592 + }
23593 + if (owner->curpri) {
23594 + node->lock.nr_hipri_owners += 1;
23595 + }
23596 + LOCK_CNT_INC(long_term_locked_znode);
23597 +
23598 + zref(node);
23599 + }
23600 + link_object(new, owner, node);
23601 + new->signaled = signaled;
23602 +
23603 + spin_unlock_zlock(&node->lock);
23604 +}
23605 +
23606 +void move_lh(lock_handle * new, lock_handle * old)
23607 +{
23608 + move_lh_internal(new, old, /*unlink_old */ 1);
23609 +}
23610 +
23611 +void copy_lh(lock_handle * new, lock_handle * old)
23612 +{
23613 + move_lh_internal(new, old, /*unlink_old */ 0);
23614 +}
23615 +
23616 +/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
23617 +int reiser4_check_deadlock(void)
23618 +{
23619 + lock_stack *owner = get_current_lock_stack();
23620 + return atomic_read(&owner->nr_signaled) != 0;
23621 +}
23622 +
23623 +/* Before going to sleep we re-check "release lock" requests which might come from threads with hi-pri lock
23624 + priorities. */
23625 +int reiser4_prepare_to_sleep(lock_stack * owner)
23626 +{
23627 + assert("nikita-1847", owner == get_current_lock_stack());
23628 +
23629 + /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23630 + * counted in nr_signaled */
23631 + if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23632 + assert("zam-959", !owner->curpri);
23633 + return RETERR(-E_DEADLOCK);
23634 + }
23635 + return 0;
23636 +}
23637 +
23638 +/* Wakes up a single thread */
23639 +void __reiser4_wake_up(lock_stack * owner)
23640 +{
23641 + atomic_set(&owner->wakeup, 1);
23642 + wake_up(&owner->wait);
23643 +}
23644 +
23645 +/* Puts a thread to sleep */
23646 +void reiser4_go_to_sleep(lock_stack * owner)
23647 +{
23648 + /* Well, we might sleep here, so holding of any spinlocks is no-no */
23649 + assert("nikita-3027", reiser4_schedulable());
23650 +
23651 + wait_event(owner->wait, atomic_read(&owner->wakeup));
23652 + atomic_set(&owner->wakeup, 0);
23653 +}
23654 +
23655 +int lock_stack_isclean(lock_stack * owner)
23656 +{
23657 + if (list_empty_careful(&owner->locks)) {
23658 + assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23659 + return 1;
23660 + }
23661 +
23662 + return 0;
23663 +}
23664 +
23665 +#if REISER4_DEBUG
23666 +
23667 +/*
23668 + * debugging functions
23669 + */
23670 +
23671 +static void list_check(struct list_head *head)
23672 +{
23673 + struct list_head *pos;
23674 +
23675 + list_for_each(pos, head)
23676 + assert("", (pos->prev != NULL && pos->next != NULL &&
23677 + pos->prev->next == pos && pos->next->prev == pos));
23678 +}
23679 +
23680 +/* check consistency of locking data-structures hanging of the @stack */
23681 +static void check_lock_stack(lock_stack * stack)
23682 +{
23683 + spin_lock_stack(stack);
23684 + /* check that stack->locks is not corrupted */
23685 + list_check(&stack->locks);
23686 + spin_unlock_stack(stack);
23687 +}
23688 +
23689 +/* check consistency of locking data structures */
23690 +void check_lock_data(void)
23691 +{
23692 + check_lock_stack(&get_current_context()->stack);
23693 +}
23694 +
23695 +/* check consistency of locking data structures for @node */
23696 +void check_lock_node_data(znode * node)
23697 +{
23698 + spin_lock_zlock(&node->lock);
23699 + list_check(&node->lock.owners);
23700 + list_check(&node->lock.requestors);
23701 + spin_unlock_zlock(&node->lock);
23702 +}
23703 +
23704 +/* check that given lock request is dead lock safe. This check is, of course,
23705 + * not exhaustive. */
23706 +static int
23707 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
23708 + znode_lock_request request)
23709 +{
23710 + lock_stack *owner;
23711 +
23712 + owner = get_current_lock_stack();
23713 + /*
23714 + * check that hipri lock request is not issued when there are locked
23715 + * nodes at the higher levels.
23716 + */
23717 + if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
23718 + znode_get_level(node) != 0) {
23719 + lock_handle *item;
23720 +
23721 + list_for_each_entry(item, &owner->locks, locks_link) {
23722 + znode *other;
23723 +
23724 + other = item->node;
23725 +
23726 + if (znode_get_level(other) == 0)
23727 + continue;
23728 + if (znode_get_level(other) > znode_get_level(node))
23729 + return 0;
23730 + }
23731 + }
23732 + return 1;
23733 +}
23734 +
23735 +#endif
23736 +
23737 +/* return pointer to static storage with name of lock_mode. For
23738 + debugging */
23739 +const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
23740 +{
23741 + if (lock == ZNODE_READ_LOCK)
23742 + return "read";
23743 + else if (lock == ZNODE_WRITE_LOCK)
23744 + return "write";
23745 + else {
23746 + static char buf[30];
23747 +
23748 + sprintf(buf, "unknown: %i", lock);
23749 + return buf;
23750 + }
23751 +}
23752 +
23753 +/* Make Linus happy.
23754 + Local variables:
23755 + c-indentation-style: "K&R"
23756 + mode-name: "LC"
23757 + c-basic-offset: 8
23758 + tab-width: 8
23759 + fill-column: 79
23760 + End:
23761 +*/
23762 diff -urN linux-2.6.20.orig/fs/reiser4/lock.h linux-2.6.20/fs/reiser4/lock.h
23763 --- linux-2.6.20.orig/fs/reiser4/lock.h 1970-01-01 03:00:00.000000000 +0300
23764 +++ linux-2.6.20/fs/reiser4/lock.h 2007-05-06 14:50:43.742989473 +0400
23765 @@ -0,0 +1,249 @@
23766 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
23767 +
23768 +/* Long term locking data structures. See lock.c for details. */
23769 +
23770 +#ifndef __LOCK_H__
23771 +#define __LOCK_H__
23772 +
23773 +#include "forward.h"
23774 +#include "debug.h"
23775 +#include "dformat.h"
23776 +#include "key.h"
23777 +#include "coord.h"
23778 +#include "plugin/node/node.h"
23779 +#include "txnmgr.h"
23780 +#include "readahead.h"
23781 +
23782 +#include <linux/types.h>
23783 +#include <linux/spinlock.h>
23784 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
23785 +#include <asm/atomic.h>
23786 +#include <linux/wait.h>
23787 +
23788 +/* Per-znode lock object */
23789 +struct zlock {
23790 + spinlock_t guard;
23791 + /* The number of readers if positive; the number of recursively taken
23792 + write locks if negative. Protected by zlock spin lock. */
23793 + int nr_readers;
23794 + /* A number of processes (lock_stacks) that have this object
23795 + locked with high priority */
23796 + unsigned nr_hipri_owners;
23797 + /* A number of attempts to lock znode in high priority direction */
23798 + unsigned nr_hipri_requests;
23799 + /* A number of write-lock attempts in high priority direction */
23800 + unsigned nr_hipri_write_requests;
23801 + /* A linked list of lock_handle objects of all lock_stacks which
23802 + struct list_head owners;
23803 + /* A linked list of lock_stacks that wait for this lock */
23804 + struct list_head requestors;
23805 +};
23806 +
23807 +static inline void spin_lock_zlock(zlock *lock)
23808 +{
23809 + /* check that zlock is not locked */
23810 + assert("", LOCK_CNT_NIL(spin_locked_zlock));
23811 + /* check that spinlocks of lower priorities are not held */
23812 + assert("", LOCK_CNT_NIL(spin_locked_stack));
23813 +
23814 + spin_lock(&lock->guard);
23815 +
23816 + LOCK_CNT_INC(spin_locked_zlock);
23817 + LOCK_CNT_INC(spin_locked);
23818 +}
23819 +
23820 +static inline void spin_unlock_zlock(zlock *lock)
23821 +{
23822 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
23823 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
23824 +
23825 + LOCK_CNT_DEC(spin_locked_zlock);
23826 + LOCK_CNT_DEC(spin_locked);
23827 +
23828 + spin_unlock(&lock->guard);
23829 +}
23830 +
23831 +#define lock_is_locked(lock) ((lock)->nr_readers != 0)
23832 +#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
23833 +#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
23834 +#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
23835 +#define lock_can_be_rlocked(lock) ((lock)->nr_readers >=0)
23836 +#define lock_mode_compatible(lock, mode) \
23837 + (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
23838 + ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
23839 +
23840 +/* Since we have R/W znode locks we need additional bidirectional `link'
23841 + objects to implement n<->m relationship between lock owners and lock
23842 + objects. We call them `lock handles'.
23843 +
23844 + Locking: see lock.c/"SHORT-TERM LOCKING"
23845 +*/
23846 +struct lock_handle {
23847 + /* This flag indicates that a signal to yield a lock was passed to
23848 + lock owner and counted in owner->nr_signaled
23849 +
23850 + Locking: this is accessed under spin lock on ->node.
23851 + */
23852 + int signaled;
23853 + /* A link to owner of a lock */
23854 + lock_stack *owner;
23855 + /* A link to znode locked */
23856 + znode *node;
23857 + /* A list of all locks for a process */
23858 + struct list_head locks_link;
23859 + /* A list of all owners for a znode */
23860 + struct list_head owners_link;
23861 +};
23862 +
23863 +typedef struct lock_request {
23864 + /* A pointer to uninitialized link object */
23865 + lock_handle *handle;
23866 + /* A pointer to the object we want to lock */
23867 + znode *node;
23868 + /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
23869 + znode_lock_mode mode;
23870 + /* how dispatch_lock_requests() returns lock request result code */
23871 + int ret_code;
23872 +} lock_request;
23873 +
23874 +/* A lock stack structure for accumulating locks owned by a process */
23875 +struct lock_stack {
23876 + /* A guard lock protecting a lock stack */
23877 + spinlock_t sguard;
23878 + /* number of znodes which were requested by high priority processes */
23879 + atomic_t nr_signaled;
23880 + /* Current priority of a process
23881 +
23882 + This is only accessed by the current thread and thus requires no
23883 + locking.
23884 + */
23885 + int curpri;
23886 + /* A list of all locks owned by this process. Elements can be added to
23887 + * this list only by the current thread. ->node pointers in this list
23888 + * can be only changed by the current thread. */
23889 + struct list_head locks;
23890 + /* When lock_stack waits for the lock, it puts itself on double-linked
23891 + requestors list of that lock */
23892 + struct list_head requestors_link;
23893 + /* Current lock request info.
23894 +
23895 + This is only accessed by the current thread and thus requires no
23896 + locking.
23897 + */
23898 + lock_request request;
23899 + /* the following two fields are the lock stack's
23900 + * synchronization object to use with the standard linux/wait.h
23901 + * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
23902 + * usage details. */
23903 + wait_queue_head_t wait;
23904 + atomic_t wakeup;
23905 +#if REISER4_DEBUG
23906 + int nr_locks; /* number of lock handles in the above list */
23907 +#endif
23908 +};
23909 +
23910 +/*
23911 + User-visible znode locking functions
23912 +*/
23913 +
23914 +extern int longterm_lock_znode(lock_handle * handle,
23915 + znode * node,
23916 + znode_lock_mode mode,
23917 + znode_lock_request request);
23918 +
23919 +extern void longterm_unlock_znode(lock_handle * handle);
23920 +
23921 +extern int reiser4_check_deadlock(void);
23922 +
23923 +extern lock_stack *get_current_lock_stack(void);
23924 +
23925 +extern void init_lock_stack(lock_stack * owner);
23926 +extern void reiser4_init_lock(zlock * lock);
23927 +
23928 +static inline void init_lh(lock_handle *lh)
23929 +{
23930 +#if REISER4_DEBUG
23931 + memset(lh, 0, sizeof *lh);
23932 + INIT_LIST_HEAD(&lh->locks_link);
23933 + INIT_LIST_HEAD(&lh->owners_link);
23934 +#else
23935 + lh->node = NULL;
23936 +#endif
23937 +}
23938 +
23939 +static inline void done_lh(lock_handle *lh)
23940 +{
23941 + assert("zam-342", lh != NULL);
23942 + if (lh->node != NULL)
23943 + longterm_unlock_znode(lh);
23944 +}
23945 +
23946 +extern void move_lh(lock_handle * new, lock_handle * old);
23947 +extern void copy_lh(lock_handle * new, lock_handle * old);
23948 +
23949 +extern int reiser4_prepare_to_sleep(lock_stack * owner);
23950 +extern void reiser4_go_to_sleep(lock_stack * owner);
23951 +extern void __reiser4_wake_up(lock_stack * owner);
23952 +
23953 +extern int lock_stack_isclean(lock_stack * owner);
23954 +
23955 +/* zlock object state check macros: only used in assertions. Both forms imply that the
23956 + lock is held by the current thread. */
23957 +extern int znode_is_write_locked(const znode *);
23958 +extern void reiser4_invalidate_lock(lock_handle *);
23959 +
23960 +/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
23961 +#define spin_ordering_pred_stack(stack) \
23962 + (LOCK_CNT_NIL(spin_locked_stack) && \
23963 + LOCK_CNT_NIL(spin_locked_txnmgr) && \
23964 + LOCK_CNT_NIL(spin_locked_inode) && \
23965 + LOCK_CNT_NIL(rw_locked_cbk_cache) && \
23966 + LOCK_CNT_NIL(spin_locked_super_eflush) )
23967 +
23968 +static inline void spin_lock_stack(lock_stack *stack)
23969 +{
23970 + assert("", spin_ordering_pred_stack(stack));
23971 + spin_lock(&(stack->sguard));
23972 + LOCK_CNT_INC(spin_locked_stack);
23973 + LOCK_CNT_INC(spin_locked);
23974 +}
23975 +
23976 +static inline void spin_unlock_stack(lock_stack *stack)
23977 +{
23978 + assert_spin_locked(&(stack->sguard));
23979 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
23980 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
23981 + LOCK_CNT_DEC(spin_locked_stack);
23982 + LOCK_CNT_DEC(spin_locked);
23983 + spin_unlock(&(stack->sguard));
23984 +}
23985 +
23986 +static inline void reiser4_wake_up(lock_stack * owner)
23987 +{
23988 + spin_lock_stack(owner);
23989 + __reiser4_wake_up(owner);
23990 + spin_unlock_stack(owner);
23991 +}
23992 +
23993 +const char *lock_mode_name(znode_lock_mode lock);
23994 +
23995 +#if REISER4_DEBUG
23996 +extern void check_lock_data(void);
23997 +extern void check_lock_node_data(znode * node);
23998 +#else
23999 +#define check_lock_data() noop
24000 +#define check_lock_node_data() noop
24001 +#endif
24002 +
24003 +/* __LOCK_H__ */
24004 +#endif
24005 +
24006 +/* Make Linus happy.
24007 + Local variables:
24008 + c-indentation-style: "K&R"
24009 + mode-name: "LC"
24010 + c-basic-offset: 8
24011 + tab-width: 8
24012 + fill-column: 120
24013 + End:
24014 +*/
24015 diff -urN linux-2.6.20.orig/fs/reiser4/Makefile linux-2.6.20/fs/reiser4/Makefile
24016 --- linux-2.6.20.orig/fs/reiser4/Makefile 1970-01-01 03:00:00.000000000 +0300
24017 +++ linux-2.6.20/fs/reiser4/Makefile 2007-05-06 14:50:43.742989473 +0400
24018 @@ -0,0 +1,99 @@
24019 +#
24020 +# reiser4/Makefile
24021 +#
24022 +
24023 +obj-$(CONFIG_REISER4_FS) += reiser4.o
24024 +
24025 +reiser4-y := \
24026 + debug.o \
24027 + jnode.o \
24028 + znode.o \
24029 + key.o \
24030 + pool.o \
24031 + tree_mod.o \
24032 + estimate.o \
24033 + carry.o \
24034 + carry_ops.o \
24035 + lock.o \
24036 + tree.o \
24037 + context.o \
24038 + tap.o \
24039 + coord.o \
24040 + block_alloc.o \
24041 + txnmgr.o \
24042 + kassign.o \
24043 + flush.o \
24044 + wander.o \
24045 + eottl.o \
24046 + search.o \
24047 + page_cache.o \
24048 + seal.o \
24049 + dscale.o \
24050 + flush_queue.o \
24051 + ktxnmgrd.o \
24052 + blocknrset.o \
24053 + super.o \
24054 + super_ops.o \
24055 + fsdata.o \
24056 + export_ops.o \
24057 + oid.o \
24058 + tree_walk.o \
24059 + inode.o \
24060 + vfs_ops.o \
24061 + as_ops.o \
24062 + entd.o\
24063 + readahead.o \
24064 + status_flags.o \
24065 + init_super.o \
24066 + safe_link.o \
24067 + \
24068 + plugin/plugin.o \
24069 + plugin/plugin_set.o \
24070 + plugin/node/node.o \
24071 + plugin/object.o \
24072 + plugin/cluster.o \
24073 + plugin/inode_ops.o \
24074 + plugin/inode_ops_rename.o \
24075 + plugin/file_ops.o \
24076 + plugin/file_ops_readdir.o \
24077 + plugin/file_plugin_common.o \
24078 + plugin/file/file.o \
24079 + plugin/file/tail_conversion.o \
24080 + plugin/file/file_conversion.o \
24081 + plugin/file/symlink.o \
24082 + plugin/file/cryptcompress.o \
24083 + plugin/dir_plugin_common.o \
24084 + plugin/dir/hashed_dir.o \
24085 + plugin/dir/seekable_dir.o \
24086 + plugin/node/node40.o \
24087 + \
24088 + plugin/crypto/cipher.o \
24089 + plugin/crypto/digest.o \
24090 + \
24091 + plugin/compress/minilzo.o \
24092 + plugin/compress/compress.o \
24093 + plugin/compress/compress_mode.o \
24094 + \
24095 + plugin/item/static_stat.o \
24096 + plugin/item/sde.o \
24097 + plugin/item/cde.o \
24098 + plugin/item/blackbox.o \
24099 + plugin/item/internal.o \
24100 + plugin/item/tail.o \
24101 + plugin/item/ctail.o \
24102 + plugin/item/extent.o \
24103 + plugin/item/extent_item_ops.o \
24104 + plugin/item/extent_file_ops.o \
24105 + plugin/item/extent_flush_ops.o \
24106 + \
24107 + plugin/hash.o \
24108 + plugin/fibration.o \
24109 + plugin/tail_policy.o \
24110 + plugin/item/item.o \
24111 + \
24112 + plugin/security/perm.o \
24113 + plugin/space/bitmap.o \
24114 + \
24115 + plugin/disk_format/disk_format40.o \
24116 + plugin/disk_format/disk_format.o
24117 +
24118 diff -urN linux-2.6.20.orig/fs/reiser4/oid.c linux-2.6.20/fs/reiser4/oid.c
24119 --- linux-2.6.20.orig/fs/reiser4/oid.c 1970-01-01 03:00:00.000000000 +0300
24120 +++ linux-2.6.20/fs/reiser4/oid.c 2007-05-06 14:50:43.742989473 +0400
24121 @@ -0,0 +1,141 @@
24122 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24123 +
24124 +#include "debug.h"
24125 +#include "super.h"
24126 +#include "txnmgr.h"
24127 +
24128 +/* we used to have oid allocation plugin. It was removed because it
24129 + was recognized as providing unneeded level of abstraction. If one
24130 + ever will find it useful - look at yet_unneeded_abstractions/oid
24131 +*/
24132 +
24133 +/*
24134 + * initialize in-memory data for oid allocator at @super. @nr_files and @next
24135 + * are provided by disk format plugin that reads them from the disk during
24136 + * mount.
24137 + */
24138 +int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24139 +{
24140 + reiser4_super_info_data *sbinfo;
24141 +
24142 + sbinfo = get_super_private(super);
24143 +
24144 + sbinfo->next_to_use = next;
24145 + sbinfo->oids_in_use = nr_files;
24146 + return 0;
24147 +}
24148 +
24149 +/*
24150 + * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24151 + * runs out of oids.
24152 + */
24153 +oid_t oid_allocate(struct super_block * super)
24154 +{
24155 + reiser4_super_info_data *sbinfo;
24156 + oid_t oid;
24157 +
24158 + sbinfo = get_super_private(super);
24159 +
24160 + spin_lock_reiser4_super(sbinfo);
24161 + if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24162 + oid = sbinfo->next_to_use++;
24163 + sbinfo->oids_in_use++;
24164 + } else
24165 + oid = ABSOLUTE_MAX_OID;
24166 + spin_unlock_reiser4_super(sbinfo);
24167 + return oid;
24168 +}
24169 +
24170 +/*
24171 + * Tell oid allocator that @oid is now free.
24172 + */
24173 +int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24174 +{
24175 + reiser4_super_info_data *sbinfo;
24176 +
24177 + sbinfo = get_super_private(super);
24178 +
24179 + spin_lock_reiser4_super(sbinfo);
24180 + sbinfo->oids_in_use--;
24181 + spin_unlock_reiser4_super(sbinfo);
24182 + return 0;
24183 +}
24184 +
24185 +/*
24186 + * return next @oid that would be allocated (i.e., returned by oid_allocate())
24187 + * without actually allocating it. This is used by disk format plugin to save
24188 + * oid allocator state on the disk.
24189 + */
24190 +oid_t oid_next(const struct super_block * super)
24191 +{
24192 + reiser4_super_info_data *sbinfo;
24193 + oid_t oid;
24194 +
24195 + sbinfo = get_super_private(super);
24196 +
24197 + spin_lock_reiser4_super(sbinfo);
24198 + oid = sbinfo->next_to_use;
24199 + spin_unlock_reiser4_super(sbinfo);
24200 + return oid;
24201 +}
24202 +
24203 +/*
24204 + * returns number of currently used oids. This is used by statfs(2) to report
24205 + * number of "inodes" and by disk format plugin to save oid allocator state on
24206 + * the disk.
24207 + */
24208 +long oids_used(const struct super_block *super)
24209 +{
24210 + reiser4_super_info_data *sbinfo;
24211 + oid_t used;
24212 +
24213 + sbinfo = get_super_private(super);
24214 +
24215 + spin_lock_reiser4_super(sbinfo);
24216 + used = sbinfo->oids_in_use;
24217 + spin_unlock_reiser4_super(sbinfo);
24218 + if (used < (__u64) ((long)~0) >> 1)
24219 + return (long)used;
24220 + else
24221 + return (long)-1;
24222 +}
24223 +
24224 +/*
24225 + * Count oid as allocated in atom. This is done after call to oid_allocate()
24226 + * at the point when we are irrevocably committed to creation of the new file
24227 + * (i.e., when oid allocation cannot be any longer rolled back due to some
24228 + * error).
24229 + */
24230 +void oid_count_allocated(void)
24231 +{
24232 + txn_atom *atom;
24233 +
24234 + atom = get_current_atom_locked();
24235 + atom->nr_objects_created++;
24236 + spin_unlock_atom(atom);
24237 +}
24238 +
24239 +/*
24240 + * Count oid as free in atom. This is done after call to oid_release() at the
24241 + * point when we are irrevocably committed to the deletion of the file (i.e.,
24242 + * when oid release cannot be any longer rolled back due to some error).
24243 + */
24244 +void oid_count_released(void)
24245 +{
24246 + txn_atom *atom;
24247 +
24248 + atom = get_current_atom_locked();
24249 + atom->nr_objects_deleted++;
24250 + spin_unlock_atom(atom);
24251 +}
24252 +
24253 +/*
24254 + Local variables:
24255 + c-indentation-style: "K&R"
24256 + mode-name: "LC"
24257 + c-basic-offset: 8
24258 + tab-width: 8
24259 + fill-column: 120
24260 + scroll-step: 1
24261 + End:
24262 +*/
24263 diff -urN linux-2.6.20.orig/fs/reiser4/page_cache.c linux-2.6.20/fs/reiser4/page_cache.c
24264 --- linux-2.6.20.orig/fs/reiser4/page_cache.c 1970-01-01 03:00:00.000000000 +0300
24265 +++ linux-2.6.20/fs/reiser4/page_cache.c 2007-05-06 14:50:43.742989473 +0400
24266 @@ -0,0 +1,736 @@
24267 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24268 + * reiser4/README */
24269 +
24270 +/* Memory pressure hooks. Fake inodes handling. */
24271 +
24272 +/* GLOSSARY
24273 +
24274 + . Formatted and unformatted nodes.
24275 + Elements of reiser4 balanced tree to store data and metadata.
24276 + Unformatted nodes are pointed to by extent pointers. Such nodes
24277 + are used to store data of large objects. Unlike unformatted nodes,
24278 + formatted ones have associated format described by node4X plugin.
24279 +
24280 + . Jnode (or journal node)
24281 + The in-memory header which is used to track formatted and unformatted
24282 + nodes, bitmap nodes, etc. In particular, jnodes are used to track
24283 + transactional information associated with each block(see reiser4/jnode.c
24284 + for details).
24285 +
24286 + . Znode
24287 + The in-memory header which is used to track formatted nodes. Contains
24288 + embedded jnode (see reiser4/znode.c for details).
24289 +*/
24290 +
24291 +/* We store all file system meta data (and data, of course) in the page cache.
24292 +
24293 + What does this mean? In stead of using bread/brelse we create special
24294 + "fake" inode (one per super block) and store content of formatted nodes
24295 + into pages bound to this inode in the page cache. In newer kernels bread()
24296 + already uses inode attached to block device (bd_inode). Advantage of having
24297 + our own fake inode is that we can install appropriate methods in its
24298 + address_space operations. Such methods are called by VM on memory pressure
24299 + (or during background page flushing) and we can use them to react
24300 + appropriately.
24301 +
24302 + In initial version we only support one block per page. Support for multiple
24303 + blocks per page is complicated by relocation.
24304 +
24305 + To each page, used by reiser4, jnode is attached. jnode is analogous to
24306 + buffer head. Difference is that jnode is bound to the page permanently:
24307 + jnode cannot be removed from memory until its backing page is.
24308 +
24309 + jnode contains a pointer to page (->pg field) and page contains a pointer
24310 + to jnode in ->private field. Pointer from jnode to page is protected by
24311 + jnode's spinlock and pointer from page to jnode is protected by page lock
24312 + (PG_locked bit). Lock ordering is: first take page lock, then jnode spin
24313 + lock. To go into reverse direction use jnode_lock_page() function that uses
24314 + standard try-lock-and-release device.
24315 +
24316 + Properties:
24317 +
24318 + 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24319 + reference counter is increased.
24320 +
24321 + 2. when jnode-to-page mapping is destroyed (by page_clear_jnode(), page
24322 + reference counter is decreased.
24323 +
24324 + 3. on jload() reference counter on jnode page is increased, page is
24325 + kmapped and `referenced'.
24326 +
24327 + 4. on jrelse() inverse operations are performed.
24328 +
24329 + 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24330 +
24331 + DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24332 + historically.]
24333 +
24334 + [In the following discussion, `lock' invariably means long term lock on
24335 + znode.] (What about page locks?)
24336 +
24337 + There is some special class of deadlock possibilities related to memory
24338 + pressure. Locks acquired by other reiser4 threads are accounted for in
24339 + deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24340 + invoked additional hidden arc is added to the locking graph: thread that
24341 + tries to allocate memory waits for ->vm_writeback() to finish. If this
24342 + thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
24343 + prevention is useless.
24344 +
24345 + Another related problem is possibility for ->vm_writeback() to run out of
24346 + memory itself. This is not a problem for ext2 and friends, because their
24347 + ->vm_writeback() don't allocate much memory, but reiser4 flush is
24348 + definitely able to allocate huge amounts of memory.
24349 +
24350 + It seems that there is no reliable way to cope with the problems above.
24351 + Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24352 + context) wouldn't perform any flushing itself, but rather should just wake
24353 + up some auxiliary thread dedicated for this purpose (or, the same thread
24354 + that does periodic commit of old atoms (ktxnmgrd.c)).
24355 +
24356 + Details:
24357 +
24358 + 1. Page is called `reclaimable' against particular reiser4 mount F if this
24359 + page can be ultimately released by try_to_free_pages() under presumptions
24360 + that:
24361 +
24362 + a. ->vm_writeback() for F is no-op, and
24363 +
24364 + b. none of the threads accessing F are making any progress, and
24365 +
24366 + c. other reiser4 mounts obey the same memory reservation protocol as F
24367 + (described below).
24368 +
24369 + For example, clean un-pinned page, or page occupied by ext2 data are
24370 + reclaimable against any reiser4 mount.
24371 +
24372 + When there is more than one reiser4 mount in a system, condition (c) makes
24373 + reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24374 +
24375 + THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24376 +
24377 + Fake inode is used to bound formatted nodes and each node is indexed within
24378 + fake inode by its block number. If block size is smaller than page size, it
24379 + may so happen that block mapped to the page with formatted node is occupied
24380 + by unformatted node or is unallocated. This leads to some complications,
24381 + because flushing whole page can lead to an incorrect overwrite of an
24382 + unformatted node that, moreover, can be cached in some other place as
24383 + part of the file body. To avoid this, buffers for unformatted nodes are
24384 + never marked dirty. Also pages in the fake inode are never marked dirty.
24385 + This rules out usage of ->writepage() as memory pressure hook. Instead
24386 + ->releasepage() is used.
24387 +
24388 + Josh is concerned that page->buffer is going to die. This should not pose
24389 + significant problem though, because we need to add some data structures to
24390 + the page anyway (jnode) and all necessary book keeping can be put there.
24391 +
24392 +*/
24393 +
24394 +/* Life cycle of pages/nodes.
24395 +
24396 + jnode contains reference to page and page contains reference back to
24397 + jnode. This reference is counted in page ->count. Thus, page bound to jnode
24398 + cannot be released back into free pool.
24399 +
24400 + 1. Formatted nodes.
24401 +
24402 + 1. formatted node is represented by znode. When new znode is created its
24403 + ->pg pointer is NULL initially.
24404 +
24405 + 2. when node content is loaded into znode (by call to zload()) for the
24406 + first time following happens (in call to ->read_node() or
24407 + ->allocate_node()):
24408 +
24409 + 1. new page is added to the page cache.
24410 +
24411 + 2. this page is attached to znode and its ->count is increased.
24412 +
24413 + 3. page is kmapped.
24414 +
24415 + 3. if more calls to zload() follow (without corresponding zrelses), page
24416 + counter is left intact and in its stead ->d_count is increased in znode.
24417 +
24418 + 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24419 + ->release_node() is called and page is kunmapped as result.
24420 +
24421 + 5. at some moment node can be captured by a transaction. Its ->x_count
24422 + is then increased by transaction manager.
24423 +
24424 + 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24425 + bit set) following will happen (also see comment at the top of znode.c):
24426 +
24427 + 1. when last lock is released, node will be uncaptured from
24428 + transaction. This releases the reference that transaction manager acquired
24429 + at the step 5.
24430 +
24431 + 2. when last reference is released, zput() detects that node is
24432 + actually deleted and calls ->delete_node()
24433 + operation. page_cache_delete_node() implementation detaches jnode from
24434 + page and releases page.
24435 +
24436 + 7. otherwise (node wasn't removed from the tree), last reference to
24437 + znode will be released after transaction manager committed transaction
24438 + node was in. This implies squallocing of this node (see
24439 + flush.c). Nothing special happens at this point. Znode is still in the
24440 + hash table and page is still attached to it.
24441 +
24442 + 8. znode is actually removed from the memory because of the memory
24443 + pressure, or during umount (znodes_tree_done()). Anyway, znode is
24444 + removed by the call to zdrop(). At this moment, page is detached from
24445 + znode and removed from the inode address space.
24446 +
24447 +*/
24448 +
24449 +#include "debug.h"
24450 +#include "dformat.h"
24451 +#include "key.h"
24452 +#include "txnmgr.h"
24453 +#include "jnode.h"
24454 +#include "znode.h"
24455 +#include "block_alloc.h"
24456 +#include "tree.h"
24457 +#include "vfs_ops.h"
24458 +#include "inode.h"
24459 +#include "super.h"
24460 +#include "entd.h"
24461 +#include "page_cache.h"
24462 +#include "ktxnmgrd.h"
24463 +
24464 +#include <linux/types.h>
24465 +#include <linux/fs.h>
24466 +#include <linux/mm.h> /* for struct page */
24467 +#include <linux/swap.h> /* for struct page */
24468 +#include <linux/pagemap.h>
24469 +#include <linux/bio.h>
24470 +#include <linux/writeback.h>
24471 +#include <linux/blkdev.h>
24472 +
24473 +static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24474 +
24475 +static struct address_space_operations formatted_fake_as_ops;
24476 +
24477 +static const oid_t fake_ino = 0x1;
24478 +static const oid_t bitmap_ino = 0x2;
24479 +static const oid_t cc_ino = 0x3;
24480 +
24481 +static void
24482 +init_fake_inode(struct super_block *super, struct inode *fake,
24483 + struct inode **pfake)
24484 +{
24485 + assert("nikita-2168", fake->i_state & I_NEW);
24486 + fake->i_mapping->a_ops = &formatted_fake_as_ops;
24487 + *pfake = fake;
24488 + /* NOTE-NIKITA something else? */
24489 + unlock_new_inode(fake);
24490 +}
24491 +
24492 +/**
24493 + * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24494 + * @super: super block to init fake inode for
24495 + *
24496 + * Initializes fake inode to which formatted nodes are bound in the page cache
24497 + * and inode for bitmaps.
24498 + */
24499 +int reiser4_init_formatted_fake(struct super_block *super)
24500 +{
24501 + struct inode *fake;
24502 + struct inode *bitmap;
24503 + struct inode *cc;
24504 + reiser4_super_info_data *sinfo;
24505 +
24506 + assert("nikita-1703", super != NULL);
24507 +
24508 + sinfo = get_super_private_nocheck(super);
24509 + fake = iget_locked(super, oid_to_ino(fake_ino));
24510 +
24511 + if (fake != NULL) {
24512 + init_fake_inode(super, fake, &sinfo->fake);
24513 +
24514 + bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24515 + if (bitmap != NULL) {
24516 + init_fake_inode(super, bitmap, &sinfo->bitmap);
24517 +
24518 + cc = iget_locked(super, oid_to_ino(cc_ino));
24519 + if (cc != NULL) {
24520 + init_fake_inode(super, cc, &sinfo->cc);
24521 + return 0;
24522 + } else {
24523 + iput(sinfo->fake);
24524 + iput(sinfo->bitmap);
24525 + sinfo->fake = NULL;
24526 + sinfo->bitmap = NULL;
24527 + }
24528 + } else {
24529 + iput(sinfo->fake);
24530 + sinfo->fake = NULL;
24531 + }
24532 + }
24533 + return RETERR(-ENOMEM);
24534 +}
24535 +
24536 +/**
24537 + * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24538 + * @super: super block to init fake inode for
24539 + *
24540 + * Releases inodes which were used as address spaces of bitmap and formatted
24541 + * nodes.
24542 + */
24543 +void reiser4_done_formatted_fake(struct super_block *super)
24544 +{
24545 + reiser4_super_info_data *sinfo;
24546 +
24547 + sinfo = get_super_private_nocheck(super);
24548 +
24549 + if (sinfo->fake != NULL) {
24550 + iput(sinfo->fake);
24551 + sinfo->fake = NULL;
24552 + }
24553 +
24554 + if (sinfo->bitmap != NULL) {
24555 + iput(sinfo->bitmap);
24556 + sinfo->bitmap = NULL;
24557 + }
24558 +
24559 + if (sinfo->cc != NULL) {
24560 + iput(sinfo->cc);
24561 + sinfo->cc = NULL;
24562 + }
24563 + return;
24564 +}
24565 +
24566 +void reiser4_wait_page_writeback(struct page *page)
24567 +{
24568 + assert("zam-783", PageLocked(page));
24569 +
24570 + do {
24571 + unlock_page(page);
24572 + wait_on_page_writeback(page);
24573 + lock_page(page);
24574 + } while (PageWriteback(page));
24575 +}
24576 +
24577 +/* return tree @page is in */
24578 +reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ )
24579 +{
24580 + assert("nikita-2461", page != NULL);
24581 + return &get_super_private(page->mapping->host->i_sb)->tree;
24582 +}
24583 +
24584 +/* completion handler for single page bio-based read.
24585 +
24586 + mpage_end_io_read() would also do. But it's static.
24587 +
24588 +*/
24589 +static int
24590 +end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24591 + int err UNUSED_ARG)
24592 +{
24593 + struct page *page;
24594 +
24595 + if (bio->bi_size != 0) {
24596 + warning("nikita-3332", "Truncated single page read: %i",
24597 + bio->bi_size);
24598 + return 1;
24599 + }
24600 +
24601 + page = bio->bi_io_vec[0].bv_page;
24602 +
24603 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24604 + SetPageUptodate(page);
24605 + } else {
24606 + ClearPageUptodate(page);
24607 + SetPageError(page);
24608 + }
24609 + unlock_page(page);
24610 + bio_put(bio);
24611 + return 0;
24612 +}
24613 +
24614 +/* completion handler for single page bio-based write.
24615 +
24616 + mpage_end_io_write() would also do. But it's static.
24617 +
24618 +*/
24619 +static int
24620 +end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24621 + int err UNUSED_ARG)
24622 +{
24623 + struct page *page;
24624 +
24625 + if (bio->bi_size != 0) {
24626 + warning("nikita-3333", "Truncated single page write: %i",
24627 + bio->bi_size);
24628 + return 1;
24629 + }
24630 +
24631 + page = bio->bi_io_vec[0].bv_page;
24632 +
24633 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
24634 + SetPageError(page);
24635 + end_page_writeback(page);
24636 + bio_put(bio);
24637 + return 0;
24638 +}
24639 +
24640 +/* ->readpage() method for formatted nodes */
24641 +static int formatted_readpage(struct file *f UNUSED_ARG,
24642 + struct page *page /* page to read */ )
24643 +{
24644 + assert("nikita-2412", PagePrivate(page) && jprivate(page));
24645 + return reiser4_page_io(page, jprivate(page), READ,
24646 + reiser4_ctx_gfp_mask_get());
24647 +}
24648 +
24649 +/**
24650 + * reiser4_page_io - submit single-page bio request
24651 + * @page: page to perform io for
24652 + * @node: jnode of page
24653 + * @rw: read or write
24654 + * @gfp: gfp mask for bio allocation
24655 + *
24656 + * Submits single page read or write.
24657 + */
24658 +int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24659 +{
24660 + struct bio *bio;
24661 + int result;
24662 +
24663 + assert("nikita-2094", page != NULL);
24664 + assert("nikita-2226", PageLocked(page));
24665 + assert("nikita-2634", node != NULL);
24666 + assert("nikita-2893", rw == READ || rw == WRITE);
24667 +
24668 + if (rw) {
24669 + if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
24670 + unlock_page(page);
24671 + return 0;
24672 + }
24673 + }
24674 +
24675 + bio = page_bio(page, node, rw, gfp);
24676 + if (!IS_ERR(bio)) {
24677 + if (rw == WRITE) {
24678 + SetPageWriteback(page);
24679 + unlock_page(page);
24680 + }
24681 + reiser4_submit_bio(rw, bio);
24682 + result = 0;
24683 + } else {
24684 + unlock_page(page);
24685 + result = PTR_ERR(bio);
24686 + }
24687 +
24688 + return result;
24689 +}
24690 +
24691 +/* helper function to construct bio for page */
24692 +static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24693 +{
24694 + struct bio *bio;
24695 + assert("nikita-2092", page != NULL);
24696 + assert("nikita-2633", node != NULL);
24697 +
24698 + /* Simple implementation in the assumption that blocksize == pagesize.
24699 +
24700 + We only have to submit one block, but submit_bh() will allocate bio
24701 + anyway, so lets use all the bells-and-whistles of bio code.
24702 + */
24703 +
24704 + bio = bio_alloc(gfp, 1);
24705 + if (bio != NULL) {
24706 + int blksz;
24707 + struct super_block *super;
24708 + reiser4_block_nr blocknr;
24709 +
24710 + super = page->mapping->host->i_sb;
24711 + assert("nikita-2029", super != NULL);
24712 + blksz = super->s_blocksize;
24713 + assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24714 +
24715 + spin_lock_jnode(node);
24716 + blocknr = *jnode_get_io_block(node);
24717 + spin_unlock_jnode(node);
24718 +
24719 + assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24720 + assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
24721 +
24722 + bio->bi_bdev = super->s_bdev;
24723 + /* fill bio->bi_sector before calling bio_add_page(), because
24724 + * q->merge_bvec_fn may want to inspect it (see
24725 + * drivers/md/linear.c:linear_mergeable_bvec() for example. */
24726 + bio->bi_sector = blocknr * (blksz >> 9);
24727 +
24728 + if (!bio_add_page(bio, page, blksz, 0)) {
24729 + warning("nikita-3452",
24730 + "Single page bio cannot be constructed");
24731 + return ERR_PTR(RETERR(-EINVAL));
24732 + }
24733 +
24734 + /* bio -> bi_idx is filled by bio_init() */
24735 + bio->bi_end_io = (rw == READ) ?
24736 + end_bio_single_page_read : end_bio_single_page_write;
24737 +
24738 + return bio;
24739 + } else
24740 + return ERR_PTR(RETERR(-ENOMEM));
24741 +}
24742 +
24743 +/* this function is internally called by jnode_make_dirty() */
24744 +int reiser4_set_page_dirty_internal(struct page *page)
24745 +{
24746 + struct address_space *mapping;
24747 +
24748 + mapping = page->mapping;
24749 + BUG_ON(mapping == NULL);
24750 +
24751 + if (!TestSetPageDirty(page)) {
24752 + if (mapping_cap_account_dirty(mapping))
24753 + inc_zone_page_state(page, NR_FILE_DIRTY);
24754 +
24755 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
24756 + }
24757 +
24758 + /* znode must be dirty ? */
24759 + if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb))
24760 + assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
24761 + return 0;
24762 +}
24763 +
24764 +#if REISER4_DEBUG
24765 +
24766 +/**
24767 + * can_hit_entd
24768 + *
24769 + * Debug-only sanity check used by an assertion in reiser4_writepage().
24770 + */
24771 +static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
24772 +{
24773 + if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
24774 + return 1;
24775 + if (ctx->super != s)
24776 + return 1;
24777 + if (get_super_private(s)->entd.tsk == current)
24778 + return 0;
24779 + if (!lock_stack_isclean(&ctx->stack))
24780 + return 0;
24781 + if (ctx->trans->atom != NULL)
24782 + return 0;
24783 + return 1;
24784 +}
24785 +
24786 +#endif
24787 +
24788 +/**
24789 + * reiser4_writepage - writepage of struct address_space_operations
24790 + * @page: page to write
24791 + * @wbc:
24792 + *
24793 + *
24794 + */
24795 +/* Common memory pressure notification. */
24796 +int reiser4_writepage(struct page *page,
24797 + struct writeback_control *wbc)
24798 +{
24799 + struct super_block *s;
24800 + reiser4_context *ctx;
24801 +
24802 + assert("vs-828", PageLocked(page));
24803 +
24804 + s = page->mapping->host->i_sb;
24805 + ctx = get_current_context_check();
24806 +
24807 + assert("", can_hit_entd(ctx, s));
24808 +
24809 + return write_page_by_ent(page, wbc);
24810 +}
24811 +
24812 +/* ->set_page_dirty() method of formatted address_space */
24813 +static int formatted_set_page_dirty(struct page *page)
24814 +{
24815 + assert("nikita-2173", page != NULL);
24816 + BUG();
24817 + return __set_page_dirty_nobuffers(page);
24818 +}
24819 +
24820 +/* writepages method of address space operations in reiser4 is used to involve
24821 + into transactions pages which are dirtied via mmap. Only regular files can
24822 + have such pages. Fake inode is used to access formatted nodes via page
24823 + cache. As formatted nodes can never be mmaped, fake inode's writepages has
24824 + nothing to do */
24825 +static int
24826 +writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
24827 +{
24828 + return 0;
24829 +}
24830 +
24831 +/* address space operations for the fake inode */
24832 +static struct address_space_operations formatted_fake_as_ops = {
24833 + /* Perform a writeback of a single page as a memory-freeing
24834 + * operation. */
24835 + .writepage = reiser4_writepage,
24836 + /* this is called to read formatted node */
24837 + .readpage = formatted_readpage,
24838 + /* ->sync_page() method of fake inode address space operations. Called
24839 + from wait_on_page() and lock_page().
24840 +
24841 + This is most annoyingly misnomered method. Actually it is called
24842 + from wait_on_page_bit() and lock_page() and its purpose is to
24843 + actually start io by jabbing device drivers.
24844 + */
24845 + .sync_page = block_sync_page,
24846 + /* Write back some dirty pages from this mapping. Called from sync.
24847 + called during sync (pdflush) */
24848 + .writepages = writepages_fake,
24849 + /* Set a page dirty */
24850 + .set_page_dirty = formatted_set_page_dirty,
24851 + /* used for read-ahead. Not applicable */
24852 + .readpages = NULL,
24853 + .prepare_write = NULL,
24854 + .commit_write = NULL,
24855 + .bmap = NULL,
24856 + /* called just before page is being detached from inode mapping and
24857 + removed from memory. Called on truncate, cut/squeeze, and
24858 + umount. */
24859 + .invalidatepage = reiser4_invalidatepage,
24860 + /* this is called by shrink_cache() so that file system can try to
24861 + release objects (jnodes, buffers, journal heads) attached to page
24862 + and, may be made page itself free-able.
24863 + */
24864 + .releasepage = reiser4_releasepage,
24865 + .direct_IO = NULL
24866 +};
24867 +
24868 +/* called just before page is released (no longer used by reiser4). Callers:
24869 + jdelete() and extent2tail(). */
24870 +void reiser4_drop_page(struct page *page)
24871 +{
24872 + assert("nikita-2181", PageLocked(page));
24873 + clear_page_dirty_for_io(page);
24874 + ClearPageUptodate(page);
24875 +#if defined(PG_skipped)
24876 + ClearPageSkipped(page);
24877 +#endif
24878 + unlock_page(page);
24879 +}
24880 +
24881 +#define JNODE_GANG_SIZE (16)
24882 +
24883 +/* find all jnodes from range specified and invalidate them */
24884 +static int
24885 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
24886 +{
24887 + reiser4_inode *info;
24888 + int truncated_jnodes;
24889 + reiser4_tree *tree;
24890 + unsigned long index;
24891 + unsigned long end;
24892 +
24893 + if (inode_file_plugin(inode) ==
24894 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
24895 + /* No need to get rid of jnodes here: if the single jnode of
24896 + page cluster did not have page, then it was found and killed
24897 + before in
24898 + truncate_page_cluster_cryptcompress()->jput()->jput_final(),
24899 + otherwise it will be dropped by reiser4_invalidatepage() */
24900 + return 0;
24901 + truncated_jnodes = 0;
24902 +
24903 + info = reiser4_inode_data(inode);
24904 + tree = reiser4_tree_by_inode(inode);
24905 +
24906 + index = from;
24907 + end = from + count;
24908 +
24909 + while (1) {
24910 + jnode *gang[JNODE_GANG_SIZE];
24911 + int taken;
24912 + int i;
24913 + jnode *node;
24914 +
24915 + assert("nikita-3466", index <= end);
24916 +
24917 + read_lock_tree(tree);
24918 + taken =
24919 + radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
24920 + (void **)gang, index,
24921 + JNODE_GANG_SIZE);
24922 + for (i = 0; i < taken; ++i) {
24923 + node = gang[i];
24924 + if (index_jnode(node) < end)
24925 + jref(node);
24926 + else
24927 + gang[i] = NULL;
24928 + }
24929 + read_unlock_tree(tree);
24930 +
24931 + for (i = 0; i < taken; ++i) {
24932 + node = gang[i];
24933 + if (node != NULL) {
24934 + index = max(index, index_jnode(node));
24935 + spin_lock_jnode(node);
24936 + assert("edward-1457", node->pg == NULL);
24937 + /* this is always called after
24938 + truncate_inode_pages_range(). Therefore, here
24939 + jnode can not have page. New pages can not be
24940 + created because truncate_jnodes_range goes
24941 + under exclusive access on file obtained,
24942 + where as new page creation requires
24943 + non-exclusive access obtained */
24944 + JF_SET(node, JNODE_HEARD_BANSHEE);
24945 + reiser4_uncapture_jnode(node);
24946 + unhash_unformatted_jnode(node);
24947 + truncated_jnodes++;
24948 + jput(node);
24949 + } else
24950 + break;
24951 + }
24952 + if (i != taken || taken == 0)
24953 + break;
24954 + }
24955 + return truncated_jnodes;
24956 +}
24957 +
24958 +/* Truncating files in reiser4: problems and solutions.
24959 +
24960 + VFS calls fs's truncate after it has called truncate_inode_pages()
24961 + to get rid of pages corresponding to part of file being truncated.
24962 + In reiser4 it may cause existence of unallocated extents which do
24963 + not have jnodes. Flush code does not expect that. Solution of this
24964 + problem is straightforward. As vfs's truncate is implemented using
24965 + setattr operation, it seems reasonable to have ->setattr() that
24966 + will cut file body. However, flush code also does not expect dirty
24967 + pages without parent items, so it is impossible to cut all items,
24968 + then truncate all pages in two steps. We resolve this problem by
24969 + cutting items one-by-one. Each such fine-grained step performed
24970 + under longterm znode lock calls at the end ->kill_hook() method of
24971 + a killed item to remove its bound pages and jnodes.
24972 +
24973 + The following function is a common part of mentioned kill hooks.
24974 + Also, this is called before tail-to-extent conversion (to not manage
24975 + few copies of the data).
24976 +*/
24977 +void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
24978 + unsigned long count, int even_cows)
24979 +{
24980 + loff_t from_bytes, count_bytes;
24981 +
24982 + if (count == 0)
24983 + return;
24984 + from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
24985 + count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
24986 +
24987 + unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
24988 + truncate_inode_pages_range(mapping, from_bytes,
24989 + from_bytes + count_bytes - 1);
24990 + truncate_jnodes_range(mapping->host, from, count);
24991 +}
24992 +
24993 +/*
24994 + * Local variables:
24995 + * c-indentation-style: "K&R"
24996 + * mode-name: "LC"
24997 + * c-basic-offset: 8
24998 + * tab-width: 8
24999 + * fill-column: 120
25000 + * scroll-step: 1
25001 + * End:
25002 + */
25003 diff -urN linux-2.6.20.orig/fs/reiser4/page_cache.h linux-2.6.20/fs/reiser4/page_cache.h
25004 --- linux-2.6.20.orig/fs/reiser4/page_cache.h 1970-01-01 03:00:00.000000000 +0300
25005 +++ linux-2.6.20/fs/reiser4/page_cache.h 2007-05-06 14:50:43.746990723 +0400
25006 @@ -0,0 +1,68 @@
25007 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25008 + * reiser4/README */
25009 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25010 +
25011 +#if !defined( __REISER4_PAGE_CACHE_H__ )
25012 +#define __REISER4_PAGE_CACHE_H__
25013 +
25014 +#include "forward.h"
25015 +#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
25016 +
25017 +#include <linux/fs.h> /* for struct super_block, address_space */
25018 +#include <linux/mm.h> /* for struct page */
25019 +#include <linux/pagemap.h> /* for lock_page() */
25020 +#include <linux/vmalloc.h> /* for __vmalloc() */
25021 +
25022 +extern int reiser4_init_formatted_fake(struct super_block *);
25023 +extern void reiser4_done_formatted_fake(struct super_block *);
25024 +
25025 +extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25026 +
25027 +extern int reiser4_set_page_dirty_internal(struct page *);
25028 +
25029 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25030 +
25031 +extern void reiser4_wait_page_writeback(struct page *);
25032 +static inline void lock_and_wait_page_writeback(struct page *page)
25033 +{
25034 + lock_page(page);
25035 + if (unlikely(PageWriteback(page)))
25036 + reiser4_wait_page_writeback(page);
25037 +}
25038 +
25039 +#define jprivate(page) ((jnode *)page_private(page))
25040 +
25041 +extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25042 +extern void reiser4_drop_page(struct page *);
25043 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25044 + unsigned long count, int even_cows);
25045 +extern void capture_reiser4_inodes(struct super_block *,
25046 + struct writeback_control *);
25047 +static inline void * reiser4_vmalloc (unsigned long size)
25048 +{
25049 + return __vmalloc(size,
25050 + reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25051 + PAGE_KERNEL);
25052 +}
25053 +
25054 +#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25055 +
25056 +#if REISER4_DEBUG
25057 +extern void print_page(const char *prefix, struct page *page);
25058 +#else
25059 +#define print_page(prf, p) noop
25060 +#endif
25061 +
25062 +/* __REISER4_PAGE_CACHE_H__ */
25063 +#endif
25064 +
25065 +/* Make Linus happy.
25066 + Local variables:
25067 + c-indentation-style: "K&R"
25068 + mode-name: "LC"
25069 + c-basic-offset: 8
25070 + tab-width: 8
25071 + fill-column: 120
25072 + scroll-step: 1
25073 + End:
25074 +*/
25075 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/cluster.c linux-2.6.20/fs/reiser4/plugin/cluster.c
25076 --- linux-2.6.20.orig/fs/reiser4/plugin/cluster.c 1970-01-01 03:00:00.000000000 +0300
25077 +++ linux-2.6.20/fs/reiser4/plugin/cluster.c 2007-05-06 14:50:43.746990723 +0400
25078 @@ -0,0 +1,71 @@
25079 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25080 +
25081 +/* Contains reiser4 cluster plugins (see
25082 + http://www.namesys.com/cryptcompress_design.html
25083 + "Concepts of clustering" for details). */
25084 +
25085 +#include "plugin_header.h"
25086 +#include "plugin.h"
25087 +#include "../inode.h"
25088 +
25089 +static int change_cluster(struct inode *inode,
25090 + reiser4_plugin * plugin,
25091 + pset_member memb)
25092 +{
25093 + assert("edward-1324", inode != NULL);
25094 + assert("edward-1325", plugin != NULL);
25095 + assert("edward-1326", is_reiser4_inode(inode));
25096 + assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25097 +
25098 + /* Can't change the cluster plugin for already existent regular files. */
25099 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25100 + return RETERR(-EINVAL);
25101 +
25102 + /* If matches, nothing to change. */
25103 + if (inode_hash_plugin(inode) != NULL &&
25104 + inode_hash_plugin(inode)->h.id == plugin->h.id)
25105 + return 0;
25106 +
25107 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25108 + PSET_CLUSTER, plugin);
25109 +}
25110 +
25111 +static reiser4_plugin_ops cluster_plugin_ops = {
25112 + .init = NULL,
25113 + .load = NULL,
25114 + .save_len = NULL,
25115 + .save = NULL,
25116 + .change = &change_cluster
25117 +};
25118 +
25119 +#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25120 + [CLUSTER_ ## ID ## _ID] = { \
25121 + .h = { \
25122 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25123 + .id = CLUSTER_ ## ID ## _ID, \
25124 + .pops = &cluster_plugin_ops, \
25125 + .label = LABEL, \
25126 + .desc = DESC, \
25127 + .linkage = {NULL, NULL} \
25128 + }, \
25129 + .shift = SHIFT \
25130 + }
25131 +
25132 +cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25133 + SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25134 + SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25135 + SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25136 + SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25137 + SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25138 +};
25139 +
25140 +/*
25141 + Local variables:
25142 + c-indentation-style: "K&R"
25143 + mode-name: "LC"
25144 + c-basic-offset: 8
25145 + tab-width: 8
25146 + fill-column: 120
25147 + scroll-step: 1
25148 + End:
25149 +*/
25150 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/cluster.h linux-2.6.20/fs/reiser4/plugin/cluster.h
25151 --- linux-2.6.20.orig/fs/reiser4/plugin/cluster.h 1970-01-01 03:00:00.000000000 +0300
25152 +++ linux-2.6.20/fs/reiser4/plugin/cluster.h 2007-05-06 14:50:43.746990723 +0400
25153 @@ -0,0 +1,343 @@
25154 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25155 +
25156 +/* This file contains page/cluster index translators and offset modulators
25157 + See http://www.namesys.com/cryptcompress_design.html for details */
25158 +
25159 +#if !defined( __FS_REISER4_CLUSTER_H__ )
25160 +#define __FS_REISER4_CLUSTER_H__
25161 +
25162 +#include "../inode.h"
25163 +
25164 +static inline int inode_cluster_shift(struct inode *inode)
25165 +{
25166 + assert("edward-92", inode != NULL);
25167 + assert("edward-93", reiser4_inode_data(inode) != NULL);
25168 +
25169 + return inode_cluster_plugin(inode)->shift;
25170 +}
25171 +
25172 +static inline unsigned cluster_nrpages_shift(struct inode *inode)
25173 +{
25174 + return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25175 +}
25176 +
25177 +/* cluster size in page units */
25178 +static inline unsigned cluster_nrpages(struct inode *inode)
25179 +{
25180 + return 1U << cluster_nrpages_shift(inode);
25181 +}
25182 +
25183 +static inline size_t inode_cluster_size(struct inode *inode)
25184 +{
25185 + assert("edward-96", inode != NULL);
25186 +
25187 + return 1U << inode_cluster_shift(inode);
25188 +}
25189 +
25190 +static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25191 +{
25192 + return idx >> cluster_nrpages_shift(inode);
25193 +}
25194 +
25195 +static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25196 +{
25197 + return idx << cluster_nrpages_shift(inode);
25198 +}
25199 +
25200 +static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25201 +{
25202 + return clust_to_pg(pg_to_clust(idx, inode), inode);
25203 +}
25204 +
25205 +static inline pgoff_t off_to_pg(loff_t off)
25206 +{
25207 + return (off >> PAGE_CACHE_SHIFT);
25208 +}
25209 +
25210 +static inline loff_t pg_to_off(pgoff_t idx)
25211 +{
25212 + return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25213 +}
25214 +
25215 +static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25216 +{
25217 + return off >> inode_cluster_shift(inode);
25218 +}
25219 +
25220 +static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25221 +{
25222 + return (loff_t) idx << inode_cluster_shift(inode);
25223 +}
25224 +
25225 +static inline unsigned long count_to_nr(loff_t count, unsigned shift)
25226 +{
25227 + return (count + (1UL << shift) - 1) >> shift;
25228 +}
25229 +
25230 +/* number of pages occupied by @count bytes */
25231 +static inline pgoff_t count_to_nrpages(loff_t count)
25232 +{
25233 + return count_to_nr(count, PAGE_CACHE_SHIFT);
25234 +}
25235 +
25236 +/* number of clusters occupied by @count bytes */
25237 +static inline cloff_t count_to_nrclust(loff_t count, struct inode *inode)
25238 +{
25239 + return count_to_nr(count, inode_cluster_shift(inode));
25240 +}
25241 +
25242 +/* number of clusters occupied by @count pages */
25243 +static inline cloff_t pgcount_to_nrclust(pgoff_t count, struct inode *inode)
25244 +{
25245 + return count_to_nr(count, cluster_nrpages_shift(inode));
25246 +}
25247 +
25248 +static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25249 +{
25250 + return clust_to_off(off_to_clust(off, inode), inode);
25251 +}
25252 +
25253 +static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25254 +{
25255 + return clust_to_pg(off_to_clust(off, inode), inode);
25256 +}
25257 +
25258 +static inline unsigned off_to_pgoff(loff_t off)
25259 +{
25260 + return off & (PAGE_CACHE_SIZE - 1);
25261 +}
25262 +
25263 +static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25264 +{
25265 + return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25266 +}
25267 +
25268 +static inline unsigned
25269 +pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25270 +{
25271 + return off_to_cloff(pg_to_off(idx), inode);
25272 +}
25273 +
25274 +/* if @size != 0, returns index of the page
25275 + which contains the last byte of the file */
25276 +static inline pgoff_t size_to_pg(loff_t size)
25277 +{
25278 + return (size ? off_to_pg(size - 1) : 0);
25279 +}
25280 +
25281 +/* minimal index of the page which doesn't contain
25282 + file data */
25283 +static inline pgoff_t size_to_next_pg(loff_t size)
25284 +{
25285 + return (size ? off_to_pg(size - 1) + 1 : 0);
25286 +}
25287 +
25288 +/* how many bytes of file of size @cnt can be contained
25289 + in page of index @idx */
25290 +static inline unsigned cnt_to_pgcnt(loff_t cnt, pgoff_t idx)
25291 +{
25292 + if (idx > off_to_pg(cnt))
25293 + return 0;
25294 + if (idx < off_to_pg(cnt))
25295 + return PAGE_CACHE_SIZE;
25296 + return off_to_pgoff(cnt);
25297 +}
25298 +
25299 +/* how many bytes of file of size @cnt can be contained
25300 + in logical cluster of index @idx */
25301 +static inline unsigned cnt_to_clcnt(loff_t cnt, cloff_t idx,
25302 + struct inode *inode)
25303 +{
25304 + if (idx > off_to_clust(cnt, inode))
25305 + return 0;
25306 + if (idx < off_to_clust(cnt, inode))
25307 + return inode_cluster_size(inode);
25308 + return off_to_cloff(cnt, inode);
25309 +}
25310 +
25311 +static inline unsigned
25312 +fsize_to_count(reiser4_cluster_t * clust, struct inode *inode)
25313 +{
25314 + assert("edward-288", clust != NULL);
25315 + assert("edward-289", inode != NULL);
25316 +
25317 + return cnt_to_clcnt(inode->i_size, clust->index, inode);
25318 +}
25319 +
25320 +static inline int
25321 +cluster_is_complete(reiser4_cluster_t * clust, struct inode * inode)
25322 +{
25323 + return clust->tc.lsize == inode_cluster_size(inode);
25324 +}
25325 +
25326 +static inline void reiser4_slide_init(reiser4_slide_t * win)
25327 +{
25328 + assert("edward-1084", win != NULL);
25329 + memset(win, 0, sizeof *win);
25330 +}
25331 +
25332 +static inline tfm_action
25333 +cluster_get_tfm_act(tfm_cluster_t * tc)
25334 +{
25335 + assert("edward-1356", tc != NULL);
25336 + return tc->act;
25337 +}
25338 +
25339 +static inline void
25340 +cluster_set_tfm_act(tfm_cluster_t * tc, tfm_action act)
25341 +{
25342 + assert("edward-1356", tc != NULL);
25343 + tc->act = act;
25344 +}
25345 +
25346 +static inline void
25347 +cluster_init_act (reiser4_cluster_t * clust, tfm_action act, reiser4_slide_t * window){
25348 + assert("edward-84", clust != NULL);
25349 + memset(clust, 0, sizeof *clust);
25350 + cluster_set_tfm_act(&clust->tc, act);
25351 + clust->dstat = INVAL_DISK_CLUSTER;
25352 + clust->win = window;
25353 +}
25354 +
25355 +static inline void
25356 +cluster_init_read(reiser4_cluster_t * clust, reiser4_slide_t * window)
25357 +{
25358 + cluster_init_act (clust, TFMA_READ, window);
25359 +}
25360 +
25361 +static inline void
25362 +cluster_init_write(reiser4_cluster_t * clust, reiser4_slide_t * window)
25363 +{
25364 + cluster_init_act (clust, TFMA_WRITE, window);
25365 +}
25366 +
25367 +static inline int dclust_get_extension_dsize(hint_t * hint)
25368 +{
25369 + return hint->ext_coord.extension.ctail.dsize;
25370 +}
25371 +
25372 +static inline void dclust_set_extension_dsize(hint_t * hint, int dsize)
25373 +{
25374 + hint->ext_coord.extension.ctail.dsize = dsize;
25375 +}
25376 +
25377 +static inline int dclust_get_extension_shift(hint_t * hint)
25378 +{
25379 + return hint->ext_coord.extension.ctail.shift;
25380 +}
25381 +
25382 +static inline int dclust_get_extension_ncount(hint_t * hint)
25383 +{
25384 + return hint->ext_coord.extension.ctail.ncount;
25385 +}
25386 +
25387 +static inline void dclust_inc_extension_ncount(hint_t * hint)
25388 +{
25389 + hint->ext_coord.extension.ctail.ncount ++;
25390 +}
25391 +
25392 +static inline void dclust_init_extension(hint_t * hint)
25393 +{
25394 + memset(&hint->ext_coord.extension.ctail, 0,
25395 + sizeof(hint->ext_coord.extension.ctail));
25396 +}
25397 +
25398 +static inline int hint_is_unprepped_dclust(hint_t * hint)
25399 +{
25400 + assert("edward-1451", hint_is_valid(hint));
25401 + return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25402 +}
25403 +
25404 +static inline void coord_set_between_clusters(coord_t * coord)
25405 +{
25406 +#if REISER4_DEBUG
25407 + int result;
25408 + result = zload(coord->node);
25409 + assert("edward-1296", !result);
25410 +#endif
25411 + if (!coord_is_between_items(coord)) {
25412 + coord->between = AFTER_ITEM;
25413 + coord->unit_pos = 0;
25414 + }
25415 +#if REISER4_DEBUG
25416 + zrelse(coord->node);
25417 +#endif
25418 +}
25419 +
25420 +int reiser4_inflate_cluster(reiser4_cluster_t *, struct inode *);
25421 +int find_disk_cluster(reiser4_cluster_t *, struct inode *, int read,
25422 + znode_lock_mode mode);
25423 +int flush_cluster_pages(reiser4_cluster_t *, jnode *, struct inode *);
25424 +int reiser4_deflate_cluster(reiser4_cluster_t *, struct inode *);
25425 +void truncate_page_cluster_cryptcompress(struct inode *inode, cloff_t start,
25426 + int even_cows);
25427 +void invalidate_hint_cluster(reiser4_cluster_t * clust);
25428 +void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
25429 + znode_lock_mode mode);
25430 +int get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
25431 + znode_lock_mode lock_mode);
25432 +void reset_cluster_params(reiser4_cluster_t * clust);
25433 +int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
25434 + int count);
25435 +int prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
25436 + int capture);
25437 +void reiser4_release_cluster_pages(reiser4_cluster_t *);
25438 +void put_cluster_handle(reiser4_cluster_t * clust);
25439 +int grab_tfm_stream(struct inode *inode, tfm_cluster_t * tc, tfm_stream_id id);
25440 +int tfm_cluster_is_uptodate(tfm_cluster_t * tc);
25441 +void tfm_cluster_set_uptodate(tfm_cluster_t * tc);
25442 +void tfm_cluster_clr_uptodate(tfm_cluster_t * tc);
25443 +
25444 +/* move cluster handle to the target position
25445 + specified by the page of index @pgidx
25446 +*/
25447 +static inline void move_cluster_forward(reiser4_cluster_t * clust,
25448 + struct inode *inode,
25449 + pgoff_t pgidx)
25450 +{
25451 + assert("edward-1297", clust != NULL);
25452 + assert("edward-1298", inode != NULL);
25453 +
25454 + reset_cluster_params(clust);
25455 + if (clust->index_valid &&
25456 + /* Hole in the indices. Hint became invalid and can not be
25457 + used by find_cluster_item() even if seal/node versions
25458 + will coincide */
25459 + pg_to_clust(pgidx, inode) != clust->index + 1) {
25460 + reiser4_unset_hint(clust->hint);
25461 + invalidate_hint_cluster(clust);
25462 + }
25463 + clust->index = pg_to_clust(pgidx, inode);
25464 + clust->index_valid = 1;
25465 +}
25466 +
25467 +static inline int
25468 +alloc_clust_pages(reiser4_cluster_t * clust, struct inode *inode)
25469 +{
25470 + assert("edward-791", clust != NULL);
25471 + assert("edward-792", inode != NULL);
25472 + clust->pages =
25473 + kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25474 + reiser4_ctx_gfp_mask_get());
25475 + if (!clust->pages)
25476 + return -ENOMEM;
25477 + return 0;
25478 +}
25479 +
25480 +static inline void free_clust_pages(reiser4_cluster_t * clust)
25481 +{
25482 + kfree(clust->pages);
25483 +}
25484 +
25485 +#endif /* __FS_REISER4_CLUSTER_H__ */
25486 +
25487 +/* Make Linus happy.
25488 + Local variables:
25489 + c-indentation-style: "K&R"
25490 + mode-name: "LC"
25491 + c-basic-offset: 8
25492 + tab-width: 8
25493 + fill-column: 120
25494 + scroll-step: 1
25495 + End:
25496 +*/
25497 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.20/fs/reiser4/plugin/compress/compress.c
25498 --- linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 03:00:00.000000000 +0300
25499 +++ linux-2.6.20/fs/reiser4/plugin/compress/compress.c 2007-05-06 14:50:43.746990723 +0400
25500 @@ -0,0 +1,381 @@
25501 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25502 +/* reiser4 compression transform plugins */
25503 +
25504 +#include "../../debug.h"
25505 +#include "../../inode.h"
25506 +#include "../plugin.h"
25507 +#include "minilzo.h"
25508 +
25509 +#include <linux/zlib.h>
25510 +#include <linux/types.h>
25511 +#include <linux/hardirq.h>
25512 +
25513 +static int change_compression(struct inode *inode,
25514 + reiser4_plugin * plugin,
25515 + pset_member memb)
25516 +{
25517 + assert("edward-1316", inode != NULL);
25518 + assert("edward-1317", plugin != NULL);
25519 + assert("edward-1318", is_reiser4_inode(inode));
25520 + assert("edward-1319",
25521 + plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25522 +
25523 + /* cannot change compression plugin of already existing regular object */
25524 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25525 + return RETERR(-EINVAL);
25526 +
25527 + /* If matches, nothing to change. */
25528 + if (inode_hash_plugin(inode) != NULL &&
25529 + inode_hash_plugin(inode)->h.id == plugin->h.id)
25530 + return 0;
25531 +
25532 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25533 + PSET_COMPRESSION, plugin);
25534 +}
25535 +
25536 +static reiser4_plugin_ops compression_plugin_ops = {
25537 + .init = NULL,
25538 + .load = NULL,
25539 + .save_len = NULL,
25540 + .save = NULL,
25541 + .change = &change_compression
25542 +};
25543 +
25544 +/******************************************************************************/
25545 +/* gzip1 compression */
25546 +/******************************************************************************/
25547 +
25548 +#define GZIP1_DEF_LEVEL Z_BEST_SPEED
25549 +#define GZIP1_DEF_WINBITS 15
25550 +#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
25551 +
25552 +static int gzip1_init(void)
25553 +{
25554 + int ret = -EINVAL;
25555 +#if REISER4_ZLIB
25556 + ret = 0;
25557 +#endif
25558 + if (ret == -EINVAL)
25559 + warning("edward-1337", "Zlib not compiled into kernel");
25560 + return ret;
25561 +}
25562 +
25563 +static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25564 +{
25565 + return 0;
25566 +}
25567 +
25568 +static coa_t gzip1_alloc(tfm_action act)
25569 +{
25570 + coa_t coa = NULL;
25571 +#if REISER4_ZLIB
25572 + int ret = 0;
25573 + switch (act) {
25574 + case TFMA_WRITE: /* compress */
25575 + coa = reiser4_vmalloc(zlib_deflate_workspacesize());
25576 + if (!coa) {
25577 + ret = -ENOMEM;
25578 + break;
25579 + }
25580 + memset(coa, 0, zlib_deflate_workspacesize());
25581 + break;
25582 + case TFMA_READ: /* decompress */
25583 + coa = reiser4_vmalloc(zlib_inflate_workspacesize());
25584 + if (!coa) {
25585 + ret = -ENOMEM;
25586 + break;
25587 + }
25588 + memset(coa, 0, zlib_inflate_workspacesize());
25589 + break;
25590 + default:
25591 + impossible("edward-767",
25592 + "trying to alloc workspace for unknown tfm action");
25593 + }
25594 + if (ret) {
25595 + warning("edward-768",
25596 + "alloc workspace for gzip1 (tfm action = %d) failed\n",
25597 + act);
25598 + return ERR_PTR(ret);
25599 + }
25600 +#endif
25601 + return coa;
25602 +}
25603 +
25604 +static void gzip1_free(coa_t coa, tfm_action act)
25605 +{
25606 + assert("edward-769", coa != NULL);
25607 +
25608 + switch (act) {
25609 + case TFMA_WRITE: /* compress */
25610 + vfree(coa);
25611 + break;
25612 + case TFMA_READ: /* decompress */
25613 + vfree(coa);
25614 + break;
25615 + default:
25616 + impossible("edward-770", "unknown tfm action");
25617 + }
25618 + return;
25619 +}
25620 +
25621 +static int gzip1_min_size_deflate(void)
25622 +{
25623 + return 64;
25624 +}
25625 +
25626 +static void
25627 +gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25628 + __u8 * dst_first, unsigned *dst_len)
25629 +{
25630 +#if REISER4_ZLIB
25631 + int ret = 0;
25632 + struct z_stream_s stream;
25633 +
25634 + memset(&stream, 0, sizeof(stream));
25635 +
25636 + assert("edward-842", coa != NULL);
25637 + assert("edward-875", src_len != 0);
25638 +
25639 + stream.workspace = coa;
25640 + ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25641 + -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25642 + Z_DEFAULT_STRATEGY);
25643 + if (ret != Z_OK) {
25644 + warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25645 + goto rollback;
25646 + }
25647 + ret = zlib_deflateReset(&stream);
25648 + if (ret != Z_OK) {
25649 + warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25650 + goto rollback;
25651 + }
25652 + stream.next_in = src_first;
25653 + stream.avail_in = src_len;
25654 + stream.next_out = dst_first;
25655 + stream.avail_out = *dst_len;
25656 +
25657 + ret = zlib_deflate(&stream, Z_FINISH);
25658 + if (ret != Z_STREAM_END) {
25659 + if (ret != Z_OK)
25660 + warning("edward-773",
25661 + "zlib_deflate returned %d\n", ret);
25662 + goto rollback;
25663 + }
25664 + *dst_len = stream.total_out;
25665 + return;
25666 + rollback:
25667 + *dst_len = src_len;
25668 +#endif
25669 + return;
25670 +}
25671 +
25672 +static void
25673 +gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25674 + __u8 * dst_first, unsigned *dst_len)
25675 +{
25676 +#if REISER4_ZLIB
25677 + int ret = 0;
25678 + struct z_stream_s stream;
25679 +
25680 + memset(&stream, 0, sizeof(stream));
25681 +
25682 + assert("edward-843", coa != NULL);
25683 + assert("edward-876", src_len != 0);
25684 +
25685 + stream.workspace = coa;
25686 + ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25687 + if (ret != Z_OK) {
25688 + warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25689 + return;
25690 + }
25691 + ret = zlib_inflateReset(&stream);
25692 + if (ret != Z_OK) {
25693 + warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25694 + return;
25695 + }
25696 +
25697 + stream.next_in = src_first;
25698 + stream.avail_in = src_len;
25699 + stream.next_out = dst_first;
25700 + stream.avail_out = *dst_len;
25701 +
25702 + ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25703 + /*
25704 + * Work around a bug in zlib, which sometimes wants to taste an extra
25705 + * byte when being used in the (undocumented) raw deflate mode.
25706 + * (From USAGI).
25707 + */
25708 + if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25709 + u8 zerostuff = 0;
25710 + stream.next_in = &zerostuff;
25711 + stream.avail_in = 1;
25712 + ret = zlib_inflate(&stream, Z_FINISH);
25713 + }
25714 + if (ret != Z_STREAM_END) {
25715 + warning("edward-776", "zlib_inflate returned %d\n", ret);
25716 + return;
25717 + }
25718 + *dst_len = stream.total_out;
25719 +#endif
25720 + return;
25721 +}
25722 +
25723 +/******************************************************************************/
25724 +/* lzo1 compression */
25725 +/******************************************************************************/
25726 +
25727 +static int lzo1_init(void)
25728 +{
25729 + int ret;
25730 + ret = lzo_init();
25731 + if (ret != LZO_E_OK)
25732 + warning("edward-848", "lzo_init() failed with ret = %d\n", ret);
25733 + return ret;
25734 +}
25735 +
25736 +static int lzo1_overrun(unsigned in_len)
25737 +{
25738 + return in_len / 64 + 16 + 3;
25739 +}
25740 +
25741 +#define LZO_HEAP_SIZE(size) \
25742 + sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t))
25743 +
25744 +static coa_t lzo1_alloc(tfm_action act)
25745 +{
25746 + int ret = 0;
25747 + coa_t coa = NULL;
25748 +
25749 + switch (act) {
25750 + case TFMA_WRITE: /* compress */
25751 + coa = reiser4_vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
25752 + if (!coa) {
25753 + ret = -ENOMEM;
25754 + break;
25755 + }
25756 + memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
25757 + case TFMA_READ: /* decompress */
25758 + break;
25759 + default:
25760 + impossible("edward-877",
25761 + "trying to alloc workspace for unknown tfm action");
25762 + }
25763 + if (ret) {
25764 + warning("edward-878",
25765 + "alloc workspace for lzo1 (tfm action = %d) failed\n",
25766 + act);
25767 + return ERR_PTR(ret);
25768 + }
25769 + return coa;
25770 +}
25771 +
25772 +static void lzo1_free(coa_t coa, tfm_action act)
25773 +{
25774 + assert("edward-879", coa != NULL);
25775 +
25776 + switch (act) {
25777 + case TFMA_WRITE: /* compress */
25778 + vfree(coa);
25779 + break;
25780 + case TFMA_READ: /* decompress */
25781 + impossible("edward-1304",
25782 + "trying to free non-allocated workspace");
25783 + default:
25784 + impossible("edward-880", "unknown tfm action");
25785 + }
25786 + return;
25787 +}
25788 +
25789 +static int lzo1_min_size_deflate(void)
25790 +{
25791 + return 256;
25792 +}
25793 +
25794 +static void
25795 +lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25796 + __u8 * dst_first, unsigned *dst_len)
25797 +{
25798 + int result;
25799 +
25800 + assert("edward-846", coa != NULL);
25801 + assert("edward-847", src_len != 0);
25802 +
25803 + result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
25804 + if (result != LZO_E_OK) {
25805 + warning("edward-849", "lzo1x_1_compress failed\n");
25806 + goto out;
25807 + }
25808 + if (*dst_len >= src_len) {
25809 + //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
25810 + goto out;
25811 + }
25812 + return;
25813 + out:
25814 + *dst_len = src_len;
25815 + return;
25816 +}
25817 +
25818 +static void
25819 +lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25820 + __u8 * dst_first, unsigned *dst_len)
25821 +{
25822 + int result;
25823 +
25824 + assert("edward-851", coa == NULL);
25825 + assert("edward-852", src_len != 0);
25826 +
25827 + result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL);
25828 + if (result != LZO_E_OK)
25829 + warning("edward-853", "lzo1x_1_decompress failed\n");
25830 + return;
25831 +}
25832 +
25833 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
25834 + [LZO1_COMPRESSION_ID] = {
25835 + .h = {
25836 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25837 + .id = LZO1_COMPRESSION_ID,
25838 + .pops = &compression_plugin_ops,
25839 + .label = "lzo1",
25840 + .desc = "lzo1 compression transform",
25841 + .linkage = {NULL, NULL}
25842 + },
25843 + .init = lzo1_init,
25844 + .overrun = lzo1_overrun,
25845 + .alloc = lzo1_alloc,
25846 + .free = lzo1_free,
25847 + .min_size_deflate = lzo1_min_size_deflate,
25848 + .checksum = reiser4_adler32,
25849 + .compress = lzo1_compress,
25850 + .decompress = lzo1_decompress
25851 + },
25852 + [GZIP1_COMPRESSION_ID] = {
25853 + .h = {
25854 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25855 + .id = GZIP1_COMPRESSION_ID,
25856 + .pops = &compression_plugin_ops,
25857 + .label = "gzip1",
25858 + .desc = "gzip1 compression transform",
25859 + .linkage = {NULL, NULL}
25860 + },
25861 + .init = gzip1_init,
25862 + .overrun = gzip1_overrun,
25863 + .alloc = gzip1_alloc,
25864 + .free = gzip1_free,
25865 + .min_size_deflate = gzip1_min_size_deflate,
25866 + .checksum = reiser4_adler32,
25867 + .compress = gzip1_compress,
25868 + .decompress = gzip1_decompress
25869 + }
25870 +};
25871 +
25872 +/*
25873 + Local variables:
25874 + c-indentation-style: "K&R"
25875 + mode-name: "LC"
25876 + c-basic-offset: 8
25877 + tab-width: 8
25878 + fill-column: 120
25879 + scroll-step: 1
25880 + End:
25881 +*/
25882 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.20/fs/reiser4/plugin/compress/compress.h
25883 --- linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 03:00:00.000000000 +0300
25884 +++ linux-2.6.20/fs/reiser4/plugin/compress/compress.h 2007-05-06 14:50:43.746990723 +0400
25885 @@ -0,0 +1,38 @@
25886 +#if !defined( __FS_REISER4_COMPRESS_H__ )
25887 +#define __FS_REISER4_COMPRESS_H__
25888 +
25889 +#include <linux/types.h>
25890 +#include <linux/string.h>
25891 +
25892 +typedef enum {
25893 + TFMA_READ,
25894 + TFMA_WRITE,
25895 + TFMA_LAST
25896 +} tfm_action;
25897 +
25898 +/* builtin compression plugins */
25899 +
25900 +typedef enum {
25901 + LZO1_COMPRESSION_ID,
25902 + GZIP1_COMPRESSION_ID,
25903 + LAST_COMPRESSION_ID,
25904 +} reiser4_compression_id;
25905 +
25906 +typedef unsigned long cloff_t;
25907 +typedef void *coa_t;
25908 +typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
25909 +
25910 +__u32 reiser4_adler32(char *data, __u32 len);
25911 +
25912 +#endif /* __FS_REISER4_COMPRESS_H__ */
25913 +
25914 +/* Make Linus happy.
25915 + Local variables:
25916 + c-indentation-style: "K&R"
25917 + mode-name: "LC"
25918 + c-basic-offset: 8
25919 + tab-width: 8
25920 + fill-column: 120
25921 + scroll-step: 1
25922 + End:
25923 +*/
25924 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.20/fs/reiser4/plugin/compress/compress_mode.c
25925 --- linux-2.6.20.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 03:00:00.000000000 +0300
25926 +++ linux-2.6.20/fs/reiser4/plugin/compress/compress_mode.c 2007-05-06 14:50:43.750991972 +0400
25927 @@ -0,0 +1,162 @@
25928 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25929 +/* This file contains Reiser4 compression mode plugins.
25930 +
25931 + Compression mode plugin is a set of handlers called by compressor
25932 + at flush time and represent some heuristics including the ones
25933 + which are to avoid compression of incompressible data, see
25934 + http://www.namesys.com/cryptcompress_design.html for more details.
25935 +*/
25936 +#include "../../inode.h"
25937 +#include "../plugin.h"
25938 +
25939 +static int should_deflate_none(struct inode * inode, cloff_t index)
25940 +{
25941 + return 0;
25942 +}
25943 +
25944 +static int should_deflate_common(struct inode * inode, cloff_t index)
25945 +{
25946 + return compression_is_on(cryptcompress_inode_data(inode));
25947 +}
25948 +
25949 +static int discard_hook_ultim(struct inode *inode, cloff_t index)
25950 +{
25951 + turn_off_compression(cryptcompress_inode_data(inode));
25952 + return 0;
25953 +}
25954 +
25955 +static int discard_hook_lattd(struct inode *inode, cloff_t index)
25956 +{
25957 + cryptcompress_info_t * info = cryptcompress_inode_data(inode);
25958 +
25959 + assert("edward-1462",
25960 + get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
25961 + get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
25962 +
25963 + turn_off_compression(info);
25964 + if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
25965 + set_lattice_factor(info, get_lattice_factor(info) << 1);
25966 + return 0;
25967 +}
25968 +
25969 +static int accept_hook_lattd(struct inode *inode, cloff_t index)
25970 +{
25971 + turn_on_compression(cryptcompress_inode_data(inode));
25972 + set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
25973 + return 0;
25974 +}
25975 +
25976 +/* Check on dynamic lattice, the adaptive compression modes which
25977 + defines the following behavior:
25978 +
25979 + Compression is on: try to compress everything and turn
25980 + it off, whenever cluster is incompressible.
25981 +
25982 + Compression is off: try to compress clusters of indexes
25983 + k * FACTOR (k = 0, 1, 2, ...) and turn it on, if some of
25984 + them is compressible. If incompressible, then increase FACTOR */
25985 +
25986 +/* check if @index belongs to one-dimensional lattice
25987 + of sparce factor @factor */
25988 +static int is_on_lattice(cloff_t index, int factor)
25989 +{
25990 + return (factor ? index % factor == 0: index == 0);
25991 +}
25992 +
25993 +static int should_deflate_lattd(struct inode * inode, cloff_t index)
25994 +{
25995 + return should_deflate_common(inode, index) ||
25996 + is_on_lattice(index,
25997 + get_lattice_factor
25998 + (cryptcompress_inode_data(inode)));
25999 +}
26000 +
26001 +/* compression mode_plugins */
26002 +compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26003 + [NONE_COMPRESSION_MODE_ID] = {
26004 + .h = {
26005 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26006 + .id = NONE_COMPRESSION_MODE_ID,
26007 + .pops = NULL,
26008 + .label = "none",
26009 + .desc = "Compress nothing",
26010 + .linkage = {NULL, NULL}
26011 + },
26012 + .should_deflate = should_deflate_none,
26013 + .accept_hook = NULL,
26014 + .discard_hook = NULL
26015 + },
26016 + /* Check-on-dynamic-lattice adaptive compression mode */
26017 + [LATTD_COMPRESSION_MODE_ID] = {
26018 + .h = {
26019 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26020 + .id = LATTD_COMPRESSION_MODE_ID,
26021 + .pops = NULL,
26022 + .label = "lattd",
26023 + .desc = "Check on dynamic lattice",
26024 + .linkage = {NULL, NULL}
26025 + },
26026 + .should_deflate = should_deflate_lattd,
26027 + .accept_hook = accept_hook_lattd,
26028 + .discard_hook = discard_hook_lattd
26029 + },
26030 + /* Check-ultimately compression mode:
26031 + Turn off compression forever as soon as we meet
26032 + incompressible data */
26033 + [ULTIM_COMPRESSION_MODE_ID] = {
26034 + .h = {
26035 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26036 + .id = ULTIM_COMPRESSION_MODE_ID,
26037 + .pops = NULL,
26038 + .label = "ultim",
26039 + .desc = "Check ultimately",
26040 + .linkage = {NULL, NULL}
26041 + },
26042 + .should_deflate = should_deflate_common,
26043 + .accept_hook = NULL,
26044 + .discard_hook = discard_hook_ultim
26045 + },
26046 + /* Force-to-compress-everything compression mode */
26047 + [FORCE_COMPRESSION_MODE_ID] = {
26048 + .h = {
26049 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26050 + .id = FORCE_COMPRESSION_MODE_ID,
26051 + .pops = NULL,
26052 + .label = "force",
26053 + .desc = "Force to compress everything",
26054 + .linkage = {NULL, NULL}
26055 + },
26056 + .should_deflate = NULL,
26057 + .accept_hook = NULL,
26058 + .discard_hook = NULL
26059 + },
26060 + /* Convert-to-extent compression mode.
26061 + In this mode items will be converted to extents and management
26062 + will be passed to (classic) unix file plugin as soon as ->write()
26063 + detects that the first complete logical cluster (of index #0) is
26064 + incompressible. */
26065 + [CONVX_COMPRESSION_MODE_ID] = {
26066 + .h = {
26067 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26068 + .id = CONVX_COMPRESSION_MODE_ID,
26069 + .pops = NULL,
26070 + .label = "conv",
26071 + .desc = "Convert to extent",
26072 + .linkage = {NULL, NULL}
26073 + },
26074 + .should_deflate = should_deflate_common,
26075 + .accept_hook = NULL,
26076 + .discard_hook = NULL
26077 + }
26078 +};
26079 +
26080 +/*
26081 + Local variables:
26082 + c-indentation-style: "K&R"
26083 + mode-name: "LC"
26084 + c-basic-offset: 8
26085 + tab-width: 8
26086 + fill-column: 120
26087 + scroll-step: 1
26088 + End:
26089 +*/
26090 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/lzoconf.h linux-2.6.20/fs/reiser4/plugin/compress/lzoconf.h
26091 --- linux-2.6.20.orig/fs/reiser4/plugin/compress/lzoconf.h 1970-01-01 03:00:00.000000000 +0300
26092 +++ linux-2.6.20/fs/reiser4/plugin/compress/lzoconf.h 2007-05-06 14:50:43.750991972 +0400
26093 @@ -0,0 +1,216 @@
26094 +/* lzoconf.h -- configuration for the LZO real-time data compression library
26095 + adopted for reiser4 compression transform plugin.
26096 +
26097 + This file is part of the LZO real-time data compression library
26098 + and not included in any proprietary licenses of reiser4.
26099 +
26100 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26101 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26102 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26103 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26104 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26105 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26106 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26107 + All Rights Reserved.
26108 +
26109 + The LZO library is free software; you can redistribute it and/or
26110 + modify it under the terms of the GNU General Public License as
26111 + published by the Free Software Foundation; either version 2 of
26112 + the License, or (at your option) any later version.
26113 +
26114 + The LZO library is distributed in the hope that it will be useful,
26115 + but WITHOUT ANY WARRANTY; without even the implied warranty of
26116 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26117 + GNU General Public License for more details.
26118 +
26119 + You should have received a copy of the GNU General Public License
26120 + along with the LZO library; see the file COPYING.
26121 + If not, write to the Free Software Foundation, Inc.,
26122 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26123 +
26124 + Markus F.X.J. Oberhumer
26125 + <markus@oberhumer.com>
26126 + http://www.oberhumer.com/opensource/lzo/
26127 + */
26128 +
26129 +#include <linux/kernel.h> /* for UINT_MAX, ULONG_MAX - edward */
26130 +
26131 +#ifndef __LZOCONF_H
26132 +#define __LZOCONF_H
26133 +
26134 +#define LZO_VERSION 0x1080
26135 +#define LZO_VERSION_STRING "1.08"
26136 +#define LZO_VERSION_DATE "Jul 12 2002"
26137 +
26138 +/* internal Autoconf configuration file - only used when building LZO */
26139 +
26140 +/***********************************************************************
26141 +// LZO requires a conforming <limits.h>
26142 +************************************************************************/
26143 +
26144 +#define CHAR_BIT 8
26145 +#define USHRT_MAX 0xffff
26146 +
26147 +/* workaround a cpp bug under hpux 10.20 */
26148 +#define LZO_0xffffffffL 4294967295ul
26149 +
26150 +/***********************************************************************
26151 +// architecture defines
26152 +************************************************************************/
26153 +
26154 +#if !defined(__LZO_i386)
26155 +# if defined(__i386__) || defined(__386__) || defined(_M_IX86)
26156 +# define __LZO_i386
26157 +# endif
26158 +#endif
26159 +
26160 +/* memory checkers */
26161 +#if !defined(__LZO_CHECKER)
26162 +# if defined(__BOUNDS_CHECKING_ON)
26163 +# define __LZO_CHECKER
26164 +# elif defined(__CHECKER__)
26165 +# define __LZO_CHECKER
26166 +# elif defined(__INSURE__)
26167 +# define __LZO_CHECKER
26168 +# elif defined(__PURIFY__)
26169 +# define __LZO_CHECKER
26170 +# endif
26171 +#endif
26172 +
26173 +/***********************************************************************
26174 +// integral and pointer types
26175 +************************************************************************/
26176 +
26177 +/* Integral types with 32 bits or more */
26178 +#if !defined(LZO_UINT32_MAX)
26179 +# if (UINT_MAX >= LZO_0xffffffffL)
26180 + typedef unsigned int lzo_uint32;
26181 + typedef int lzo_int32;
26182 +# define LZO_UINT32_MAX UINT_MAX
26183 +# define LZO_INT32_MAX INT_MAX
26184 +# define LZO_INT32_MIN INT_MIN
26185 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26186 + typedef unsigned long lzo_uint32;
26187 + typedef long lzo_int32;
26188 +# define LZO_UINT32_MAX ULONG_MAX
26189 +# define LZO_INT32_MAX LONG_MAX
26190 +# define LZO_INT32_MIN LONG_MIN
26191 +# else
26192 +# error "lzo_uint32"
26193 +# endif
26194 +#endif
26195 +
26196 +/* lzo_uint is used like size_t */
26197 +#if !defined(LZO_UINT_MAX)
26198 +# if (UINT_MAX >= LZO_0xffffffffL)
26199 + typedef unsigned int lzo_uint;
26200 + typedef int lzo_int;
26201 +# define LZO_UINT_MAX UINT_MAX
26202 +# define LZO_INT_MAX INT_MAX
26203 +# define LZO_INT_MIN INT_MIN
26204 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26205 + typedef unsigned long lzo_uint;
26206 + typedef long lzo_int;
26207 +# define LZO_UINT_MAX ULONG_MAX
26208 +# define LZO_INT_MAX LONG_MAX
26209 +# define LZO_INT_MIN LONG_MIN
26210 +# else
26211 +# error "lzo_uint"
26212 +# endif
26213 +#endif
26214 +
26215 + typedef int lzo_bool;
26216 +
26217 +/***********************************************************************
26218 +// memory models
26219 +************************************************************************/
26220 +
26221 +/* Memory model that allows to access memory at offsets of lzo_uint. */
26222 +#if !defined(__LZO_MMODEL)
26223 +# if (LZO_UINT_MAX <= UINT_MAX)
26224 +# define __LZO_MMODEL
26225 +# else
26226 +# error "__LZO_MMODEL"
26227 +# endif
26228 +#endif
26229 +
26230 +/* no typedef here because of const-pointer issues */
26231 +#define lzo_byte unsigned char __LZO_MMODEL
26232 +#define lzo_bytep unsigned char __LZO_MMODEL *
26233 +#define lzo_charp char __LZO_MMODEL *
26234 +#define lzo_voidp void __LZO_MMODEL *
26235 +#define lzo_shortp short __LZO_MMODEL *
26236 +#define lzo_ushortp unsigned short __LZO_MMODEL *
26237 +#define lzo_uint32p lzo_uint32 __LZO_MMODEL *
26238 +#define lzo_int32p lzo_int32 __LZO_MMODEL *
26239 +#define lzo_uintp lzo_uint __LZO_MMODEL *
26240 +#define lzo_intp lzo_int __LZO_MMODEL *
26241 +#define lzo_voidpp lzo_voidp __LZO_MMODEL *
26242 +#define lzo_bytepp lzo_bytep __LZO_MMODEL *
26243 +
26244 +#ifndef lzo_sizeof_dict_t
26245 +# define lzo_sizeof_dict_t sizeof(lzo_bytep)
26246 +#endif
26247 +
26248 +typedef int (*lzo_compress_t) (const lzo_byte * src, lzo_uint src_len,
26249 + lzo_byte * dst, lzo_uintp dst_len,
26250 + lzo_voidp wrkmem);
26251 +
26252 +
26253 +/***********************************************************************
26254 +// error codes and prototypes
26255 +************************************************************************/
26256 +
26257 +/* Error codes for the compression/decompression functions. Negative
26258 + * values are errors, positive values will be used for special but
26259 + * normal events.
26260 + */
26261 +#define LZO_E_OK 0
26262 +#define LZO_E_ERROR (-1)
26263 +#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */
26264 +#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */
26265 +#define LZO_E_INPUT_OVERRUN (-4)
26266 +#define LZO_E_OUTPUT_OVERRUN (-5)
26267 +#define LZO_E_LOOKBEHIND_OVERRUN (-6)
26268 +#define LZO_E_EOF_NOT_FOUND (-7)
26269 +#define LZO_E_INPUT_NOT_CONSUMED (-8)
26270 +
26271 +/* lzo_init() should be the first function you call.
26272 + * Check the return code !
26273 + *
26274 + * lzo_init() is a macro to allow checking that the library and the
26275 + * compiler's view of various types are consistent.
26276 + */
26277 +#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
26278 + (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
26279 + (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
26280 + (int)sizeof(lzo_compress_t))
26281 + extern int __lzo_init2(unsigned, int, int, int, int, int, int,
26282 + int, int, int);
26283 +
26284 +/* checksum functions */
26285 +extern lzo_uint32 lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf,
26286 + lzo_uint _len);
26287 +/* misc. */
26288 + typedef union {
26289 + lzo_bytep p;
26290 + lzo_uint u;
26291 + } __lzo_pu_u;
26292 + typedef union {
26293 + lzo_bytep p;
26294 + lzo_uint32 u32;
26295 + } __lzo_pu32_u;
26296 + typedef union {
26297 + void *vp;
26298 + lzo_bytep bp;
26299 + lzo_uint32 u32;
26300 + long l;
26301 + } lzo_align_t;
26302 +
26303 +#define LZO_PTR_ALIGN_UP(_ptr,_size) \
26304 + ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
26305 +
26306 +/* deprecated - only for backward compatibility */
26307 +#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
26308 +
26309 +#endif /* already included */
26310 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.20/fs/reiser4/plugin/compress/Makefile
26311 --- linux-2.6.20.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 03:00:00.000000000 +0300
26312 +++ linux-2.6.20/fs/reiser4/plugin/compress/Makefile 2007-05-06 14:50:43.750991972 +0400
26313 @@ -0,0 +1,6 @@
26314 +obj-$(CONFIG_REISER4_FS) += compress_plugins.o
26315 +
26316 +compress_plugins-objs := \
26317 + compress.o \
26318 + minilzo.o \
26319 + compress_mode.o
26320 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.c linux-2.6.20/fs/reiser4/plugin/compress/minilzo.c
26321 --- linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.c 1970-01-01 03:00:00.000000000 +0300
26322 +++ linux-2.6.20/fs/reiser4/plugin/compress/minilzo.c 2007-05-06 14:50:43.754993222 +0400
26323 @@ -0,0 +1,1967 @@
26324 +/* minilzo.c -- mini subset of the LZO real-time data compression library
26325 + adopted for reiser4 compression transform plugin.
26326 +
26327 + This file is part of the LZO real-time data compression library
26328 + and not included in any proprietary licenses of reiser4.
26329 +
26330 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26331 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26332 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26333 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26334 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26335 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26336 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26337 + All Rights Reserved.
26338 +
26339 + The LZO library is free software; you can redistribute it and/or
26340 + modify it under the terms of the GNU General Public License as
26341 + published by the Free Software Foundation; either version 2 of
26342 + the License, or (at your option) any later version.
26343 +
26344 + The LZO library is distributed in the hope that it will be useful,
26345 + but WITHOUT ANY WARRANTY; without even the implied warranty of
26346 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26347 + GNU General Public License for more details.
26348 +
26349 + You should have received a copy of the GNU General Public License
26350 + along with the LZO library; see the file COPYING.
26351 + If not, write to the Free Software Foundation, Inc.,
26352 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26353 +
26354 + Markus F.X.J. Oberhumer
26355 + <markus@oberhumer.com>
26356 + http://www.oberhumer.com/opensource/lzo/
26357 + */
26358 +
26359 +/*
26360 + * NOTE:
26361 + * the full LZO package can be found at
26362 + * http://www.oberhumer.com/opensource/lzo/
26363 + */
26364 +
26365 +#include "../../debug.h" /* for reiser4 assert macro -edward */
26366 +
26367 +#define __LZO_IN_MINILZO
26368 +#define LZO_BUILD
26369 +
26370 +#include "minilzo.h"
26371 +
26372 +#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
26373 +# error "version mismatch in miniLZO source files"
26374 +#endif
26375 +
26376 +#ifndef __LZO_CONF_H
26377 +#define __LZO_CONF_H
26378 +
26379 +# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt
26380 +# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr)
26381 +
26382 +# define HAVE_MEMCMP
26383 +# define HAVE_MEMCPY
26384 +# define HAVE_MEMMOVE
26385 +# define HAVE_MEMSET
26386 +
26387 +#undef NDEBUG
26388 +#if !defined(LZO_DEBUG)
26389 +# define NDEBUG
26390 +#endif
26391 +#if defined(LZO_DEBUG) || !defined(NDEBUG)
26392 +# if !defined(NO_STDIO_H)
26393 +# include <stdio.h>
26394 +# endif
26395 +#endif
26396 +
26397 +#if !defined(LZO_COMPILE_TIME_ASSERT)
26398 +# define LZO_COMPILE_TIME_ASSERT(expr) \
26399 + { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
26400 +#endif
26401 +
26402 +#if !defined(LZO_UNUSED)
26403 +# if 1
26404 +# define LZO_UNUSED(var) ((void)&var)
26405 +# elif 0
26406 +# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
26407 +# else
26408 +# define LZO_UNUSED(parm) (parm = parm)
26409 +# endif
26410 +#endif
26411 +
26412 +#if defined(NO_MEMCMP)
26413 +# undef HAVE_MEMCMP
26414 +#endif
26415 +
26416 +#if !defined(HAVE_MEMSET)
26417 +# undef memset
26418 +# define memset lzo_memset
26419 +#endif
26420 +
26421 +# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff))
26422 +
26423 +#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b))
26424 +#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b))
26425 +#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
26426 +#define LZO_MIN3(a,b,c) ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
26427 +
26428 +#define lzo_sizeof(type) ((lzo_uint) (sizeof(type)))
26429 +
26430 +#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array))))
26431 +
26432 +#define LZO_SIZE(bits) (1u << (bits))
26433 +#define LZO_MASK(bits) (LZO_SIZE(bits) - 1)
26434 +
26435 +#define LZO_LSIZE(bits) (1ul << (bits))
26436 +#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1)
26437 +
26438 +#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits))
26439 +#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1)
26440 +
26441 +#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2)))
26442 +#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1)))
26443 +
26444 +#if !defined(SIZEOF_UNSIGNED)
26445 +# if (UINT_MAX == 0xffff)
26446 +# define SIZEOF_UNSIGNED 2
26447 +# elif (UINT_MAX == LZO_0xffffffffL)
26448 +# define SIZEOF_UNSIGNED 4
26449 +# elif (UINT_MAX >= LZO_0xffffffffL)
26450 +# define SIZEOF_UNSIGNED 8
26451 +# else
26452 +# error "SIZEOF_UNSIGNED"
26453 +# endif
26454 +#endif
26455 +
26456 +#if !defined(SIZEOF_UNSIGNED_LONG)
26457 +# if (ULONG_MAX == LZO_0xffffffffL)
26458 +# define SIZEOF_UNSIGNED_LONG 4
26459 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26460 +# define SIZEOF_UNSIGNED_LONG 8
26461 +# else
26462 +# error "SIZEOF_UNSIGNED_LONG"
26463 +# endif
26464 +#endif
26465 +
26466 +#if !defined(SIZEOF_SIZE_T)
26467 +# define SIZEOF_SIZE_T SIZEOF_UNSIGNED
26468 +#endif
26469 +#if !defined(SIZE_T_MAX)
26470 +# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T)
26471 +#endif
26472 +
26473 +#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
26474 +# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
26475 +# define LZO_UNALIGNED_OK_2
26476 +# endif
26477 +# if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
26478 +# define LZO_UNALIGNED_OK_4
26479 +# endif
26480 +#endif
26481 +
26482 +#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
26483 +# if !defined(LZO_UNALIGNED_OK)
26484 +# define LZO_UNALIGNED_OK
26485 +# endif
26486 +#endif
26487 +
26488 +#if defined(__LZO_NO_UNALIGNED)
26489 +# undef LZO_UNALIGNED_OK
26490 +# undef LZO_UNALIGNED_OK_2
26491 +# undef LZO_UNALIGNED_OK_4
26492 +#endif
26493 +
26494 +#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
26495 +# error "LZO_UNALIGNED_OK_2 must not be defined on this system"
26496 +#endif
26497 +#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26498 +# error "LZO_UNALIGNED_OK_4 must not be defined on this system"
26499 +#endif
26500 +
26501 +#if defined(__LZO_NO_ALIGNED)
26502 +# undef LZO_ALIGNED_OK_4
26503 +#endif
26504 +
26505 +#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26506 +# error "LZO_ALIGNED_OK_4 must not be defined on this system"
26507 +#endif
26508 +
26509 +#define LZO_LITTLE_ENDIAN 1234
26510 +#define LZO_BIG_ENDIAN 4321
26511 +#define LZO_PDP_ENDIAN 3412
26512 +
26513 +#if !defined(LZO_BYTE_ORDER)
26514 +# if defined(MFX_BYTE_ORDER)
26515 +# define LZO_BYTE_ORDER MFX_BYTE_ORDER
26516 +# elif defined(__LZO_i386)
26517 +# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN
26518 +# elif defined(BYTE_ORDER)
26519 +# define LZO_BYTE_ORDER BYTE_ORDER
26520 +# elif defined(__BYTE_ORDER)
26521 +# define LZO_BYTE_ORDER __BYTE_ORDER
26522 +# endif
26523 +#endif
26524 +
26525 +#if defined(LZO_BYTE_ORDER)
26526 +# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
26527 + (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
26528 +# error "invalid LZO_BYTE_ORDER"
26529 +# endif
26530 +#endif
26531 +
26532 +#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
26533 +# error "LZO_BYTE_ORDER is not defined"
26534 +#endif
26535 +
26536 +#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY
26537 +
26538 +#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
26539 +# if defined(__GNUC__) && defined(__i386__)
26540 +# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
26541 +# define LZO_OPTIMIZE_GNUC_i386
26542 +# endif
26543 +# endif
26544 +#endif
26545 +
26546 +extern const lzo_uint32 _lzo_crc32_table[256];
26547 +
26548 +#define _LZO_STRINGIZE(x) #x
26549 +#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x)
26550 +
26551 +#define _LZO_CONCAT2(a,b) a ## b
26552 +#define _LZO_CONCAT3(a,b,c) a ## b ## c
26553 +#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d
26554 +#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e
26555 +
26556 +#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b)
26557 +#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c)
26558 +#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d)
26559 +#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e)
26560 +
26561 +#ifndef __LZO_PTR_H
26562 +#define __LZO_PTR_H
26563 +
26564 +#if !defined(lzo_ptrdiff_t)
26565 +# if (UINT_MAX >= LZO_0xffffffffL)
26566 +typedef ptrdiff_t lzo_ptrdiff_t;
26567 +# else
26568 +typedef long lzo_ptrdiff_t;
26569 +# endif
26570 +#endif
26571 +
26572 +#if !defined(__LZO_HAVE_PTR_T)
26573 +# if defined(lzo_ptr_t)
26574 +# define __LZO_HAVE_PTR_T
26575 +# endif
26576 +#endif
26577 +#if !defined(__LZO_HAVE_PTR_T)
26578 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
26579 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
26580 +typedef unsigned long lzo_ptr_t;
26581 +typedef long lzo_sptr_t;
26582 +# define __LZO_HAVE_PTR_T
26583 +# endif
26584 +# endif
26585 +#endif
26586 +#if !defined(__LZO_HAVE_PTR_T)
26587 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
26588 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
26589 +typedef unsigned int lzo_ptr_t;
26590 +typedef int lzo_sptr_t;
26591 +# define __LZO_HAVE_PTR_T
26592 +# endif
26593 +# endif
26594 +#endif
26595 +#if !defined(__LZO_HAVE_PTR_T)
26596 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
26597 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
26598 +typedef unsigned short lzo_ptr_t;
26599 +typedef short lzo_sptr_t;
26600 +# define __LZO_HAVE_PTR_T
26601 +# endif
26602 +# endif
26603 +#endif
26604 +#if !defined(__LZO_HAVE_PTR_T)
26605 +# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
26606 +# error "no suitable type for lzo_ptr_t"
26607 +# else
26608 +typedef unsigned long lzo_ptr_t;
26609 +typedef long lzo_sptr_t;
26610 +# define __LZO_HAVE_PTR_T
26611 +# endif
26612 +#endif
26613 +
26614 +#define PTR(a) ((lzo_ptr_t) (a))
26615 +#define PTR_LINEAR(a) PTR(a)
26616 +#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0)
26617 +#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0)
26618 +#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
26619 +#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
26620 +
26621 +#define PTR_LT(a,b) (PTR(a) < PTR(b))
26622 +#define PTR_GE(a,b) (PTR(a) >= PTR(b))
26623 +#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
26624 +#define pd(a,b) ((lzo_uint) ((a)-(b)))
26625 +
26626 +typedef union {
26627 + char a_char;
26628 + unsigned char a_uchar;
26629 + short a_short;
26630 + unsigned short a_ushort;
26631 + int a_int;
26632 + unsigned int a_uint;
26633 + long a_long;
26634 + unsigned long a_ulong;
26635 + lzo_int a_lzo_int;
26636 + lzo_uint a_lzo_uint;
26637 + lzo_int32 a_lzo_int32;
26638 + lzo_uint32 a_lzo_uint32;
26639 + ptrdiff_t a_ptrdiff_t;
26640 + lzo_ptrdiff_t a_lzo_ptrdiff_t;
26641 + lzo_ptr_t a_lzo_ptr_t;
26642 + lzo_voidp a_lzo_voidp;
26643 + void *a_void_p;
26644 + lzo_bytep a_lzo_bytep;
26645 + lzo_bytepp a_lzo_bytepp;
26646 + lzo_uintp a_lzo_uintp;
26647 + lzo_uint *a_lzo_uint_p;
26648 + lzo_uint32p a_lzo_uint32p;
26649 + lzo_uint32 *a_lzo_uint32_p;
26650 + unsigned char *a_uchar_p;
26651 + char *a_char_p;
26652 +} lzo_full_align_t;
26653 +
26654 +#endif
26655 +#define LZO_DETERMINISTIC
26656 +#define LZO_DICT_USE_PTR
26657 +# define lzo_dict_t const lzo_bytep
26658 +# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
26659 +#if !defined(lzo_moff_t)
26660 +#define lzo_moff_t lzo_uint
26661 +#endif
26662 +#endif
26663 +static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr)
26664 +{
26665 + return PTR_LINEAR(ptr);
26666 +}
26667 +
26668 +static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
26669 +{
26670 + lzo_ptr_t p, s, n;
26671 +
26672 + assert("lzo-01", size > 0);
26673 +
26674 + p = __lzo_ptr_linear(ptr);
26675 + s = (lzo_ptr_t) (size - 1);
26676 + n = (((p + s) / size) * size) - p;
26677 +
26678 + assert("lzo-02", (long)n >= 0);
26679 + assert("lzo-03", n <= s);
26680 +
26681 + return (unsigned)n;
26682 +}
26683 +
26684 +#ifndef __LZO_UTIL_H
26685 +#define __LZO_UTIL_H
26686 +
26687 +#ifndef __LZO_CONF_H
26688 +#endif
26689 +
26690 +#if 1 && defined(HAVE_MEMCPY)
26691 +#define MEMCPY8_DS(dest,src,len) \
26692 + memcpy(dest,src,len); \
26693 + dest += len; \
26694 + src += len
26695 +#endif
26696 +
26697 +#if !defined(MEMCPY8_DS)
26698 +
26699 +#define MEMCPY8_DS(dest,src,len) \
26700 + { register lzo_uint __l = (len) / 8; \
26701 + do { \
26702 + *dest++ = *src++; \
26703 + *dest++ = *src++; \
26704 + *dest++ = *src++; \
26705 + *dest++ = *src++; \
26706 + *dest++ = *src++; \
26707 + *dest++ = *src++; \
26708 + *dest++ = *src++; \
26709 + *dest++ = *src++; \
26710 + } while (--__l > 0); }
26711 +
26712 +#endif
26713 +
26714 +#define MEMCPY_DS(dest,src,len) \
26715 + do *dest++ = *src++; \
26716 + while (--len > 0)
26717 +
26718 +#define MEMMOVE_DS(dest,src,len) \
26719 + do *dest++ = *src++; \
26720 + while (--len > 0)
26721 +
26722 +#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
26723 +
26724 +#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n))
26725 +
26726 +#else
26727 +
26728 +#define BZERO8_PTR(s,l,n) \
26729 + lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
26730 +
26731 +#endif
26732 +#endif
26733 +
26734 +/* If you use the LZO library in a product, you *must* keep this
26735 + * copyright string in the executable of your product.
26736 + */
26737 +
26738 +static const lzo_byte __lzo_copyright[] =
26739 +#if !defined(__LZO_IN_MINLZO)
26740 + LZO_VERSION_STRING;
26741 +#else
26742 + "\n\n\n"
26743 + "LZO real-time data compression library.\n"
26744 + "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
26745 + "<markus.oberhumer@jk.uni-linz.ac.at>\n"
26746 + "http://www.oberhumer.com/opensource/lzo/\n"
26747 + "\n"
26748 + "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
26749 + "LZO build date: " __DATE__ " " __TIME__ "\n\n"
26750 + "LZO special compilation options:\n"
26751 +#ifdef __cplusplus
26752 + " __cplusplus\n"
26753 +#endif
26754 +#if defined(__PIC__)
26755 + " __PIC__\n"
26756 +#elif defined(__pic__)
26757 + " __pic__\n"
26758 +#endif
26759 +#if (UINT_MAX < LZO_0xffffffffL)
26760 + " 16BIT\n"
26761 +#endif
26762 +#if defined(__LZO_STRICT_16BIT)
26763 + " __LZO_STRICT_16BIT\n"
26764 +#endif
26765 +#if (UINT_MAX > LZO_0xffffffffL)
26766 + " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
26767 +#endif
26768 +#if (ULONG_MAX > LZO_0xffffffffL)
26769 + " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
26770 +#endif
26771 +#if defined(LZO_BYTE_ORDER)
26772 + " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
26773 +#endif
26774 +#if defined(LZO_UNALIGNED_OK_2)
26775 + " LZO_UNALIGNED_OK_2\n"
26776 +#endif
26777 +#if defined(LZO_UNALIGNED_OK_4)
26778 + " LZO_UNALIGNED_OK_4\n"
26779 +#endif
26780 +#if defined(LZO_ALIGNED_OK_4)
26781 + " LZO_ALIGNED_OK_4\n"
26782 +#endif
26783 +#if defined(LZO_DICT_USE_PTR)
26784 + " LZO_DICT_USE_PTR\n"
26785 +#endif
26786 +#if defined(__LZO_QUERY_COMPRESS)
26787 + " __LZO_QUERY_COMPRESS\n"
26788 +#endif
26789 +#if defined(__LZO_QUERY_DECOMPRESS)
26790 + " __LZO_QUERY_DECOMPRESS\n"
26791 +#endif
26792 +#if defined(__LZO_IN_MINILZO)
26793 + " __LZO_IN_MINILZO\n"
26794 +#endif
26795 + "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
26796 +#if defined(__GNUC__) && defined(__VERSION__)
26797 + " by gcc " __VERSION__
26798 +#elif defined(__BORLANDC__)
26799 + " by Borland C " _LZO_MEXPAND(__BORLANDC__)
26800 +#elif defined(_MSC_VER)
26801 + " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
26802 +#elif defined(__PUREC__)
26803 + " by Pure C " _LZO_MEXPAND(__PUREC__)
26804 +#elif defined(__SC__)
26805 + " by Symantec C " _LZO_MEXPAND(__SC__)
26806 +#elif defined(__TURBOC__)
26807 + " by Turbo C " _LZO_MEXPAND(__TURBOC__)
26808 +#elif defined(__WATCOMC__)
26809 + " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
26810 +#endif
26811 + " $\n"
26812 + "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
26813 +#endif
26814 +
26815 +#define LZO_BASE 65521u
26816 +#define LZO_NMAX 5552
26817 +
26818 +#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;}
26819 +#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1);
26820 +#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2);
26821 +#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4);
26822 +#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8);
26823 +
26824 +# define IS_SIGNED(type) (((type) (-1)) < ((type) 0))
26825 +# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0))
26826 +
26827 +#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
26828 +
26829 +static lzo_bool schedule_insns_bug(void);
26830 +static lzo_bool strength_reduce_bug(int *);
26831 +
26832 +# define __lzo_assert(x) ((x) ? 1 : 0)
26833 +
26834 +#undef COMPILE_TIME_ASSERT
26835 +
26836 +# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr)
26837 +
26838 +static lzo_bool basic_integral_check(void)
26839 +{
26840 + lzo_bool r = 1;
26841 +
26842 + COMPILE_TIME_ASSERT(CHAR_BIT == 8);
26843 + COMPILE_TIME_ASSERT(sizeof(char) == 1);
26844 + COMPILE_TIME_ASSERT(sizeof(short) >= 2);
26845 + COMPILE_TIME_ASSERT(sizeof(long) >= 4);
26846 + COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
26847 + COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
26848 +
26849 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
26850 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
26851 +
26852 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
26853 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
26854 +#if defined(__LZO_STRICT_16BIT)
26855 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
26856 +#else
26857 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
26858 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
26859 +#endif
26860 +
26861 +#if (USHRT_MAX == 65535u)
26862 + COMPILE_TIME_ASSERT(sizeof(short) == 2);
26863 +#elif (USHRT_MAX == LZO_0xffffffffL)
26864 + COMPILE_TIME_ASSERT(sizeof(short) == 4);
26865 +#elif (USHRT_MAX >= LZO_0xffffffffL)
26866 + COMPILE_TIME_ASSERT(sizeof(short) > 4);
26867 +#endif
26868 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
26869 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
26870 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
26871 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
26872 + COMPILE_TIME_ASSERT(IS_SIGNED(short));
26873 + COMPILE_TIME_ASSERT(IS_SIGNED(int));
26874 + COMPILE_TIME_ASSERT(IS_SIGNED(long));
26875 +
26876 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
26877 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
26878 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
26879 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
26880 +
26881 + COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int)));
26882 + COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned)));
26883 + COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long)));
26884 + COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long)));
26885 + COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short)));
26886 + COMPILE_TIME_ASSERT(LZO_UINT32_MAX ==
26887 + LZO_UTYPE_MAX(sizeof(lzo_uint32)));
26888 + COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint)));
26889 +
26890 + r &= __lzo_assert(LZO_BYTE(257) == 1);
26891 +
26892 + return r;
26893 +}
26894 +
26895 +static lzo_bool basic_ptr_check(void)
26896 +{
26897 + lzo_bool r = 1;
26898 +
26899 + COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
26900 + COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
26901 +
26902 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
26903 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
26904 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
26905 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
26906 +
26907 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
26908 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
26909 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
26910 +
26911 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
26912 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
26913 +
26914 + COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
26915 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
26916 +
26917 +#if defined(SIZEOF_CHAR_P)
26918 + COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
26919 +#endif
26920 +#if defined(SIZEOF_PTRDIFF_T)
26921 + COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
26922 +#endif
26923 +
26924 + COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
26925 + COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
26926 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
26927 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
26928 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
26929 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
26930 +
26931 + return r;
26932 +}
26933 +
26934 +static lzo_bool ptr_check(void)
26935 +{
26936 + lzo_bool r = 1;
26937 + int i;
26938 + char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)];
26939 + lzo_bytep wrkmem;
26940 + lzo_bytepp dict;
26941 + unsigned char x[4 * sizeof(lzo_full_align_t)];
26942 + long d;
26943 + lzo_full_align_t a;
26944 + lzo_full_align_t u;
26945 +
26946 + for (i = 0; i < (int)sizeof(x); i++)
26947 + x[i] = LZO_BYTE(i);
26948 +
26949 + wrkmem =
26950 + LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t));
26951 +
26952 + u.a_lzo_bytep = wrkmem;
26953 + dict = u.a_lzo_bytepp;
26954 +
26955 + d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem);
26956 + r &= __lzo_assert(d >= 0);
26957 + r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t));
26958 +
26959 + memset(&a, 0, sizeof(a));
26960 + r &= __lzo_assert(a.a_lzo_voidp == NULL);
26961 +
26962 + memset(&a, 0xff, sizeof(a));
26963 + r &= __lzo_assert(a.a_ushort == USHRT_MAX);
26964 + r &= __lzo_assert(a.a_uint == UINT_MAX);
26965 + r &= __lzo_assert(a.a_ulong == ULONG_MAX);
26966 + r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
26967 + r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
26968 +
26969 + if (r == 1) {
26970 + for (i = 0; i < 8; i++)
26971 + r &= __lzo_assert((const lzo_voidp)(&dict[i]) ==
26972 + (const
26973 + lzo_voidp)(&wrkmem[i *
26974 + sizeof(lzo_byte
26975 + *)]));
26976 + }
26977 +
26978 + memset(&a, 0, sizeof(a));
26979 + r &= __lzo_assert(a.a_char_p == NULL);
26980 + r &= __lzo_assert(a.a_lzo_bytep == NULL);
26981 + r &= __lzo_assert(NULL == (void *)0);
26982 + if (r == 1) {
26983 + for (i = 0; i < 10; i++)
26984 + dict[i] = wrkmem;
26985 + BZERO8_PTR(dict + 1, sizeof(dict[0]), 8);
26986 + r &= __lzo_assert(dict[0] == wrkmem);
26987 + for (i = 1; i < 9; i++)
26988 + r &= __lzo_assert(dict[i] == NULL);
26989 + r &= __lzo_assert(dict[9] == wrkmem);
26990 + }
26991 +
26992 + if (r == 1) {
26993 + unsigned k = 1;
26994 + const unsigned n = (unsigned)sizeof(lzo_uint32);
26995 + lzo_byte *p0;
26996 + lzo_byte *p1;
26997 +
26998 + k += __lzo_align_gap(&x[k], n);
26999 + p0 = (lzo_bytep) & x[k];
27000 +#if defined(PTR_LINEAR)
27001 + r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0);
27002 +#else
27003 + r &= __lzo_assert(n == 4);
27004 + r &= __lzo_assert(PTR_ALIGNED_4(p0));
27005 +#endif
27006 +
27007 + r &= __lzo_assert(k >= 1);
27008 + p1 = (lzo_bytep) & x[1];
27009 + r &= __lzo_assert(PTR_GE(p0, p1));
27010 +
27011 + r &= __lzo_assert(k < 1 + n);
27012 + p1 = (lzo_bytep) & x[1 + n];
27013 + r &= __lzo_assert(PTR_LT(p0, p1));
27014 +
27015 + if (r == 1) {
27016 + lzo_uint32 v0, v1;
27017 +
27018 + u.a_uchar_p = &x[k];
27019 + v0 = *u.a_lzo_uint32_p;
27020 + u.a_uchar_p = &x[k + n];
27021 + v1 = *u.a_lzo_uint32_p;
27022 +
27023 + r &= __lzo_assert(v0 > 0);
27024 + r &= __lzo_assert(v1 > 0);
27025 + }
27026 + }
27027 +
27028 + return r;
27029 +}
27030 +
27031 +static int _lzo_config_check(void)
27032 +{
27033 + lzo_bool r = 1;
27034 + int i;
27035 + union {
27036 + lzo_uint32 a;
27037 + unsigned short b;
27038 + lzo_uint32 aa[4];
27039 + unsigned char x[4 * sizeof(lzo_full_align_t)];
27040 + } u;
27041 +
27042 + COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255);
27043 + COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8))
27044 + < 0);
27045 +
27046 + r &= basic_integral_check();
27047 + r &= basic_ptr_check();
27048 + if (r != 1)
27049 + return LZO_E_ERROR;
27050 +
27051 + u.a = 0;
27052 + u.b = 0;
27053 + for (i = 0; i < (int)sizeof(u.x); i++)
27054 + u.x[i] = LZO_BYTE(i);
27055 +
27056 +#if defined(LZO_BYTE_ORDER)
27057 + if (r == 1) {
27058 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27059 + lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
27060 + unsigned short b = (unsigned short)(u.b & 0xffff);
27061 + r &= __lzo_assert(a == 0x03020100L);
27062 + r &= __lzo_assert(b == 0x0100);
27063 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27064 + lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
27065 + unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
27066 + r &= __lzo_assert(a == 0x00010203L);
27067 + r &= __lzo_assert(b == 0x0001);
27068 +# else
27069 +# error "invalid LZO_BYTE_ORDER"
27070 +# endif
27071 + }
27072 +#endif
27073 +
27074 +#if defined(LZO_UNALIGNED_OK_2)
27075 + COMPILE_TIME_ASSERT(sizeof(short) == 2);
27076 + if (r == 1) {
27077 + unsigned short b[4];
27078 +
27079 + for (i = 0; i < 4; i++)
27080 + b[i] = *(const unsigned short *)&u.x[i];
27081 +
27082 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27083 + r &= __lzo_assert(b[0] == 0x0100);
27084 + r &= __lzo_assert(b[1] == 0x0201);
27085 + r &= __lzo_assert(b[2] == 0x0302);
27086 + r &= __lzo_assert(b[3] == 0x0403);
27087 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27088 + r &= __lzo_assert(b[0] == 0x0001);
27089 + r &= __lzo_assert(b[1] == 0x0102);
27090 + r &= __lzo_assert(b[2] == 0x0203);
27091 + r &= __lzo_assert(b[3] == 0x0304);
27092 +# endif
27093 + }
27094 +#endif
27095 +
27096 +#if defined(LZO_UNALIGNED_OK_4)
27097 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27098 + if (r == 1) {
27099 + lzo_uint32 a[4];
27100 +
27101 + for (i = 0; i < 4; i++)
27102 + a[i] = *(const lzo_uint32 *)&u.x[i];
27103 +
27104 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27105 + r &= __lzo_assert(a[0] == 0x03020100L);
27106 + r &= __lzo_assert(a[1] == 0x04030201L);
27107 + r &= __lzo_assert(a[2] == 0x05040302L);
27108 + r &= __lzo_assert(a[3] == 0x06050403L);
27109 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27110 + r &= __lzo_assert(a[0] == 0x00010203L);
27111 + r &= __lzo_assert(a[1] == 0x01020304L);
27112 + r &= __lzo_assert(a[2] == 0x02030405L);
27113 + r &= __lzo_assert(a[3] == 0x03040506L);
27114 +# endif
27115 + }
27116 +#endif
27117 +
27118 +#if defined(LZO_ALIGNED_OK_4)
27119 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27120 +#endif
27121 +
27122 + COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
27123 +
27124 + if (r == 1) {
27125 + r &= __lzo_assert(!schedule_insns_bug());
27126 + }
27127 +
27128 + if (r == 1) {
27129 + static int x[3];
27130 + static unsigned xn = 3;
27131 + register unsigned j;
27132 +
27133 + for (j = 0; j < xn; j++)
27134 + x[j] = (int)j - 3;
27135 + r &= __lzo_assert(!strength_reduce_bug(x));
27136 + }
27137 +
27138 + if (r == 1) {
27139 + r &= ptr_check();
27140 + }
27141 +
27142 + return r == 1 ? LZO_E_OK : LZO_E_ERROR;
27143 +}
27144 +
27145 +static lzo_bool schedule_insns_bug(void)
27146 +{
27147 +#if defined(__LZO_CHECKER)
27148 + return 0;
27149 +#else
27150 + const int clone[] = { 1, 2, 0 };
27151 + const int *q;
27152 + q = clone;
27153 + return (*q) ? 0 : 1;
27154 +#endif
27155 +}
27156 +
27157 +static lzo_bool strength_reduce_bug(int *x)
27158 +{
27159 + return x[0] != -3 || x[1] != -2 || x[2] != -1;
27160 +}
27161 +
27162 +#undef COMPILE_TIME_ASSERT
27163 +
27164 +int __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
27165 + int s6, int s7, int s8, int s9)
27166 +{
27167 + int r;
27168 +
27169 + if (v == 0)
27170 + return LZO_E_ERROR;
27171 +
27172 + r = (s1 == -1 || s1 == (int)sizeof(short)) &&
27173 + (s2 == -1 || s2 == (int)sizeof(int)) &&
27174 + (s3 == -1 || s3 == (int)sizeof(long)) &&
27175 + (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) &&
27176 + (s5 == -1 || s5 == (int)sizeof(lzo_uint)) &&
27177 + (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) &&
27178 + (s7 == -1 || s7 == (int)sizeof(char *)) &&
27179 + (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) &&
27180 + (s9 == -1 || s9 == (int)sizeof(lzo_compress_t));
27181 + if (!r)
27182 + return LZO_E_ERROR;
27183 +
27184 + r = _lzo_config_check();
27185 + if (r != LZO_E_OK)
27186 + return r;
27187 +
27188 + return r;
27189 +}
27190 +
27191 +#define do_compress _lzo1x_1_do_compress
27192 +
27193 +#define LZO_NEED_DICT_H
27194 +#define D_BITS 14
27195 +#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5)
27196 +#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f)
27197 +
27198 +#ifndef __LZO_CONFIG1X_H
27199 +#define __LZO_CONFIG1X_H
27200 +
27201 +#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
27202 +# define LZO1X
27203 +#endif
27204 +
27205 +#define LZO_EOF_CODE
27206 +#undef LZO_DETERMINISTIC
27207 +
27208 +#define M1_MAX_OFFSET 0x0400
27209 +#ifndef M2_MAX_OFFSET
27210 +#define M2_MAX_OFFSET 0x0800
27211 +#endif
27212 +#define M3_MAX_OFFSET 0x4000
27213 +#define M4_MAX_OFFSET 0xbfff
27214 +
27215 +#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET)
27216 +
27217 +#define M1_MIN_LEN 2
27218 +#define M1_MAX_LEN 2
27219 +#define M2_MIN_LEN 3
27220 +#ifndef M2_MAX_LEN
27221 +#define M2_MAX_LEN 8
27222 +#endif
27223 +#define M3_MIN_LEN 3
27224 +#define M3_MAX_LEN 33
27225 +#define M4_MIN_LEN 3
27226 +#define M4_MAX_LEN 9
27227 +
27228 +#define M1_MARKER 0
27229 +#define M2_MARKER 64
27230 +#define M3_MARKER 32
27231 +#define M4_MARKER 16
27232 +
27233 +#ifndef MIN_LOOKAHEAD
27234 +#define MIN_LOOKAHEAD (M2_MAX_LEN + 1)
27235 +#endif
27236 +
27237 +#if defined(LZO_NEED_DICT_H)
27238 +
27239 +#ifndef LZO_HASH
27240 +#define LZO_HASH LZO_HASH_LZO_INCREMENTAL_B
27241 +#endif
27242 +#define DL_MIN_LEN M2_MIN_LEN
27243 +
27244 +#ifndef __LZO_DICT_H
27245 +#define __LZO_DICT_H
27246 +
27247 +#if !defined(D_BITS) && defined(DBITS)
27248 +# define D_BITS DBITS
27249 +#endif
27250 +#if !defined(D_BITS)
27251 +# error "D_BITS is not defined"
27252 +#endif
27253 +#if (D_BITS < 16)
27254 +# define D_SIZE LZO_SIZE(D_BITS)
27255 +# define D_MASK LZO_MASK(D_BITS)
27256 +#else
27257 +# define D_SIZE LZO_USIZE(D_BITS)
27258 +# define D_MASK LZO_UMASK(D_BITS)
27259 +#endif
27260 +#define D_HIGH ((D_MASK >> 1) + 1)
27261 +
27262 +#if !defined(DD_BITS)
27263 +# define DD_BITS 0
27264 +#endif
27265 +#define DD_SIZE LZO_SIZE(DD_BITS)
27266 +#define DD_MASK LZO_MASK(DD_BITS)
27267 +
27268 +#if !defined(DL_BITS)
27269 +# define DL_BITS (D_BITS - DD_BITS)
27270 +#endif
27271 +#if (DL_BITS < 16)
27272 +# define DL_SIZE LZO_SIZE(DL_BITS)
27273 +# define DL_MASK LZO_MASK(DL_BITS)
27274 +#else
27275 +# define DL_SIZE LZO_USIZE(DL_BITS)
27276 +# define DL_MASK LZO_UMASK(DL_BITS)
27277 +#endif
27278 +
27279 +#if (D_BITS != DL_BITS + DD_BITS)
27280 +# error "D_BITS does not match"
27281 +#endif
27282 +#if (D_BITS < 8 || D_BITS > 18)
27283 +# error "invalid D_BITS"
27284 +#endif
27285 +#if (DL_BITS < 8 || DL_BITS > 20)
27286 +# error "invalid DL_BITS"
27287 +#endif
27288 +#if (DD_BITS < 0 || DD_BITS > 6)
27289 +# error "invalid DD_BITS"
27290 +#endif
27291 +
27292 +#if !defined(DL_MIN_LEN)
27293 +# define DL_MIN_LEN 3
27294 +#endif
27295 +#if !defined(DL_SHIFT)
27296 +# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN)
27297 +#endif
27298 +
27299 +#define LZO_HASH_GZIP 1
27300 +#define LZO_HASH_GZIP_INCREMENTAL 2
27301 +#define LZO_HASH_LZO_INCREMENTAL_A 3
27302 +#define LZO_HASH_LZO_INCREMENTAL_B 4
27303 +
27304 +#if !defined(LZO_HASH)
27305 +# error "choose a hashing strategy"
27306 +#endif
27307 +
27308 +#if (DL_MIN_LEN == 3)
27309 +# define _DV2_A(p,shift1,shift2) \
27310 + (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
27311 +# define _DV2_B(p,shift1,shift2) \
27312 + (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
27313 +# define _DV3_B(p,shift1,shift2,shift3) \
27314 + ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
27315 +#elif (DL_MIN_LEN == 2)
27316 +# define _DV2_A(p,shift1,shift2) \
27317 + (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
27318 +# define _DV2_B(p,shift1,shift2) \
27319 + (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
27320 +#else
27321 +# error "invalid DL_MIN_LEN"
27322 +#endif
27323 +#define _DV_A(p,shift) _DV2_A(p,shift,shift)
27324 +#define _DV_B(p,shift) _DV2_B(p,shift,shift)
27325 +#define DA2(p,s1,s2) \
27326 + (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
27327 +#define DS2(p,s1,s2) \
27328 + (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
27329 +#define DX2(p,s1,s2) \
27330 + (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
27331 +#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0])
27332 +#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
27333 +#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
27334 +#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s)))
27335 +#define DM(v) DMS(v,0)
27336 +
27337 +#if (LZO_HASH == LZO_HASH_GZIP)
27338 +# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT))
27339 +
27340 +#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
27341 +# define __LZO_HASH_INCREMENTAL
27342 +# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT)
27343 +# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2])
27344 +# define _DINDEX(dv,p) (dv)
27345 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27346 +
27347 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
27348 +# define __LZO_HASH_INCREMENTAL
27349 +# define DVAL_FIRST(dv,p) dv = _DV_A((p),5)
27350 +# define DVAL_NEXT(dv,p) \
27351 + dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
27352 +# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27353 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27354 +
27355 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B)
27356 +# define __LZO_HASH_INCREMENTAL
27357 +# define DVAL_FIRST(dv,p) dv = _DV_B((p),5)
27358 +# define DVAL_NEXT(dv,p) \
27359 + dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
27360 +# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27361 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27362 +
27363 +#else
27364 +# error "choose a hashing strategy"
27365 +#endif
27366 +
27367 +#ifndef DINDEX
27368 +#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS)
27369 +#endif
27370 +#if !defined(DINDEX1) && defined(D_INDEX1)
27371 +#define DINDEX1 D_INDEX1
27372 +#endif
27373 +#if !defined(DINDEX2) && defined(D_INDEX2)
27374 +#define DINDEX2 D_INDEX2
27375 +#endif
27376 +
27377 +#if !defined(__LZO_HASH_INCREMENTAL)
27378 +# define DVAL_FIRST(dv,p) ((void) 0)
27379 +# define DVAL_NEXT(dv,p) ((void) 0)
27380 +# define DVAL_LOOKAHEAD 0
27381 +#endif
27382 +
27383 +#if !defined(DVAL_ASSERT)
27384 +#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG)
27385 +static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p)
27386 +{
27387 + lzo_uint32 df;
27388 + DVAL_FIRST(df, (p));
27389 + assert(DINDEX(dv, p) == DINDEX(df, p));
27390 +}
27391 +#else
27392 +# define DVAL_ASSERT(dv,p) ((void) 0)
27393 +#endif
27394 +#endif
27395 +
27396 +# define DENTRY(p,in) (p)
27397 +# define GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex]
27398 +
27399 +#if (DD_BITS == 0)
27400 +
27401 +# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in)
27402 +# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in)
27403 +# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in)
27404 +
27405 +#else
27406 +
27407 +# define UPDATE_D(dict,drun,dv,p,in) \
27408 + dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27409 +# define UPDATE_I(dict,drun,index,p,in) \
27410 + dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27411 +# define UPDATE_P(ptr,drun,p,in) \
27412 + (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK
27413 +
27414 +#endif
27415 +
27416 +#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
27417 + (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset)
27418 +
27419 +#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
27420 + (BOUNDS_CHECKING_OFF_IN_EXPR( \
27421 + (PTR_LT(m_pos,in) || \
27422 + (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
27423 + m_off > max_offset) ))
27424 +
27425 +#if defined(LZO_DETERMINISTIC)
27426 +# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET
27427 +#else
27428 +# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET
27429 +#endif
27430 +#endif
27431 +#endif
27432 +#endif
27433 +#define DO_COMPRESS lzo1x_1_compress
27434 +static
27435 +lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len,
27436 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27437 +{
27438 + register const lzo_byte *ip;
27439 + lzo_byte *op;
27440 + const lzo_byte *const in_end = in + in_len;
27441 + const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5;
27442 + const lzo_byte *ii;
27443 + lzo_dict_p const dict = (lzo_dict_p) wrkmem;
27444 +
27445 + op = out;
27446 + ip = in;
27447 + ii = ip;
27448 +
27449 + ip += 4;
27450 + for (;;) {
27451 + register const lzo_byte *m_pos;
27452 +
27453 + lzo_moff_t m_off;
27454 + lzo_uint m_len;
27455 + lzo_uint dindex;
27456 +
27457 + DINDEX1(dindex, ip);
27458 + GINDEX(m_pos, m_off, dict, dindex, in);
27459 + if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27460 + goto literal;
27461 +#if 1
27462 + if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27463 + goto try_match;
27464 + DINDEX2(dindex, ip);
27465 +#endif
27466 + GINDEX(m_pos, m_off, dict, dindex, in);
27467 + if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27468 + goto literal;
27469 + if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27470 + goto try_match;
27471 + goto literal;
27472 +
27473 + try_match:
27474 +#if 1 && defined(LZO_UNALIGNED_OK_2)
27475 + if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) {
27476 +#else
27477 + if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) {
27478 +#endif
27479 + ;
27480 + } else {
27481 + if (m_pos[2] == ip[2]) {
27482 + goto match;
27483 + } else {
27484 + ;
27485 + }
27486 + }
27487 +
27488 + literal:
27489 + UPDATE_I(dict, 0, dindex, ip, in);
27490 + ++ip;
27491 + if (ip >= ip_end)
27492 + break;
27493 + continue;
27494 +
27495 + match:
27496 + UPDATE_I(dict, 0, dindex, ip, in);
27497 + if (pd(ip, ii) > 0) {
27498 + register lzo_uint t = pd(ip, ii);
27499 +
27500 + if (t <= 3) {
27501 + assert("lzo-04", op - 2 > out);
27502 + op[-2] |= LZO_BYTE(t);
27503 + } else if (t <= 18)
27504 + *op++ = LZO_BYTE(t - 3);
27505 + else {
27506 + register lzo_uint tt = t - 18;
27507 +
27508 + *op++ = 0;
27509 + while (tt > 255) {
27510 + tt -= 255;
27511 + *op++ = 0;
27512 + }
27513 + assert("lzo-05", tt > 0);
27514 + *op++ = LZO_BYTE(tt);
27515 + }
27516 + do
27517 + *op++ = *ii++;
27518 + while (--t > 0);
27519 + }
27520 +
27521 + assert("lzo-06", ii == ip);
27522 + ip += 3;
27523 + if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++
27524 + || m_pos[6] != *ip++ || m_pos[7] != *ip++
27525 + || m_pos[8] != *ip++
27526 +#ifdef LZO1Y
27527 + || m_pos[9] != *ip++ || m_pos[10] != *ip++
27528 + || m_pos[11] != *ip++ || m_pos[12] != *ip++
27529 + || m_pos[13] != *ip++ || m_pos[14] != *ip++
27530 +#endif
27531 + ) {
27532 + --ip;
27533 + m_len = ip - ii;
27534 + assert("lzo-07", m_len >= 3);
27535 + assert("lzo-08", m_len <= M2_MAX_LEN);
27536 +
27537 + if (m_off <= M2_MAX_OFFSET) {
27538 + m_off -= 1;
27539 +#if defined(LZO1X)
27540 + *op++ =
27541 + LZO_BYTE(((m_len -
27542 + 1) << 5) | ((m_off & 7) << 2));
27543 + *op++ = LZO_BYTE(m_off >> 3);
27544 +#elif defined(LZO1Y)
27545 + *op++ =
27546 + LZO_BYTE(((m_len +
27547 + 1) << 4) | ((m_off & 3) << 2));
27548 + *op++ = LZO_BYTE(m_off >> 2);
27549 +#endif
27550 + } else if (m_off <= M3_MAX_OFFSET) {
27551 + m_off -= 1;
27552 + *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
27553 + goto m3_m4_offset;
27554 + } else
27555 +#if defined(LZO1X)
27556 + {
27557 + m_off -= 0x4000;
27558 + assert("lzo-09", m_off > 0);
27559 + assert("lzo-10", m_off <= 0x7fff);
27560 + *op++ = LZO_BYTE(M4_MARKER |
27561 + ((m_off & 0x4000) >> 11) |
27562 + (m_len - 2));
27563 + goto m3_m4_offset;
27564 + }
27565 +#elif defined(LZO1Y)
27566 + goto m4_match;
27567 +#endif
27568 + } else {
27569 + {
27570 + const lzo_byte *end = in_end;
27571 + const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
27572 + while (ip < end && *m == *ip)
27573 + m++, ip++;
27574 + m_len = (ip - ii);
27575 + }
27576 + assert("lzo-11", m_len > M2_MAX_LEN);
27577 +
27578 + if (m_off <= M3_MAX_OFFSET) {
27579 + m_off -= 1;
27580 + if (m_len <= 33)
27581 + *op++ =
27582 + LZO_BYTE(M3_MARKER | (m_len - 2));
27583 + else {
27584 + m_len -= 33;
27585 + *op++ = M3_MARKER | 0;
27586 + goto m3_m4_len;
27587 + }
27588 + } else {
27589 +#if defined(LZO1Y)
27590 + m4_match:
27591 +#endif
27592 + m_off -= 0x4000;
27593 + assert("lzo-12", m_off > 0);
27594 + assert("lzo-13", m_off <= 0x7fff);
27595 + if (m_len <= M4_MAX_LEN)
27596 + *op++ = LZO_BYTE(M4_MARKER |
27597 + ((m_off & 0x4000) >>
27598 + 11) | (m_len - 2));
27599 + else {
27600 + m_len -= M4_MAX_LEN;
27601 + *op++ =
27602 + LZO_BYTE(M4_MARKER |
27603 + ((m_off & 0x4000) >> 11));
27604 + m3_m4_len:
27605 + while (m_len > 255) {
27606 + m_len -= 255;
27607 + *op++ = 0;
27608 + }
27609 + assert("lzo-14", m_len > 0);
27610 + *op++ = LZO_BYTE(m_len);
27611 + }
27612 + }
27613 +
27614 + m3_m4_offset:
27615 + *op++ = LZO_BYTE((m_off & 63) << 2);
27616 + *op++ = LZO_BYTE(m_off >> 6);
27617 + }
27618 +
27619 + ii = ip;
27620 + if (ip >= ip_end)
27621 + break;
27622 + }
27623 +
27624 + *out_len = op - out;
27625 + return pd(in_end, ii);
27626 +}
27627 +
27628 +int DO_COMPRESS(const lzo_byte * in, lzo_uint in_len,
27629 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27630 +{
27631 + lzo_byte *op = out;
27632 + lzo_uint t;
27633 +
27634 +#if defined(__LZO_QUERY_COMPRESS)
27635 + if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
27636 + return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem,
27637 + D_SIZE, lzo_sizeof(lzo_dict_t));
27638 +#endif
27639 +
27640 + if (in_len <= M2_MAX_LEN + 5)
27641 + t = in_len;
27642 + else {
27643 + t = do_compress(in, in_len, op, out_len, wrkmem);
27644 + op += *out_len;
27645 + }
27646 +
27647 + if (t > 0) {
27648 + const lzo_byte *ii = in + in_len - t;
27649 +
27650 + if (op == out && t <= 238)
27651 + *op++ = LZO_BYTE(17 + t);
27652 + else if (t <= 3)
27653 + op[-2] |= LZO_BYTE(t);
27654 + else if (t <= 18)
27655 + *op++ = LZO_BYTE(t - 3);
27656 + else {
27657 + lzo_uint tt = t - 18;
27658 +
27659 + *op++ = 0;
27660 + while (tt > 255) {
27661 + tt -= 255;
27662 + *op++ = 0;
27663 + }
27664 + assert("lzo-15", tt > 0);
27665 + *op++ = LZO_BYTE(tt);
27666 + }
27667 + do
27668 + *op++ = *ii++;
27669 + while (--t > 0);
27670 + }
27671 +
27672 + *op++ = M4_MARKER | 1;
27673 + *op++ = 0;
27674 + *op++ = 0;
27675 +
27676 + *out_len = op - out;
27677 + return LZO_E_OK;
27678 +}
27679 +
27680 +#undef do_compress
27681 +#undef DO_COMPRESS
27682 +#undef LZO_HASH
27683 +
27684 +#undef LZO_TEST_DECOMPRESS_OVERRUN
27685 +#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
27686 +#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
27687 +#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
27688 +#undef DO_DECOMPRESS
27689 +#define DO_DECOMPRESS lzo1x_decompress
27690 +
27691 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
27692 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
27693 +# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
27694 +# endif
27695 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
27696 +# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
27697 +# endif
27698 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
27699 +# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
27700 +# endif
27701 +#endif
27702 +
27703 +#undef TEST_IP
27704 +#undef TEST_OP
27705 +#undef TEST_LOOKBEHIND
27706 +#undef NEED_IP
27707 +#undef NEED_OP
27708 +#undef HAVE_TEST_IP
27709 +#undef HAVE_TEST_OP
27710 +#undef HAVE_NEED_IP
27711 +#undef HAVE_NEED_OP
27712 +#undef HAVE_ANY_IP
27713 +#undef HAVE_ANY_OP
27714 +
27715 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
27716 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
27717 +# define TEST_IP (ip < ip_end)
27718 +# endif
27719 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
27720 +# define NEED_IP(x) \
27721 + if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
27722 +# endif
27723 +#endif
27724 +
27725 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
27726 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
27727 +# define TEST_OP (op <= op_end)
27728 +# endif
27729 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
27730 +# undef TEST_OP
27731 +# define NEED_OP(x) \
27732 + if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
27733 +# endif
27734 +#endif
27735 +
27736 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
27737 +# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
27738 +#else
27739 +# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
27740 +#endif
27741 +
27742 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
27743 +# define TEST_IP (ip < ip_end)
27744 +#endif
27745 +
27746 +#if defined(TEST_IP)
27747 +# define HAVE_TEST_IP
27748 +#else
27749 +# define TEST_IP 1
27750 +#endif
27751 +#if defined(TEST_OP)
27752 +# define HAVE_TEST_OP
27753 +#else
27754 +# define TEST_OP 1
27755 +#endif
27756 +
27757 +#if defined(NEED_IP)
27758 +# define HAVE_NEED_IP
27759 +#else
27760 +# define NEED_IP(x) ((void) 0)
27761 +#endif
27762 +#if defined(NEED_OP)
27763 +# define HAVE_NEED_OP
27764 +#else
27765 +# define NEED_OP(x) ((void) 0)
27766 +#endif
27767 +
27768 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
27769 +# define HAVE_ANY_IP
27770 +#endif
27771 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
27772 +# define HAVE_ANY_OP
27773 +#endif
27774 +
27775 +#undef __COPY4
27776 +#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
27777 +
27778 +#undef COPY4
27779 +#if defined(LZO_UNALIGNED_OK_4)
27780 +# define COPY4(dst,src) __COPY4(dst,src)
27781 +#elif defined(LZO_ALIGNED_OK_4)
27782 +# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
27783 +#endif
27784 +
27785 +#if defined(DO_DECOMPRESS)
27786 +int DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len,
27787 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27788 +#endif
27789 +{
27790 + register lzo_byte *op;
27791 + register const lzo_byte *ip;
27792 + register lzo_uint t;
27793 +#if defined(COPY_DICT)
27794 + lzo_uint m_off;
27795 + const lzo_byte *dict_end;
27796 +#else
27797 + register const lzo_byte *m_pos;
27798 +#endif
27799 +
27800 + const lzo_byte *const ip_end = in + in_len;
27801 +#if defined(HAVE_ANY_OP)
27802 + lzo_byte *const op_end = out + *out_len;
27803 +#endif
27804 +#if defined(LZO1Z)
27805 + lzo_uint last_m_off = 0;
27806 +#endif
27807 +
27808 + LZO_UNUSED(wrkmem);
27809 +
27810 +#if defined(__LZO_QUERY_DECOMPRESS)
27811 + if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
27812 + return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem,
27813 + 0, 0);
27814 +#endif
27815 +
27816 +#if defined(COPY_DICT)
27817 + if (dict) {
27818 + if (dict_len > M4_MAX_OFFSET) {
27819 + dict += dict_len - M4_MAX_OFFSET;
27820 + dict_len = M4_MAX_OFFSET;
27821 + }
27822 + dict_end = dict + dict_len;
27823 + } else {
27824 + dict_len = 0;
27825 + dict_end = NULL;
27826 + }
27827 +#endif
27828 +
27829 + *out_len = 0;
27830 +
27831 + op = out;
27832 + ip = in;
27833 +
27834 + if (*ip > 17) {
27835 + t = *ip++ - 17;
27836 + if (t < 4)
27837 + goto match_next;
27838 + assert("lzo-16", t > 0);
27839 + NEED_OP(t);
27840 + NEED_IP(t + 1);
27841 + do
27842 + *op++ = *ip++;
27843 + while (--t > 0);
27844 + goto first_literal_run;
27845 + }
27846 +
27847 + while (TEST_IP && TEST_OP) {
27848 + t = *ip++;
27849 + if (t >= 16)
27850 + goto match;
27851 + if (t == 0) {
27852 + NEED_IP(1);
27853 + while (*ip == 0) {
27854 + t += 255;
27855 + ip++;
27856 + NEED_IP(1);
27857 + }
27858 + t += 15 + *ip++;
27859 + }
27860 + assert("lzo-17", t > 0);
27861 + NEED_OP(t + 3);
27862 + NEED_IP(t + 4);
27863 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
27864 +#if !defined(LZO_UNALIGNED_OK_4)
27865 + if (PTR_ALIGNED2_4(op, ip)) {
27866 +#endif
27867 + COPY4(op, ip);
27868 + op += 4;
27869 + ip += 4;
27870 + if (--t > 0) {
27871 + if (t >= 4) {
27872 + do {
27873 + COPY4(op, ip);
27874 + op += 4;
27875 + ip += 4;
27876 + t -= 4;
27877 + } while (t >= 4);
27878 + if (t > 0)
27879 + do
27880 + *op++ = *ip++;
27881 + while (--t > 0);
27882 + } else
27883 + do
27884 + *op++ = *ip++;
27885 + while (--t > 0);
27886 + }
27887 +#if !defined(LZO_UNALIGNED_OK_4)
27888 + } else
27889 +#endif
27890 +#endif
27891 +#if !defined(LZO_UNALIGNED_OK_4)
27892 + {
27893 + *op++ = *ip++;
27894 + *op++ = *ip++;
27895 + *op++ = *ip++;
27896 + do
27897 + *op++ = *ip++;
27898 + while (--t > 0);
27899 + }
27900 +#endif
27901 +
27902 + first_literal_run:
27903 +
27904 + t = *ip++;
27905 + if (t >= 16)
27906 + goto match;
27907 +#if defined(COPY_DICT)
27908 +#if defined(LZO1Z)
27909 + m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
27910 + last_m_off = m_off;
27911 +#else
27912 + m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
27913 +#endif
27914 + NEED_OP(3);
27915 + t = 3;
27916 + COPY_DICT(t, m_off)
27917 +#else
27918 +#if defined(LZO1Z)
27919 + t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
27920 + m_pos = op - t;
27921 + last_m_off = t;
27922 +#else
27923 + m_pos = op - (1 + M2_MAX_OFFSET);
27924 + m_pos -= t >> 2;
27925 + m_pos -= *ip++ << 2;
27926 +#endif
27927 + TEST_LOOKBEHIND(m_pos, out);
27928 + NEED_OP(3);
27929 + *op++ = *m_pos++;
27930 + *op++ = *m_pos++;
27931 + *op++ = *m_pos;
27932 +#endif
27933 + goto match_done;
27934 +
27935 + while (TEST_IP && TEST_OP) {
27936 + match:
27937 + if (t >= 64) {
27938 +#if defined(COPY_DICT)
27939 +#if defined(LZO1X)
27940 + m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
27941 + t = (t >> 5) - 1;
27942 +#elif defined(LZO1Y)
27943 + m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
27944 + t = (t >> 4) - 3;
27945 +#elif defined(LZO1Z)
27946 + m_off = t & 0x1f;
27947 + if (m_off >= 0x1c)
27948 + m_off = last_m_off;
27949 + else {
27950 + m_off = 1 + (m_off << 6) + (*ip++ >> 2);
27951 + last_m_off = m_off;
27952 + }
27953 + t = (t >> 5) - 1;
27954 +#endif
27955 +#else
27956 +#if defined(LZO1X)
27957 + m_pos = op - 1;
27958 + m_pos -= (t >> 2) & 7;
27959 + m_pos -= *ip++ << 3;
27960 + t = (t >> 5) - 1;
27961 +#elif defined(LZO1Y)
27962 + m_pos = op - 1;
27963 + m_pos -= (t >> 2) & 3;
27964 + m_pos -= *ip++ << 2;
27965 + t = (t >> 4) - 3;
27966 +#elif defined(LZO1Z)
27967 + {
27968 + lzo_uint off = t & 0x1f;
27969 + m_pos = op;
27970 + if (off >= 0x1c) {
27971 + assert(last_m_off > 0);
27972 + m_pos -= last_m_off;
27973 + } else {
27974 + off =
27975 + 1 + (off << 6) +
27976 + (*ip++ >> 2);
27977 + m_pos -= off;
27978 + last_m_off = off;
27979 + }
27980 + }
27981 + t = (t >> 5) - 1;
27982 +#endif
27983 + TEST_LOOKBEHIND(m_pos, out);
27984 + assert("lzo-18", t > 0);
27985 + NEED_OP(t + 3 - 1);
27986 + goto copy_match;
27987 +#endif
27988 + } else if (t >= 32) {
27989 + t &= 31;
27990 + if (t == 0) {
27991 + NEED_IP(1);
27992 + while (*ip == 0) {
27993 + t += 255;
27994 + ip++;
27995 + NEED_IP(1);
27996 + }
27997 + t += 31 + *ip++;
27998 + }
27999 +#if defined(COPY_DICT)
28000 +#if defined(LZO1Z)
28001 + m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
28002 + last_m_off = m_off;
28003 +#else
28004 + m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
28005 +#endif
28006 +#else
28007 +#if defined(LZO1Z)
28008 + {
28009 + lzo_uint off =
28010 + 1 + (ip[0] << 6) + (ip[1] >> 2);
28011 + m_pos = op - off;
28012 + last_m_off = off;
28013 + }
28014 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28015 + m_pos = op - 1;
28016 + m_pos -= (*(const lzo_ushortp)ip) >> 2;
28017 +#else
28018 + m_pos = op - 1;
28019 + m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28020 +#endif
28021 +#endif
28022 + ip += 2;
28023 + } else if (t >= 16) {
28024 +#if defined(COPY_DICT)
28025 + m_off = (t & 8) << 11;
28026 +#else
28027 + m_pos = op;
28028 + m_pos -= (t & 8) << 11;
28029 +#endif
28030 + t &= 7;
28031 + if (t == 0) {
28032 + NEED_IP(1);
28033 + while (*ip == 0) {
28034 + t += 255;
28035 + ip++;
28036 + NEED_IP(1);
28037 + }
28038 + t += 7 + *ip++;
28039 + }
28040 +#if defined(COPY_DICT)
28041 +#if defined(LZO1Z)
28042 + m_off += (ip[0] << 6) + (ip[1] >> 2);
28043 +#else
28044 + m_off += (ip[0] >> 2) + (ip[1] << 6);
28045 +#endif
28046 + ip += 2;
28047 + if (m_off == 0)
28048 + goto eof_found;
28049 + m_off += 0x4000;
28050 +#if defined(LZO1Z)
28051 + last_m_off = m_off;
28052 +#endif
28053 +#else
28054 +#if defined(LZO1Z)
28055 + m_pos -= (ip[0] << 6) + (ip[1] >> 2);
28056 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28057 + m_pos -= (*(const lzo_ushortp)ip) >> 2;
28058 +#else
28059 + m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28060 +#endif
28061 + ip += 2;
28062 + if (m_pos == op)
28063 + goto eof_found;
28064 + m_pos -= 0x4000;
28065 +#if defined(LZO1Z)
28066 + last_m_off = op - m_pos;
28067 +#endif
28068 +#endif
28069 + } else {
28070 +#if defined(COPY_DICT)
28071 +#if defined(LZO1Z)
28072 + m_off = 1 + (t << 6) + (*ip++ >> 2);
28073 + last_m_off = m_off;
28074 +#else
28075 + m_off = 1 + (t >> 2) + (*ip++ << 2);
28076 +#endif
28077 + NEED_OP(2);
28078 + t = 2;
28079 + COPY_DICT(t, m_off)
28080 +#else
28081 +#if defined(LZO1Z)
28082 + t = 1 + (t << 6) + (*ip++ >> 2);
28083 + m_pos = op - t;
28084 + last_m_off = t;
28085 +#else
28086 + m_pos = op - 1;
28087 + m_pos -= t >> 2;
28088 + m_pos -= *ip++ << 2;
28089 +#endif
28090 + TEST_LOOKBEHIND(m_pos, out);
28091 + NEED_OP(2);
28092 + *op++ = *m_pos++;
28093 + *op++ = *m_pos;
28094 +#endif
28095 + goto match_done;
28096 + }
28097 +
28098 +#if defined(COPY_DICT)
28099 +
28100 + NEED_OP(t + 3 - 1);
28101 + t += 3 - 1;
28102 + COPY_DICT(t, m_off)
28103 +#else
28104 +
28105 + TEST_LOOKBEHIND(m_pos, out);
28106 + assert("lzo-19", t > 0);
28107 + NEED_OP(t + 3 - 1);
28108 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28109 +#if !defined(LZO_UNALIGNED_OK_4)
28110 + if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) {
28111 + assert((op - m_pos) >= 4);
28112 +#else
28113 + if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
28114 +#endif
28115 + COPY4(op, m_pos);
28116 + op += 4;
28117 + m_pos += 4;
28118 + t -= 4 - (3 - 1);
28119 + do {
28120 + COPY4(op, m_pos);
28121 + op += 4;
28122 + m_pos += 4;
28123 + t -= 4;
28124 + } while (t >= 4);
28125 + if (t > 0)
28126 + do
28127 + *op++ = *m_pos++;
28128 + while (--t > 0);
28129 + } else
28130 +#endif
28131 + {
28132 + copy_match:
28133 + *op++ = *m_pos++;
28134 + *op++ = *m_pos++;
28135 + do
28136 + *op++ = *m_pos++;
28137 + while (--t > 0);
28138 + }
28139 +
28140 +#endif
28141 +
28142 + match_done:
28143 +#if defined(LZO1Z)
28144 + t = ip[-1] & 3;
28145 +#else
28146 + t = ip[-2] & 3;
28147 +#endif
28148 + if (t == 0)
28149 + break;
28150 +
28151 + match_next:
28152 + assert("lzo-20", t > 0);
28153 + NEED_OP(t);
28154 + NEED_IP(t + 1);
28155 + do
28156 + *op++ = *ip++;
28157 + while (--t > 0);
28158 + t = *ip++;
28159 + }
28160 + }
28161 +
28162 +#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
28163 + *out_len = op - out;
28164 + return LZO_E_EOF_NOT_FOUND;
28165 +#endif
28166 +
28167 + eof_found:
28168 + assert("lzo-21", t == 1);
28169 + *out_len = op - out;
28170 + return (ip == ip_end ? LZO_E_OK :
28171 + (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
28172 +
28173 +#if defined(HAVE_NEED_IP)
28174 + input_overrun:
28175 + *out_len = op - out;
28176 + return LZO_E_INPUT_OVERRUN;
28177 +#endif
28178 +
28179 +#if defined(HAVE_NEED_OP)
28180 + output_overrun:
28181 + *out_len = op - out;
28182 + return LZO_E_OUTPUT_OVERRUN;
28183 +#endif
28184 +
28185 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28186 + lookbehind_overrun:
28187 + *out_len = op - out;
28188 + return LZO_E_LOOKBEHIND_OVERRUN;
28189 +#endif
28190 +}
28191 +
28192 +#define LZO_TEST_DECOMPRESS_OVERRUN
28193 +#undef DO_DECOMPRESS
28194 +#define DO_DECOMPRESS lzo1x_decompress_safe
28195 +
28196 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28197 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28198 +# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28199 +# endif
28200 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28201 +# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28202 +# endif
28203 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28204 +# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28205 +# endif
28206 +#endif
28207 +
28208 +#undef TEST_IP
28209 +#undef TEST_OP
28210 +#undef TEST_LOOKBEHIND
28211 +#undef NEED_IP
28212 +#undef NEED_OP
28213 +#undef HAVE_TEST_IP
28214 +#undef HAVE_TEST_OP
28215 +#undef HAVE_NEED_IP
28216 +#undef HAVE_NEED_OP
28217 +#undef HAVE_ANY_IP
28218 +#undef HAVE_ANY_OP
28219 +
28220 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28221 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28222 +# define TEST_IP (ip < ip_end)
28223 +# endif
28224 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28225 +# define NEED_IP(x) \
28226 + if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28227 +# endif
28228 +#endif
28229 +
28230 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28231 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28232 +# define TEST_OP (op <= op_end)
28233 +# endif
28234 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28235 +# undef TEST_OP
28236 +# define NEED_OP(x) \
28237 + if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28238 +# endif
28239 +#endif
28240 +
28241 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28242 +# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28243 +#else
28244 +# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28245 +#endif
28246 +
28247 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28248 +# define TEST_IP (ip < ip_end)
28249 +#endif
28250 +
28251 +#if defined(TEST_IP)
28252 +# define HAVE_TEST_IP
28253 +#else
28254 +# define TEST_IP 1
28255 +#endif
28256 +#if defined(TEST_OP)
28257 +# define HAVE_TEST_OP
28258 +#else
28259 +# define TEST_OP 1
28260 +#endif
28261 +
28262 +#if defined(NEED_IP)
28263 +# define HAVE_NEED_IP
28264 +#else
28265 +# define NEED_IP(x) ((void) 0)
28266 +#endif
28267 +#if defined(NEED_OP)
28268 +# define HAVE_NEED_OP
28269 +#else
28270 +# define NEED_OP(x) ((void) 0)
28271 +#endif
28272 +
28273 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28274 +# define HAVE_ANY_IP
28275 +#endif
28276 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28277 +# define HAVE_ANY_OP
28278 +#endif
28279 +
28280 +#undef __COPY4
28281 +#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28282 +
28283 +#undef COPY4
28284 +#if defined(LZO_UNALIGNED_OK_4)
28285 +# define COPY4(dst,src) __COPY4(dst,src)
28286 +#elif defined(LZO_ALIGNED_OK_4)
28287 +# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28288 +#endif
28289 +
28290 +/***** End of minilzo.c *****/
28291 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.h linux-2.6.20/fs/reiser4/plugin/compress/minilzo.h
28292 --- linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.h 1970-01-01 03:00:00.000000000 +0300
28293 +++ linux-2.6.20/fs/reiser4/plugin/compress/minilzo.h 2007-05-06 14:50:43.754993222 +0400
28294 @@ -0,0 +1,70 @@
28295 +/* minilzo.h -- mini subset of the LZO real-time data compression library
28296 + adopted for reiser4 compression transform plugin.
28297 +
28298 + This file is part of the LZO real-time data compression library
28299 + and not included in any proprietary licenses of reiser4.
28300 +
28301 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
28302 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
28303 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
28304 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
28305 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
28306 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
28307 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
28308 + All Rights Reserved.
28309 +
28310 + The LZO library is free software; you can redistribute it and/or
28311 + modify it under the terms of the GNU General Public License as
28312 + published by the Free Software Foundation; either version 2 of
28313 + the License, or (at your option) any later version.
28314 +
28315 + The LZO library is distributed in the hope that it will be useful,
28316 + but WITHOUT ANY WARRANTY; without even the implied warranty of
28317 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28318 + GNU General Public License for more details.
28319 +
28320 + You should have received a copy of the GNU General Public License
28321 + along with the LZO library; see the file COPYING.
28322 + If not, write to the Free Software Foundation, Inc.,
28323 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
28324 +
28325 + Markus F.X.J. Oberhumer
28326 + <markus@oberhumer.com>
28327 + http://www.oberhumer.com/opensource/lzo/
28328 + */
28329 +
28330 +/*
28331 + * NOTE:
28332 + * the full LZO package can be found at
28333 + * http://www.oberhumer.com/opensource/lzo/
28334 + */
28335 +
28336 +#ifndef __MINILZO_H
28337 +#define __MINILZO_H
28338 +
28339 +#define MINILZO_VERSION 0x1080
28340 +
28341 +#include "lzoconf.h"
28342 +
28343 +/* Memory required for the wrkmem parameter.
28344 + * When the required size is 0, you can also pass a NULL pointer.
28345 + */
28346 +
28347 +#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS
28348 +#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
28349 +#define LZO1X_MEM_DECOMPRESS (0)
28350 +
28351 +/* compression */
28352 +extern int lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len,
28353 + lzo_byte * dst, lzo_uintp dst_len,
28354 + lzo_voidp wrkmem);
28355 +/* decompression */
28356 +extern int lzo1x_decompress(const lzo_byte * src, lzo_uint src_len,
28357 + lzo_byte * dst, lzo_uintp dst_len,
28358 + lzo_voidp wrkmem /* NOT USED */);
28359 +/* safe decompression with overrun testing */
28360 +extern int lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len,
28361 + lzo_byte * dst, lzo_uintp dst_len,
28362 + lzo_voidp wrkmem /* NOT USED */ );
28363 +
28364 +#endif /* already included */
28365 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.20/fs/reiser4/plugin/crypto/cipher.c
28366 --- linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 03:00:00.000000000 +0300
28367 +++ linux-2.6.20/fs/reiser4/plugin/crypto/cipher.c 2007-05-06 14:50:43.754993222 +0400
28368 @@ -0,0 +1,37 @@
28369 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
28370 + licensing governed by reiser4/README */
28371 +/* Reiser4 cipher transform plugins */
28372 +
28373 +#include "../../debug.h"
28374 +#include "../plugin.h"
28375 +
28376 +cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
28377 + [NONE_CIPHER_ID] = {
28378 + .h = {
28379 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
28380 + .id = NONE_CIPHER_ID,
28381 + .pops = NULL,
28382 + .label = "none",
28383 + .desc = "no cipher transform",
28384 + .linkage = {NULL, NULL}
28385 + },
28386 + .alloc = NULL,
28387 + .free = NULL,
28388 + .scale = NULL,
28389 + .align_stream = NULL,
28390 + .setkey = NULL,
28391 + .encrypt = NULL,
28392 + .decrypt = NULL
28393 + }
28394 +};
28395 +
28396 +/* Make Linus happy.
28397 + Local variables:
28398 + c-indentation-style: "K&R"
28399 + mode-name: "LC"
28400 + c-basic-offset: 8
28401 + tab-width: 8
28402 + fill-column: 120
28403 + scroll-step: 1
28404 + End:
28405 +*/
28406 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.20/fs/reiser4/plugin/crypto/cipher.h
28407 --- linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 03:00:00.000000000 +0300
28408 +++ linux-2.6.20/fs/reiser4/plugin/crypto/cipher.h 2007-05-06 14:50:43.754993222 +0400
28409 @@ -0,0 +1,55 @@
28410 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28411 +/* This file contains definitions for the objects operated
28412 + by reiser4 key manager, which is something like keyring
28413 + wrapped by appropriate reiser4 plugin */
28414 +
28415 +#if !defined( __FS_REISER4_CRYPT_H__ )
28416 +#define __FS_REISER4_CRYPT_H__
28417 +
28418 +#include <linux/crypto.h>
28419 +
28420 +/* key info imported from user space */
28421 +typedef struct crypto_data {
28422 + int keysize; /* uninstantiated key size */
28423 + __u8 * key; /* uninstantiated key */
28424 + int keyid_size; /* size of passphrase */
28425 + __u8 * keyid; /* passphrase */
28426 +} crypto_data_t;
28427 +
28428 +/* This object contains all needed infrastructure to implement
28429 + cipher transform. This is operated (allocating, inheriting,
28430 + validating, binding to host inode, etc..) by reiser4 key manager.
28431 +
28432 + This info can be allocated in two cases:
28433 + 1. importing a key from user space.
28434 + 2. reading inode from disk */
28435 +typedef struct crypto_stat {
28436 + struct inode * host;
28437 + struct crypto_hash * digest;
28438 + struct crypto_blkcipher * cipher;
28439 +#if 0
28440 + cipher_key_plugin * kplug; /* key manager */
28441 +#endif
28442 + __u8 * keyid; /* key fingerprint, created by digest plugin,
28443 + using uninstantiated key and passphrase.
28444 + supposed to be stored in disk stat-data */
28445 + int inst; /* this indicates if the cipher key is
28446 + instantiated (case 1 above) */
28447 + int keysize; /* uninstantiated key size (bytes), supposed
28448 + to be stored in disk stat-data */
28449 + int keyload_count; /* number of the objects which has this
28450 + crypto-stat attached */
28451 +} crypto_stat_t;
28452 +
28453 +#endif /* __FS_REISER4_CRYPT_H__ */
28454 +
28455 +/*
28456 + Local variables:
28457 + c-indentation-style: "K&R"
28458 + mode-name: "LC"
28459 + c-basic-offset: 8
28460 + tab-width: 8
28461 + fill-column: 120
28462 + scroll-step: 1
28463 + End:
28464 +*/
28465 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.20/fs/reiser4/plugin/crypto/digest.c
28466 --- linux-2.6.20.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 03:00:00.000000000 +0300
28467 +++ linux-2.6.20/fs/reiser4/plugin/crypto/digest.c 2007-05-06 14:50:43.754993222 +0400
28468 @@ -0,0 +1,58 @@
28469 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28470 +
28471 +/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
28472 +/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
28473 +#include "../../debug.h"
28474 +#include "../plugin_header.h"
28475 +#include "../plugin.h"
28476 +#include "../file/cryptcompress.h"
28477 +
28478 +#include <linux/types.h>
28479 +
28480 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
28481 +
28482 +static struct crypto_hash * alloc_sha256 (void)
28483 +{
28484 +#if REISER4_SHA256
28485 + return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
28486 +#else
28487 + warning("edward-1418", "sha256 unsupported");
28488 + return ERR_PTR(-EINVAL);
28489 +#endif
28490 +}
28491 +
28492 +static void free_sha256 (struct crypto_hash * tfm)
28493 +{
28494 +#if REISER4_SHA256
28495 + crypto_free_hash(tfm);
28496 +#endif
28497 + return;
28498 +}
28499 +
28500 +/* digest plugins */
28501 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
28502 + [SHA256_32_DIGEST_ID] = {
28503 + .h = {
28504 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
28505 + .id = SHA256_32_DIGEST_ID,
28506 + .pops = NULL,
28507 + .label = "sha256_32",
28508 + .desc = "sha256_32 digest transform",
28509 + .linkage = {NULL, NULL}
28510 + },
28511 + .fipsize = sizeof(__u32),
28512 + .alloc = alloc_sha256,
28513 + .free = free_sha256
28514 + }
28515 +};
28516 +
28517 +/*
28518 + Local variables:
28519 + c-indentation-style: "K&R"
28520 + mode-name: "LC"
28521 + c-basic-offset: 8
28522 + tab-width: 8
28523 + fill-column: 120
28524 + scroll-step: 1
28525 + End:
28526 +*/
28527 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.20/fs/reiser4/plugin/dir/dir.h
28528 --- linux-2.6.20.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 03:00:00.000000000 +0300
28529 +++ linux-2.6.20/fs/reiser4/plugin/dir/dir.h 2007-05-06 14:50:43.754993222 +0400
28530 @@ -0,0 +1,36 @@
28531 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
28532 + * reiser4/README */
28533 +
28534 +/* this file contains declarations of methods implementing directory plugins */
28535 +
28536 +#if !defined( __REISER4_DIR_H__ )
28537 +#define __REISER4_DIR_H__
28538 +
28539 +/*#include "../../key.h"
28540 +
28541 +#include <linux/fs.h>*/
28542 +
28543 +/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
28544 +
28545 +/* "hashed" directory methods of dir plugin */
28546 +void build_entry_key_hashed(const struct inode *, const struct qstr *,
28547 + reiser4_key *);
28548 +
28549 +/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
28550 +
28551 +/* "seekable" directory methods of dir plugin */
28552 +void build_entry_key_seekable(const struct inode *, const struct qstr *,
28553 + reiser4_key *);
28554 +
28555 +/* __REISER4_DIR_H__ */
28556 +#endif
28557 +
28558 +/*
28559 + Local variables:
28560 + c-indentation-style: "K&R"
28561 + mode-name: "LC"
28562 + c-basic-offset: 8
28563 + tab-width: 8
28564 + fill-column: 120
28565 + End:
28566 +*/
28567 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.20/fs/reiser4/plugin/dir/hashed_dir.c
28568 --- linux-2.6.20.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 03:00:00.000000000 +0300
28569 +++ linux-2.6.20/fs/reiser4/plugin/dir/hashed_dir.c 2007-05-06 14:50:43.754993222 +0400
28570 @@ -0,0 +1,81 @@
28571 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
28572 + * reiser4/README */
28573 +
28574 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
28575 + names to the files. */
28576 +
28577 +/*
28578 + * Hashed directory logically consists of persistent directory
28579 + * entries. Directory entry is a pair of a file name and a key of stat-data of
28580 + * a file that has this name in the given directory.
28581 + *
28582 + * Directory entries are stored in the tree in the form of directory
28583 + * items. Directory item should implement dir_entry_ops portion of item plugin
28584 + * interface (see plugin/item/item.h). Hashed directory interacts with
28585 + * directory item plugin exclusively through dir_entry_ops operations.
28586 + *
28587 + * Currently there are two implementations of directory items: "simple
28588 + * directory item" (plugin/item/sde.[ch]), and "compound directory item"
28589 + * (plugin/item/cde.[ch]) with the latter being the default.
28590 + *
28591 + * There is, however some delicate way through which directory code interferes
28592 + * with item plugin: key assignment policy. A key for a directory item is
28593 + * chosen by directory code, and as described in kassign.c, this key contains
28594 + * a portion of file name. Directory item uses this knowledge to avoid storing
28595 + * this portion of file name twice: in the key and in the directory item body.
28596 + *
28597 + */
28598 +
28599 +#include "../../inode.h"
28600 +
28601 +void complete_entry_key(const struct inode *, const char *name,
28602 + int len, reiser4_key * result);
28603 +
28604 +/* this is implementation of build_entry_key method of dir
28605 + plugin for HASHED_DIR_PLUGIN_ID
28606 + */
28607 +void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
28608 + * (or will be) in.*/
28609 + const struct qstr *qname, /* name of file referenced
28610 + * by this entry */
28611 + reiser4_key * result /* resulting key of directory
28612 + * entry */ )
28613 +{
28614 + const char *name;
28615 + int len;
28616 +
28617 + assert("nikita-1139", dir != NULL);
28618 + assert("nikita-1140", qname != NULL);
28619 + assert("nikita-1141", qname->name != NULL);
28620 + assert("nikita-1142", result != NULL);
28621 +
28622 + name = qname->name;
28623 + len = qname->len;
28624 +
28625 + assert("nikita-2867", strlen(name) == len);
28626 +
28627 + reiser4_key_init(result);
28628 + /* locality of directory entry's key is objectid of parent
28629 + directory */
28630 + set_key_locality(result, get_inode_oid(dir));
28631 + /* minor packing locality is constant */
28632 + set_key_type(result, KEY_FILE_NAME_MINOR);
28633 + /* dot is special case---we always want it to be first entry in
28634 + a directory. Actually, we just want to have smallest
28635 + directory entry.
28636 + */
28637 + if (len == 1 && name[0] == '.')
28638 + return;
28639 +
28640 + /* initialize part of entry key which depends on file name */
28641 + complete_entry_key(dir, name, len, result);
28642 +}
28643 +
28644 +/* Local variables:
28645 + c-indentation-style: "K&R"
28646 + mode-name: "LC"
28647 + c-basic-offset: 8
28648 + tab-width: 8
28649 + fill-column: 120
28650 + End:
28651 +*/
28652 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.20/fs/reiser4/plugin/dir/Makefile
28653 --- linux-2.6.20.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 03:00:00.000000000 +0300
28654 +++ linux-2.6.20/fs/reiser4/plugin/dir/Makefile 2007-05-06 14:50:43.758994472 +0400
28655 @@ -0,0 +1,5 @@
28656 +obj-$(CONFIG_REISER4_FS) += dir_plugins.o
28657 +
28658 +dir_plugins-objs := \
28659 + hashed_dir.o \
28660 + seekable_dir.o
28661 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.20/fs/reiser4/plugin/dir/seekable_dir.c
28662 --- linux-2.6.20.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 03:00:00.000000000 +0300
28663 +++ linux-2.6.20/fs/reiser4/plugin/dir/seekable_dir.c 2007-05-06 14:50:43.758994472 +0400
28664 @@ -0,0 +1,46 @@
28665 +/* Copyright 2005 by Hans Reiser, licensing governed by
28666 + * reiser4/README */
28667 +
28668 +#include "../../inode.h"
28669 +
28670 +/* this is implementation of build_entry_key method of dir
28671 + plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
28672 + This is for directories where we want repeatable and restartable readdir()
28673 + even in case 32bit user level struct dirent (readdir(3)).
28674 +*/
28675 +void
28676 +build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
28677 + reiser4_key * result)
28678 +{
28679 + oid_t objectid;
28680 +
28681 + assert("nikita-2283", dir != NULL);
28682 + assert("nikita-2284", name != NULL);
28683 + assert("nikita-2285", name->name != NULL);
28684 + assert("nikita-2286", result != NULL);
28685 +
28686 + reiser4_key_init(result);
28687 + /* locality of directory entry's key is objectid of parent
28688 + directory */
28689 + set_key_locality(result, get_inode_oid(dir));
28690 + /* minor packing locality is constant */
28691 + set_key_type(result, KEY_FILE_NAME_MINOR);
28692 + /* dot is special case---we always want it to be first entry in
28693 + a directory. Actually, we just want to have smallest
28694 + directory entry.
28695 + */
28696 + if ((name->len == 1) && (name->name[0] == '.'))
28697 + return;
28698 +
28699 + /* objectid of key is 31 lowest bits of hash. */
28700 + objectid =
28701 + inode_hash_plugin(dir)->hash(name->name,
28702 + (int)name->len) & 0x7fffffff;
28703 +
28704 + assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
28705 + set_key_objectid(result, objectid);
28706 +
28707 + /* offset is always 0. */
28708 + set_key_offset(result, (__u64) 0);
28709 + return;
28710 +}
28711 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.20/fs/reiser4/plugin/dir_plugin_common.c
28712 --- linux-2.6.20.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
28713 +++ linux-2.6.20/fs/reiser4/plugin/dir_plugin_common.c 2007-05-06 14:50:43.758994472 +0400
28714 @@ -0,0 +1,872 @@
28715 +/* Copyright 2005 by Hans Reiser, licensing governed by
28716 + reiser4/README */
28717 +
28718 +/* this file contains typical implementations for most of methods of
28719 + directory plugin
28720 +*/
28721 +
28722 +#include "../inode.h"
28723 +
28724 +int reiser4_find_entry(struct inode *dir, struct dentry *name,
28725 + lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
28726 +int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
28727 +void check_light_weight(struct inode *inode, struct inode *parent);
28728 +
28729 +/* this is common implementation of get_parent method of dir plugin
28730 + this is used by NFS kernel server to "climb" up directory tree to
28731 + check permissions
28732 + */
28733 +struct dentry *get_parent_common(struct inode *child)
28734 +{
28735 + struct super_block *s;
28736 + struct inode *parent;
28737 + struct dentry dotdot;
28738 + struct dentry *dentry;
28739 + reiser4_key key;
28740 + int result;
28741 +
28742 + /*
28743 + * lookup dotdot entry.
28744 + */
28745 +
28746 + s = child->i_sb;
28747 + memset(&dotdot, 0, sizeof(dotdot));
28748 + dotdot.d_name.name = "..";
28749 + dotdot.d_name.len = 2;
28750 + dotdot.d_op = &get_super_private(s)->ops.dentry;
28751 +
28752 + result = reiser4_lookup_name(child, &dotdot, &key);
28753 + if (result != 0)
28754 + return ERR_PTR(result);
28755 +
28756 + parent = reiser4_iget(s, &key, 1);
28757 + if (!IS_ERR(parent)) {
28758 + /*
28759 + * FIXME-NIKITA dubious: attributes are inherited from @child
28760 + * to @parent. But:
28761 + *
28762 + * (*) this is the only this we can do
28763 + *
28764 + * (*) attributes of light-weight object are inherited
28765 + * from a parent through which object was looked up first,
28766 + * so it is ambiguous anyway.
28767 + *
28768 + */
28769 + check_light_weight(parent, child);
28770 + reiser4_iget_complete(parent);
28771 + dentry = d_alloc_anon(parent);
28772 + if (dentry == NULL) {
28773 + iput(parent);
28774 + dentry = ERR_PTR(RETERR(-ENOMEM));
28775 + } else
28776 + dentry->d_op = &get_super_private(s)->ops.dentry;
28777 + } else if (PTR_ERR(parent) == -ENOENT)
28778 + dentry = ERR_PTR(RETERR(-ESTALE));
28779 + else
28780 + dentry = (void *)parent;
28781 + return dentry;
28782 +}
28783 +
28784 +/* this is common implementation of is_name_acceptable method of dir
28785 + plugin
28786 + */
28787 +int is_name_acceptable_common(const struct inode *inode, /* directory to check */
28788 + const char *name UNUSED_ARG, /* name to check */
28789 + int len /* @name's length */ )
28790 +{
28791 + assert("nikita-733", inode != NULL);
28792 + assert("nikita-734", name != NULL);
28793 + assert("nikita-735", len > 0);
28794 +
28795 + return len <= reiser4_max_filename_len(inode);
28796 +}
28797 +
28798 +/* there is no common implementation of build_entry_key method of dir
28799 + plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
28800 + plugin/dir/seekable.c:build_entry_key_seekable() for example
28801 +*/
28802 +
28803 +/* this is common implementation of build_readdir_key method of dir
28804 + plugin
28805 + see reiser4_readdir_common for more details
28806 +*/
28807 +int build_readdir_key_common(struct file *dir /* directory being read */ ,
28808 + reiser4_key * result /* where to store key */ )
28809 +{
28810 + reiser4_file_fsdata *fdata;
28811 + struct inode *inode;
28812 +
28813 + assert("nikita-1361", dir != NULL);
28814 + assert("nikita-1362", result != NULL);
28815 + assert("nikita-1363", dir->f_dentry != NULL);
28816 + inode = dir->f_dentry->d_inode;
28817 + assert("nikita-1373", inode != NULL);
28818 +
28819 + fdata = reiser4_get_file_fsdata(dir);
28820 + if (IS_ERR(fdata))
28821 + return PTR_ERR(fdata);
28822 + assert("nikita-1364", fdata != NULL);
28823 + return extract_key_from_de_id(get_inode_oid(inode),
28824 + &fdata->dir.readdir.position.
28825 + dir_entry_key, result);
28826 +
28827 +}
28828 +
28829 +void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
28830 + int adj);
28831 +
28832 +/* this is common implementation of add_entry method of dir plugin
28833 +*/
28834 +int reiser4_add_entry_common(struct inode *object, /* directory to add new name
28835 + * in */
28836 + struct dentry *where, /* new name */
28837 + reiser4_object_create_data * data, /* parameters of
28838 + * new object */
28839 + reiser4_dir_entry_desc * entry /* parameters of
28840 + * new directory
28841 + * entry */)
28842 +{
28843 + int result;
28844 + coord_t *coord;
28845 + lock_handle lh;
28846 + reiser4_dentry_fsdata *fsdata;
28847 + reiser4_block_nr reserve;
28848 +
28849 + assert("nikita-1114", object != NULL);
28850 + assert("nikita-1250", where != NULL);
28851 +
28852 + fsdata = reiser4_get_dentry_fsdata(where);
28853 + if (unlikely(IS_ERR(fsdata)))
28854 + return PTR_ERR(fsdata);
28855 +
28856 + reserve = inode_dir_plugin(object)->estimate.add_entry(object);
28857 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
28858 + return RETERR(-ENOSPC);
28859 +
28860 + init_lh(&lh);
28861 + coord = &fsdata->dec.entry_coord;
28862 + coord_clear_iplug(coord);
28863 +
28864 + /* check for this entry in a directory. This is plugin method. */
28865 + result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
28866 + entry);
28867 + if (likely(result == -ENOENT)) {
28868 + /* add new entry. Just pass control to the directory
28869 + item plugin. */
28870 + assert("nikita-1709", inode_dir_item_plugin(object));
28871 + assert("nikita-2230", coord->node == lh.node);
28872 + reiser4_seal_done(&fsdata->dec.entry_seal);
28873 + result =
28874 + inode_dir_item_plugin(object)->s.dir.add_entry(object,
28875 + coord, &lh,
28876 + where,
28877 + entry);
28878 + if (result == 0) {
28879 + reiser4_adjust_dir_file(object, where,
28880 + fsdata->dec.pos + 1, +1);
28881 + INODE_INC_FIELD(object, i_size);
28882 + }
28883 + } else if (result == 0) {
28884 + assert("nikita-2232", coord->node == lh.node);
28885 + result = RETERR(-EEXIST);
28886 + }
28887 + done_lh(&lh);
28888 +
28889 + return result;
28890 +}
28891 +
28892 +/**
28893 + * rem_entry - remove entry from directory item
28894 + * @dir:
28895 + * @dentry:
28896 + * @entry:
28897 + * @coord:
28898 + * @lh:
28899 + *
28900 + * Checks that coordinate @coord is set properly and calls item plugin
28901 + * method to cut entry.
28902 + */
28903 +static int
28904 +rem_entry(struct inode *dir, struct dentry *dentry,
28905 + reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
28906 +{
28907 + item_plugin *iplug;
28908 + struct inode *child;
28909 +
28910 + iplug = inode_dir_item_plugin(dir);
28911 + child = dentry->d_inode;
28912 + assert("nikita-3399", child != NULL);
28913 +
28914 + /* check that we are really destroying an entry for @child */
28915 + if (REISER4_DEBUG) {
28916 + int result;
28917 + reiser4_key key;
28918 +
28919 + result = iplug->s.dir.extract_key(coord, &key);
28920 + if (result != 0)
28921 + return result;
28922 + if (get_key_objectid(&key) != get_inode_oid(child)) {
28923 + warning("nikita-3397",
28924 + "rem_entry: %#llx != %#llx\n",
28925 + get_key_objectid(&key),
28926 + (unsigned long long)get_inode_oid(child));
28927 + return RETERR(-EIO);
28928 + }
28929 + }
28930 + return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
28931 +}
28932 +
28933 +/**
28934 + * reiser4_rem_entry_common - remove entry from a directory
28935 + * @dir: directory to remove entry from
28936 + * @where: name that is being removed
28937 + * @entry: description of entry being removed
28938 + *
28939 + * This is common implementation of rem_entry method of dir plugin.
28940 + */
28941 +int reiser4_rem_entry_common(struct inode *dir,
28942 + struct dentry *dentry,
28943 + reiser4_dir_entry_desc *entry)
28944 +{
28945 + int result;
28946 + coord_t *coord;
28947 + lock_handle lh;
28948 + reiser4_dentry_fsdata *fsdata;
28949 + __u64 tograb;
28950 +
28951 + assert("nikita-1124", dir != NULL);
28952 + assert("nikita-1125", dentry != NULL);
28953 +
28954 + tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
28955 + result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
28956 + if (result != 0)
28957 + return RETERR(-ENOSPC);
28958 +
28959 + init_lh(&lh);
28960 +
28961 + /* check for this entry in a directory. This is plugin method. */
28962 + result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
28963 + fsdata = reiser4_get_dentry_fsdata(dentry);
28964 + if (IS_ERR(fsdata)) {
28965 + done_lh(&lh);
28966 + return PTR_ERR(fsdata);
28967 + }
28968 +
28969 + coord = &fsdata->dec.entry_coord;
28970 +
28971 + assert("nikita-3404",
28972 + get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
28973 + dir->i_size <= 1);
28974 +
28975 + coord_clear_iplug(coord);
28976 + if (result == 0) {
28977 + /* remove entry. Just pass control to the directory item
28978 + plugin. */
28979 + assert("vs-542", inode_dir_item_plugin(dir));
28980 + reiser4_seal_done(&fsdata->dec.entry_seal);
28981 + reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
28982 + result =
28983 + WITH_COORD(coord,
28984 + rem_entry(dir, dentry, entry, coord, &lh));
28985 + if (result == 0) {
28986 + if (dir->i_size >= 1)
28987 + INODE_DEC_FIELD(dir, i_size);
28988 + else {
28989 + warning("nikita-2509", "Dir %llu is runt",
28990 + (unsigned long long)
28991 + get_inode_oid(dir));
28992 + result = RETERR(-EIO);
28993 + }
28994 +
28995 + assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
28996 + dentry->d_inode->i_size != 2 ||
28997 + inode_dir_plugin(dentry->d_inode) == NULL);
28998 + }
28999 + }
29000 + done_lh(&lh);
29001 +
29002 + return result;
29003 +}
29004 +
29005 +static reiser4_block_nr estimate_init(struct inode *parent,
29006 + struct inode *object);
29007 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
29008 +
29009 +/* this is common implementation of init method of dir plugin
29010 + create "." and ".." entries
29011 +*/
29012 +int reiser4_dir_init_common(struct inode *object, /* new directory */
29013 + struct inode *parent, /* parent directory */
29014 + reiser4_object_create_data * data /* info passed
29015 + * to us, this
29016 + * is filled by
29017 + * reiser4()
29018 + * syscall in
29019 + * particular */)
29020 +{
29021 + reiser4_block_nr reserve;
29022 +
29023 + assert("nikita-680", object != NULL);
29024 + assert("nikita-681", S_ISDIR(object->i_mode));
29025 + assert("nikita-682", parent != NULL);
29026 + assert("nikita-684", data != NULL);
29027 + assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
29028 + assert("nikita-687", object->i_mode & S_IFDIR);
29029 +
29030 + reserve = estimate_init(parent, object);
29031 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29032 + return RETERR(-ENOSPC);
29033 +
29034 + return create_dot_dotdot(object, parent);
29035 +}
29036 +
29037 +/* this is common implementation of done method of dir plugin
29038 + remove "." entry
29039 +*/
29040 +int reiser4_dir_done_common(struct inode *object /* object being deleted */ )
29041 +{
29042 + int result;
29043 + reiser4_block_nr reserve;
29044 + struct dentry goodby_dots;
29045 + reiser4_dir_entry_desc entry;
29046 +
29047 + assert("nikita-1449", object != NULL);
29048 +
29049 + if (reiser4_inode_get_flag(object, REISER4_NO_SD))
29050 + return 0;
29051 +
29052 + /* of course, this can be rewritten to sweep everything in one
29053 + reiser4_cut_tree(). */
29054 + memset(&entry, 0, sizeof entry);
29055 +
29056 + /* FIXME: this done method is called from reiser4_delete_dir_common which
29057 + * reserved space already */
29058 + reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
29059 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
29060 + return RETERR(-ENOSPC);
29061 +
29062 + memset(&goodby_dots, 0, sizeof goodby_dots);
29063 + entry.obj = goodby_dots.d_inode = object;
29064 + goodby_dots.d_name.name = ".";
29065 + goodby_dots.d_name.len = 1;
29066 + result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29067 + reiser4_free_dentry_fsdata(&goodby_dots);
29068 + if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
29069 + /* only worth a warning
29070 +
29071 + "values of \ eB\ f will give rise to dom!\n"
29072 + -- v6src/s2/mv.c:89
29073 + */
29074 + warning("nikita-2252", "Cannot remove dot of %lli: %i",
29075 + (unsigned long long)get_inode_oid(object), result);
29076 + return 0;
29077 +}
29078 +
29079 +/* this is common implementation of attach method of dir plugin
29080 +*/
29081 +int reiser4_attach_common(struct inode *child UNUSED_ARG,
29082 + struct inode *parent UNUSED_ARG)
29083 +{
29084 + assert("nikita-2647", child != NULL);
29085 + assert("nikita-2648", parent != NULL);
29086 +
29087 + return 0;
29088 +}
29089 +
29090 +/* this is common implementation of detach method of dir plugin
29091 + remove "..", decrease nlink on parent
29092 +*/
29093 +int reiser4_detach_common(struct inode *object, struct inode *parent)
29094 +{
29095 + int result;
29096 + struct dentry goodby_dots;
29097 + reiser4_dir_entry_desc entry;
29098 +
29099 + assert("nikita-2885", object != NULL);
29100 + assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
29101 +
29102 + memset(&entry, 0, sizeof entry);
29103 +
29104 + /* NOTE-NIKITA this only works if @parent is -the- parent of
29105 + @object, viz. object whose key is stored in dotdot
29106 + entry. Wouldn't work with hard-links on directories. */
29107 + memset(&goodby_dots, 0, sizeof goodby_dots);
29108 + entry.obj = goodby_dots.d_inode = parent;
29109 + goodby_dots.d_name.name = "..";
29110 + goodby_dots.d_name.len = 2;
29111 + result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29112 + reiser4_free_dentry_fsdata(&goodby_dots);
29113 + if (result == 0) {
29114 + /* the dot should be the only entry remaining at this time... */
29115 + assert("nikita-3400",
29116 + object->i_size == 1 && object->i_nlink <= 2);
29117 +#if 0
29118 + /* and, together with the only name directory can have, they
29119 + * provides for the last 2 remaining references. If we get
29120 + * here as part of error handling during mkdir, @object
29121 + * possibly has no name yet, so its nlink == 1. If we get here
29122 + * from rename (targeting empty directory), it has no name
29123 + * already, so its nlink == 1. */
29124 + assert("nikita-3401",
29125 + object->i_nlink == 2 || object->i_nlink == 1);
29126 +#endif
29127 +
29128 + /* decrement nlink of directory removed ".." pointed
29129 + to */
29130 + reiser4_del_nlink(parent, NULL, 0);
29131 + }
29132 + return result;
29133 +}
29134 +
29135 +/* this is common implementation of estimate.add_entry method of
29136 + dir plugin
29137 + estimation of adding entry which supposes that entry is inserting a
29138 + unit into item
29139 +*/
29140 +reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
29141 +{
29142 + return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
29143 +}
29144 +
29145 +/* this is common implementation of estimate.rem_entry method of dir
29146 + plugin
29147 +*/
29148 +reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
29149 +{
29150 + return estimate_one_item_removal(reiser4_tree_by_inode(inode));
29151 +}
29152 +
29153 +/* this is common implementation of estimate.unlink method of dir
29154 + plugin
29155 +*/
29156 +reiser4_block_nr
29157 +dir_estimate_unlink_common(const struct inode * parent,
29158 + const struct inode * object)
29159 +{
29160 + reiser4_block_nr res;
29161 +
29162 + /* hashed_rem_entry(object) */
29163 + res = inode_dir_plugin(object)->estimate.rem_entry(object);
29164 + /* del_nlink(parent) */
29165 + res += 2 * inode_file_plugin(parent)->estimate.update(parent);
29166 +
29167 + return res;
29168 +}
29169 +
29170 +/*
29171 + * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
29172 + * methods: if @inode is a light-weight file, setup its credentials
29173 + * that are not stored in the stat-data in this case
29174 + */
29175 +void check_light_weight(struct inode *inode, struct inode *parent)
29176 +{
29177 + if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
29178 + inode->i_uid = parent->i_uid;
29179 + inode->i_gid = parent->i_gid;
29180 + /* clear light-weight flag. If inode would be read by any
29181 + other name, [ug]id wouldn't change. */
29182 + reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
29183 + }
29184 +}
29185 +
29186 +/* looks for name specified in @dentry in directory @parent and if name is
29187 + found - key of object found entry points to is stored in @entry->key */
29188 +int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for
29189 + * name in */
29190 + struct dentry *dentry, /* name to look for */
29191 + reiser4_key * key /* place to store key */ )
29192 +{
29193 + int result;
29194 + coord_t *coord;
29195 + lock_handle lh;
29196 + const char *name;
29197 + int len;
29198 + reiser4_dir_entry_desc entry;
29199 + reiser4_dentry_fsdata *fsdata;
29200 +
29201 + assert("nikita-1247", parent != NULL);
29202 + assert("nikita-1248", dentry != NULL);
29203 + assert("nikita-1123", dentry->d_name.name != NULL);
29204 + assert("vs-1486",
29205 + dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
29206 +
29207 + name = dentry->d_name.name;
29208 + len = dentry->d_name.len;
29209 +
29210 + if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
29211 + /* some arbitrary error code to return */
29212 + return RETERR(-ENAMETOOLONG);
29213 +
29214 + fsdata = reiser4_get_dentry_fsdata(dentry);
29215 + if (IS_ERR(fsdata))
29216 + return PTR_ERR(fsdata);
29217 +
29218 + coord = &fsdata->dec.entry_coord;
29219 + coord_clear_iplug(coord);
29220 + init_lh(&lh);
29221 +
29222 + /* find entry in a directory. This is plugin method. */
29223 + result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
29224 + &entry);
29225 + if (result == 0) {
29226 + /* entry was found, extract object key from it. */
29227 + result =
29228 + WITH_COORD(coord,
29229 + item_plugin_by_coord(coord)->s.dir.
29230 + extract_key(coord, key));
29231 + }
29232 + done_lh(&lh);
29233 + return result;
29234 +
29235 +}
29236 +
29237 +/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
29238 +static reiser4_block_nr
29239 +estimate_init(struct inode *parent, struct inode *object)
29240 +{
29241 + reiser4_block_nr res = 0;
29242 +
29243 + assert("vpf-321", parent != NULL);
29244 + assert("vpf-322", object != NULL);
29245 +
29246 + /* hashed_add_entry(object) */
29247 + res += inode_dir_plugin(object)->estimate.add_entry(object);
29248 + /* reiser4_add_nlink(object) */
29249 + res += inode_file_plugin(object)->estimate.update(object);
29250 + /* hashed_add_entry(object) */
29251 + res += inode_dir_plugin(object)->estimate.add_entry(object);
29252 + /* reiser4_add_nlink(parent) */
29253 + res += inode_file_plugin(parent)->estimate.update(parent);
29254 +
29255 + return 0;
29256 +}
29257 +
29258 +/* helper function for reiser4_dir_init_common(). Create "." and ".." */
29259 +static int create_dot_dotdot(struct inode *object /* object to create dot and
29260 + * dotdot for */ ,
29261 + struct inode *parent /* parent of @object */)
29262 +{
29263 + int result;
29264 + struct dentry dots_entry;
29265 + reiser4_dir_entry_desc entry;
29266 +
29267 + assert("nikita-688", object != NULL);
29268 + assert("nikita-689", S_ISDIR(object->i_mode));
29269 + assert("nikita-691", parent != NULL);
29270 +
29271 + /* We store dot and dotdot as normal directory entries. This is
29272 + not necessary, because almost all information stored in them
29273 + is already in the stat-data of directory, the only thing
29274 + being missed is objectid of grand-parent directory that can
29275 + easily be added there as extension.
29276 +
29277 + But it is done the way it is done, because not storing dot
29278 + and dotdot will lead to the following complications:
29279 +
29280 + . special case handling in ->lookup().
29281 + . addition of another extension to the sd.
29282 + . dependency on key allocation policy for stat data.
29283 +
29284 + */
29285 +
29286 + memset(&entry, 0, sizeof entry);
29287 + memset(&dots_entry, 0, sizeof dots_entry);
29288 + entry.obj = dots_entry.d_inode = object;
29289 + dots_entry.d_name.name = ".";
29290 + dots_entry.d_name.len = 1;
29291 + result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
29292 + reiser4_free_dentry_fsdata(&dots_entry);
29293 +
29294 + if (result == 0) {
29295 + result = reiser4_add_nlink(object, object, 0);
29296 + if (result == 0) {
29297 + entry.obj = dots_entry.d_inode = parent;
29298 + dots_entry.d_name.name = "..";
29299 + dots_entry.d_name.len = 2;
29300 + result = reiser4_add_entry_common(object,
29301 + &dots_entry, NULL, &entry);
29302 + reiser4_free_dentry_fsdata(&dots_entry);
29303 + /* if creation of ".." failed, iput() will delete
29304 + object with ".". */
29305 + if (result == 0) {
29306 + result = reiser4_add_nlink(parent, object, 0);
29307 + if (result != 0)
29308 + /*
29309 + * if we failed to bump i_nlink, try
29310 + * to remove ".."
29311 + */
29312 + reiser4_detach_common(object, parent);
29313 + }
29314 + }
29315 + }
29316 +
29317 + if (result != 0) {
29318 + /*
29319 + * in the case of error, at least update stat-data so that,
29320 + * ->i_nlink updates are not lingering.
29321 + */
29322 + reiser4_update_sd(object);
29323 + reiser4_update_sd(parent);
29324 + }
29325 +
29326 + return result;
29327 +}
29328 +
29329 +/*
29330 + * return 0 iff @coord contains a directory entry for the file with the name
29331 + * @name.
29332 + */
29333 +static int
29334 +check_item(const struct inode *dir, const coord_t * coord, const char *name)
29335 +{
29336 + item_plugin *iplug;
29337 + char buf[DE_NAME_BUF_LEN];
29338 +
29339 + iplug = item_plugin_by_coord(coord);
29340 + if (iplug == NULL) {
29341 + warning("nikita-1135", "Cannot get item plugin");
29342 + print_coord("coord", coord, 1);
29343 + return RETERR(-EIO);
29344 + } else if (item_id_by_coord(coord) !=
29345 + item_id_by_plugin(inode_dir_item_plugin(dir))) {
29346 + /* item id of current item does not match to id of items a
29347 + directory is built of */
29348 + warning("nikita-1136", "Wrong item plugin");
29349 + print_coord("coord", coord, 1);
29350 + return RETERR(-EIO);
29351 + }
29352 + assert("nikita-1137", iplug->s.dir.extract_name);
29353 +
29354 + /* Compare name stored in this entry with name we are looking for.
29355 +
29356 + NOTE-NIKITA Here should go code for support of something like
29357 + unicode, code tables, etc.
29358 + */
29359 + return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
29360 +}
29361 +
29362 +static int
29363 +check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
29364 +{
29365 + return WITH_COORD(coord, check_item(dir, coord, name->name));
29366 +}
29367 +
29368 +/*
29369 + * argument package used by entry_actor to scan entries with identical keys.
29370 + */
29371 +typedef struct entry_actor_args {
29372 + /* name we are looking for */
29373 + const char *name;
29374 + /* key of directory entry. entry_actor() scans through sequence of
29375 + * items/units having the same key */
29376 + reiser4_key *key;
29377 + /* how many entries with duplicate key was scanned so far. */
29378 + int non_uniq;
29379 +#if REISER4_USE_COLLISION_LIMIT
29380 + /* scan limit */
29381 + int max_non_uniq;
29382 +#endif
29383 + /* return parameter: set to true, if ->name wasn't found */
29384 + int not_found;
29385 + /* what type of lock to take when moving to the next node during
29386 + * scan */
29387 + znode_lock_mode mode;
29388 +
29389 + /* last coord that was visited during scan */
29390 + coord_t last_coord;
29391 + /* last node locked during scan */
29392 + lock_handle last_lh;
29393 + /* inode of directory */
29394 + const struct inode *inode;
29395 +} entry_actor_args;
29396 +
29397 +/* Function called by reiser4_find_entry() to look for given name
29398 + in the directory. */
29399 +static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
29400 + coord_t * coord /* current coord */ ,
29401 + lock_handle * lh /* current lock handle */ ,
29402 + void *entry_actor_arg /* argument to scan */ )
29403 +{
29404 + reiser4_key unit_key;
29405 + entry_actor_args *args;
29406 +
29407 + assert("nikita-1131", tree != NULL);
29408 + assert("nikita-1132", coord != NULL);
29409 + assert("nikita-1133", entry_actor_arg != NULL);
29410 +
29411 + args = entry_actor_arg;
29412 + ++args->non_uniq;
29413 +#if REISER4_USE_COLLISION_LIMIT
29414 + if (args->non_uniq > args->max_non_uniq) {
29415 + args->not_found = 1;
29416 + /* hash collision overflow. */
29417 + return RETERR(-EBUSY);
29418 + }
29419 +#endif
29420 +
29421 + /*
29422 + * did we just reach the end of the sequence of items/units with
29423 + * identical keys?
29424 + */
29425 + if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
29426 + assert("nikita-1791",
29427 + keylt(args->key, unit_key_by_coord(coord, &unit_key)));
29428 + args->not_found = 1;
29429 + args->last_coord.between = AFTER_UNIT;
29430 + return 0;
29431 + }
29432 +
29433 + coord_dup(&args->last_coord, coord);
29434 + /*
29435 + * did scan just moved to the next node?
29436 + */
29437 + if (args->last_lh.node != lh->node) {
29438 + int lock_result;
29439 +
29440 + /*
29441 + * if so, lock new node with the mode requested by the caller
29442 + */
29443 + done_lh(&args->last_lh);
29444 + assert("nikita-1896", znode_is_any_locked(lh->node));
29445 + lock_result = longterm_lock_znode(&args->last_lh, lh->node,
29446 + args->mode, ZNODE_LOCK_HIPRI);
29447 + if (lock_result != 0)
29448 + return lock_result;
29449 + }
29450 + return check_item(args->inode, coord, args->name);
29451 +}
29452 +
29453 +/* Look for given @name within directory @dir.
29454 +
29455 + This is called during lookup, creation and removal of directory
29456 + entries and on reiser4_rename_common
29457 +
29458 + First calculate key that directory entry for @name would have. Search
29459 + for this key in the tree. If such key is found, scan all items with
29460 + the same key, checking name in each directory entry along the way.
29461 +*/
29462 +int reiser4_find_entry(struct inode *dir, /* directory to scan */
29463 + struct dentry *de, /* name to search for */
29464 + lock_handle * lh, /* resulting lock handle */
29465 + znode_lock_mode mode, /* required lock mode */
29466 + reiser4_dir_entry_desc * entry /* parameters of found
29467 + directory entry */)
29468 +{
29469 + const struct qstr *name;
29470 + seal_t *seal;
29471 + coord_t *coord;
29472 + int result;
29473 + __u32 flags;
29474 + de_location *dec;
29475 + reiser4_dentry_fsdata *fsdata;
29476 +
29477 + assert("nikita-1130", lh != NULL);
29478 + assert("nikita-1128", dir != NULL);
29479 +
29480 + name = &de->d_name;
29481 + assert("nikita-1129", name != NULL);
29482 +
29483 + /* dentry private data don't require lock, because dentry
29484 + manipulations are protected by i_mutex on parent.
29485 +
29486 + This is not so for inodes, because there is no -the- parent in
29487 + inode case.
29488 + */
29489 + fsdata = reiser4_get_dentry_fsdata(de);
29490 + if (IS_ERR(fsdata))
29491 + return PTR_ERR(fsdata);
29492 + dec = &fsdata->dec;
29493 +
29494 + coord = &dec->entry_coord;
29495 + coord_clear_iplug(coord);
29496 + seal = &dec->entry_seal;
29497 + /* compose key of directory entry for @name */
29498 + inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
29499 +
29500 + if (reiser4_seal_is_set(seal)) {
29501 + /* check seal */
29502 + result = reiser4_seal_validate(seal, coord, &entry->key,
29503 + lh, mode, ZNODE_LOCK_LOPRI);
29504 + if (result == 0) {
29505 + /* key was found. Check that it is really item we are
29506 + looking for. */
29507 + result = check_entry(dir, coord, name);
29508 + if (result == 0)
29509 + return 0;
29510 + }
29511 + }
29512 + flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
29513 + /*
29514 + * find place in the tree where directory item should be located.
29515 + */
29516 + result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
29517 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
29518 + flags, NULL /*ra_info */ );
29519 + if (result == CBK_COORD_FOUND) {
29520 + entry_actor_args arg;
29521 +
29522 + /* fast path: no hash collisions */
29523 + result = check_entry(dir, coord, name);
29524 + if (result == 0) {
29525 + reiser4_seal_init(seal, coord, &entry->key);
29526 + dec->pos = 0;
29527 + } else if (result > 0) {
29528 + /* Iterate through all units with the same keys. */
29529 + arg.name = name->name;
29530 + arg.key = &entry->key;
29531 + arg.not_found = 0;
29532 + arg.non_uniq = 0;
29533 +#if REISER4_USE_COLLISION_LIMIT
29534 + arg.max_non_uniq = max_hash_collisions(dir);
29535 + assert("nikita-2851", arg.max_non_uniq > 1);
29536 +#endif
29537 + arg.mode = mode;
29538 + arg.inode = dir;
29539 + coord_init_zero(&arg.last_coord);
29540 + init_lh(&arg.last_lh);
29541 +
29542 + result = reiser4_iterate_tree
29543 + (reiser4_tree_by_inode(dir),
29544 + coord, lh,
29545 + entry_actor, &arg, mode, 1);
29546 + /* if end of the tree or extent was reached during
29547 + scanning. */
29548 + if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
29549 + /* step back */
29550 + done_lh(lh);
29551 +
29552 + result = zload(arg.last_coord.node);
29553 + if (result == 0) {
29554 + coord_clear_iplug(&arg.last_coord);
29555 + coord_dup(coord, &arg.last_coord);
29556 + move_lh(lh, &arg.last_lh);
29557 + result = RETERR(-ENOENT);
29558 + zrelse(arg.last_coord.node);
29559 + --arg.non_uniq;
29560 + }
29561 + }
29562 +
29563 + done_lh(&arg.last_lh);
29564 + if (result == 0)
29565 + reiser4_seal_init(seal, coord, &entry->key);
29566 +
29567 + if (result == 0 || result == -ENOENT) {
29568 + assert("nikita-2580", arg.non_uniq > 0);
29569 + dec->pos = arg.non_uniq - 1;
29570 + }
29571 + }
29572 + } else
29573 + dec->pos = -1;
29574 + return result;
29575 +}
29576 +
29577 +/*
29578 + Local variables:
29579 + c-indentation-style: "K&R"
29580 + mode-name: "LC"
29581 + c-basic-offset: 8
29582 + tab-width: 8
29583 + fill-column: 120
29584 + scroll-step: 1
29585 + End:
29586 +*/
29587 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.c
29588 --- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 03:00:00.000000000 +0300
29589 +++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.c 2007-05-06 14:50:43.762995722 +0400
29590 @@ -0,0 +1,655 @@
29591 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29592 +
29593 +#include "../../debug.h"
29594 +#include "../../dformat.h"
29595 +#include "../../key.h"
29596 +#include "../node/node.h"
29597 +#include "../space/space_allocator.h"
29598 +#include "disk_format40.h"
29599 +#include "../plugin.h"
29600 +#include "../../txnmgr.h"
29601 +#include "../../jnode.h"
29602 +#include "../../tree.h"
29603 +#include "../../super.h"
29604 +#include "../../wander.h"
29605 +#include "../../inode.h"
29606 +#include "../../ktxnmgrd.h"
29607 +#include "../../status_flags.h"
29608 +
29609 +#include <linux/types.h> /* for __u?? */
29610 +#include <linux/fs.h> /* for struct super_block */
29611 +#include <linux/buffer_head.h>
29612 +
29613 +/* reiser 4.0 default disk layout */
29614 +
29615 +/* Amount of free blocks needed to perform release_format40 when fs gets
29616 + mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
29617 + & tx record. */
29618 +#define RELEASE_RESERVED 4
29619 +
29620 +/* The greatest supported format40 version number */
29621 +#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
29622 +
29623 +/* This flag indicates that backup should be updated
29624 + (the update is performed by fsck) */
29625 +#define FORMAT40_UPDATE_BACKUP (1 << 31)
29626 +
29627 +/* functions to access fields of format40_disk_super_block */
29628 +static __u64 get_format40_block_count(const format40_disk_super_block * sb)
29629 +{
29630 + return le64_to_cpu(get_unaligned(&sb->block_count));
29631 +}
29632 +
29633 +static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
29634 +{
29635 + return le64_to_cpu(get_unaligned(&sb->free_blocks));
29636 +}
29637 +
29638 +static __u64 get_format40_root_block(const format40_disk_super_block * sb)
29639 +{
29640 + return le64_to_cpu(get_unaligned(&sb->root_block));
29641 +}
29642 +
29643 +static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
29644 +{
29645 + return le16_to_cpu(get_unaligned(&sb->tree_height));
29646 +}
29647 +
29648 +static __u64 get_format40_file_count(const format40_disk_super_block * sb)
29649 +{
29650 + return le64_to_cpu(get_unaligned(&sb->file_count));
29651 +}
29652 +
29653 +static __u64 get_format40_oid(const format40_disk_super_block * sb)
29654 +{
29655 + return le64_to_cpu(get_unaligned(&sb->oid));
29656 +}
29657 +
29658 +static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
29659 +{
29660 + return le32_to_cpu(get_unaligned(&sb->mkfs_id));
29661 +}
29662 +
29663 +static __u64 get_format40_flags(const format40_disk_super_block * sb)
29664 +{
29665 + return le64_to_cpu(get_unaligned(&sb->flags));
29666 +}
29667 +
29668 +static __u32 get_format40_version(const format40_disk_super_block * sb)
29669 +{
29670 + return le32_to_cpu(get_unaligned(&sb->version)) &
29671 + ~FORMAT40_UPDATE_BACKUP;
29672 +}
29673 +
29674 +static int update_backup_version(const format40_disk_super_block * sb)
29675 +{
29676 + return (le32_to_cpu(get_unaligned(&sb->version)) &
29677 + FORMAT40_UPDATE_BACKUP);
29678 +}
29679 +
29680 +static int update_disk_version(const format40_disk_super_block * sb)
29681 +{
29682 + return (get_format40_version(sb) < FORMAT40_VERSION);
29683 +}
29684 +
29685 +static int incomplete_compatibility(const format40_disk_super_block * sb)
29686 +{
29687 + return (get_format40_version(sb) > FORMAT40_VERSION);
29688 +}
29689 +
29690 +static format40_super_info *get_sb_info(struct super_block *super)
29691 +{
29692 + return &get_super_private(super)->u.format40;
29693 +}
29694 +
29695 +static int consult_diskmap(struct super_block *s)
29696 +{
29697 + format40_super_info *info;
29698 + journal_location *jloc;
29699 +
29700 + info = get_sb_info(s);
29701 + jloc = &get_super_private(s)->jloc;
29702 + /* Default format-specific locations, if there is nothing in
29703 + * diskmap */
29704 + jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
29705 + jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
29706 + info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
29707 +#ifdef CONFIG_REISER4_BADBLOCKS
29708 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
29709 + &jloc->footer);
29710 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
29711 + &jloc->header);
29712 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
29713 + &info->loc.super);
29714 +#endif
29715 + return 0;
29716 +}
29717 +
29718 +/* find any valid super block of disk_format40 (even if the first
29719 + super block is destroyed), will change block numbers of actual journal header/footer (jf/jh)
29720 + if needed */
29721 +static struct buffer_head *find_a_disk_format40_super_block(struct super_block
29722 + *s)
29723 +{
29724 + struct buffer_head *super_bh;
29725 + format40_disk_super_block *disk_sb;
29726 + format40_super_info *info;
29727 +
29728 + assert("umka-487", s != NULL);
29729 +
29730 + info = get_sb_info(s);
29731 +
29732 + super_bh = sb_bread(s, info->loc.super);
29733 + if (super_bh == NULL)
29734 + return ERR_PTR(RETERR(-EIO));
29735 +
29736 + disk_sb = (format40_disk_super_block *) super_bh->b_data;
29737 + if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
29738 + brelse(super_bh);
29739 + return ERR_PTR(RETERR(-EINVAL));
29740 + }
29741 +
29742 + reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
29743 + reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
29744 + le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
29745 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
29746 +
29747 + return super_bh;
29748 +}
29749 +
29750 +/* find the most recent version of super block. This is called after journal is
29751 + replayed */
29752 +static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
29753 +{
29754 + /* Here the most recent superblock copy has to be read. However, as
29755 + journal replay isn't complete, we are using
29756 + find_a_disk_format40_super_block() function. */
29757 + return find_a_disk_format40_super_block(s);
29758 +}
29759 +
29760 +static int get_super_jnode(struct super_block *s)
29761 +{
29762 + reiser4_super_info_data *sbinfo = get_super_private(s);
29763 + jnode *sb_jnode;
29764 + int ret;
29765 +
29766 + sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
29767 +
29768 + ret = jload(sb_jnode);
29769 +
29770 + if (ret) {
29771 + reiser4_drop_io_head(sb_jnode);
29772 + return ret;
29773 + }
29774 +
29775 + pin_jnode_data(sb_jnode);
29776 + jrelse(sb_jnode);
29777 +
29778 + sbinfo->u.format40.sb_jnode = sb_jnode;
29779 +
29780 + return 0;
29781 +}
29782 +
29783 +static void done_super_jnode(struct super_block *s)
29784 +{
29785 + jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
29786 +
29787 + if (sb_jnode) {
29788 + unpin_jnode_data(sb_jnode);
29789 + reiser4_drop_io_head(sb_jnode);
29790 + }
29791 +}
29792 +
29793 +typedef enum format40_init_stage {
29794 + NONE_DONE = 0,
29795 + CONSULT_DISKMAP,
29796 + FIND_A_SUPER,
29797 + INIT_JOURNAL_INFO,
29798 + INIT_STATUS,
29799 + JOURNAL_REPLAY,
29800 + READ_SUPER,
29801 + KEY_CHECK,
29802 + INIT_OID,
29803 + INIT_TREE,
29804 + JOURNAL_RECOVER,
29805 + INIT_SA,
29806 + INIT_JNODE,
29807 + ALL_DONE
29808 +} format40_init_stage;
29809 +
29810 +static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
29811 +{
29812 + format40_disk_super_block *sb_copy;
29813 +
29814 + sb_copy = kmalloc(sizeof(format40_disk_super_block),
29815 + reiser4_ctx_gfp_mask_get());
29816 + if (sb_copy == NULL)
29817 + return ERR_PTR(RETERR(-ENOMEM));
29818 + memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
29819 + sizeof(format40_disk_super_block));
29820 + return sb_copy;
29821 +}
29822 +
29823 +static int check_key_format(const format40_disk_super_block *sb_copy)
29824 +{
29825 + if (!equi(REISER4_LARGE_KEY,
29826 + get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
29827 + warning("nikita-3228", "Key format mismatch. "
29828 + "Only %s keys are supported.",
29829 + REISER4_LARGE_KEY ? "large" : "small");
29830 + return RETERR(-EINVAL);
29831 + }
29832 + return 0;
29833 +}
29834 +
29835 +/**
29836 + * try_init_format40
29837 + * @super:
29838 + * @stage:
29839 + *
29840 + */
29841 +static int try_init_format40(struct super_block *super,
29842 + format40_init_stage *stage)
29843 +{
29844 + int result;
29845 + struct buffer_head *super_bh;
29846 + reiser4_super_info_data *sbinfo;
29847 + format40_disk_super_block *sb_copy;
29848 + tree_level height;
29849 + reiser4_block_nr root_block;
29850 + node_plugin *nplug;
29851 +
29852 + assert("vs-475", super != NULL);
29853 + assert("vs-474", get_super_private(super));
29854 +
29855 + *stage = NONE_DONE;
29856 +
29857 + result = consult_diskmap(super);
29858 + if (result)
29859 + return result;
29860 + *stage = CONSULT_DISKMAP;
29861 +
29862 + super_bh = find_a_disk_format40_super_block(super);
29863 + if (IS_ERR(super_bh))
29864 + return PTR_ERR(super_bh);
29865 + brelse(super_bh);
29866 + *stage = FIND_A_SUPER;
29867 +
29868 + /* ok, we are sure that filesystem format is a format40 format */
29869 +
29870 + /* map jnodes for journal control blocks (header, footer) to disk */
29871 + result = reiser4_init_journal_info(super);
29872 + if (result)
29873 + return result;
29874 + *stage = INIT_JOURNAL_INFO;
29875 +
29876 + /* ok, we are sure that filesystem format is a format40 format */
29877 + /* Now check it's state */
29878 + result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
29879 + if (result != 0 && result != -EINVAL)
29880 + /* -EINVAL means there is no magic, so probably just old
29881 + * fs. */
29882 + return result;
29883 + *stage = INIT_STATUS;
29884 +
29885 + result = reiser4_status_query(NULL, NULL);
29886 + if (result == REISER4_STATUS_MOUNT_WARN)
29887 + notice("vpf-1363", "Warning: mounting %s with errors.",
29888 + super->s_id);
29889 + if (result == REISER4_STATUS_MOUNT_RO)
29890 + notice("vpf-1364", "Warning: mounting %s with fatal errors,"
29891 + " forcing read-only mount.", super->s_id);
29892 + result = reiser4_journal_replay(super);
29893 + if (result)
29894 + return result;
29895 + *stage = JOURNAL_REPLAY;
29896 +
29897 + super_bh = read_super_block(super);
29898 + if (IS_ERR(super_bh))
29899 + return PTR_ERR(super_bh);
29900 + *stage = READ_SUPER;
29901 +
29902 + /* allocate and make a copy of format40_disk_super_block */
29903 + sb_copy = copy_sb(super_bh);
29904 + brelse(super_bh);
29905 +
29906 + if (IS_ERR(sb_copy))
29907 + return PTR_ERR(sb_copy);
29908 + printk("reiser4: %s: found disk format 4.0.%u.\n",
29909 + super->s_id,
29910 + get_format40_version(sb_copy));
29911 + if (incomplete_compatibility(sb_copy))
29912 + printk("reiser4: Warning: The last completely supported "
29913 + "version of disk format40 is %u. Some objects of "
29914 + "the semantic tree can be unaccessible.\n",
29915 + FORMAT40_VERSION);
29916 + /* make sure that key format of kernel and filesystem match */
29917 + result = check_key_format(sb_copy);
29918 + if (result) {
29919 + kfree(sb_copy);
29920 + return result;
29921 + }
29922 + *stage = KEY_CHECK;
29923 +
29924 + result = oid_init_allocator(super, get_format40_file_count(sb_copy),
29925 + get_format40_oid(sb_copy));
29926 + if (result) {
29927 + kfree(sb_copy);
29928 + return result;
29929 + }
29930 + *stage = INIT_OID;
29931 +
29932 + /* get things necessary to init reiser4_tree */
29933 + root_block = get_format40_root_block(sb_copy);
29934 + height = get_format40_tree_height(sb_copy);
29935 + nplug = node_plugin_by_id(NODE40_ID);
29936 +
29937 + /* initialize reiser4_super_info_data */
29938 + sbinfo = get_super_private(super);
29939 + assert("", sbinfo->tree.super == super);
29940 + /* init reiser4_tree for the filesystem */
29941 + result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
29942 + if (result) {
29943 + kfree(sb_copy);
29944 + return result;
29945 + }
29946 + *stage = INIT_TREE;
29947 +
29948 + /*
29949 + * initialize reiser4_super_info_data with data from format40 super
29950 + * block
29951 + */
29952 + sbinfo->default_uid = 0;
29953 + sbinfo->default_gid = 0;
29954 + sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
29955 + /* number of blocks in filesystem and reserved space */
29956 + reiser4_set_block_count(super, get_format40_block_count(sb_copy));
29957 + sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
29958 + sbinfo->version = get_format40_version(sb_copy);
29959 + kfree(sb_copy);
29960 +
29961 + if (update_backup_version(sb_copy))
29962 + printk("reiser4: Warning: metadata backup is not updated. "
29963 + "Please run 'fsck.reiser4 --fix' on %s.\n",
29964 + super->s_id);
29965 +
29966 + sbinfo->fsuid = 0;
29967 + sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
29968 + * are not supported */
29969 + sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
29970 + * layout 40 are
29971 + * of one
29972 + * plugin */
29973 + /* sbinfo->tmgr is initialized already */
29974 +
29975 + /* recover sb data which were logged separately from sb block */
29976 +
29977 + /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
29978 + * oid_init_allocator() and reiser4_set_free_blocks() with new
29979 + * data. What's the reason to call them above? */
29980 + result = reiser4_journal_recover_sb_data(super);
29981 + if (result != 0)
29982 + return result;
29983 + *stage = JOURNAL_RECOVER;
29984 +
29985 + /*
29986 + * Set number of used blocks. The number of used blocks is not stored
29987 + * neither in on-disk super block nor in the journal footer blocks. At
29988 + * this moment actual values of total blocks and free block counters
29989 + * are set in the reiser4 super block (in-memory structure) and we can
29990 + * calculate number of used blocks from them.
29991 + */
29992 + reiser4_set_data_blocks(super,
29993 + reiser4_block_count(super) -
29994 + reiser4_free_blocks(super));
29995 +
29996 +#if REISER4_DEBUG
29997 + sbinfo->min_blocks_used = 16 /* reserved area */ +
29998 + 2 /* super blocks */ +
29999 + 2 /* journal footer and header */ ;
30000 +#endif
30001 +
30002 + /* init disk space allocator */
30003 + result = sa_init_allocator(reiser4_get_space_allocator(super),
30004 + super, NULL);
30005 + if (result)
30006 + return result;
30007 + *stage = INIT_SA;
30008 +
30009 + result = get_super_jnode(super);
30010 + if (result == 0)
30011 + *stage = ALL_DONE;
30012 + return result;
30013 +}
30014 +
30015 +/* plugin->u.format.get_ready */
30016 +int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
30017 +{
30018 + int result;
30019 + format40_init_stage stage;
30020 +
30021 + result = try_init_format40(s, &stage);
30022 + switch (stage) {
30023 + case ALL_DONE:
30024 + assert("nikita-3458", result == 0);
30025 + break;
30026 + case INIT_JNODE:
30027 + done_super_jnode(s);
30028 + case INIT_SA:
30029 + sa_destroy_allocator(reiser4_get_space_allocator(s), s);
30030 + case JOURNAL_RECOVER:
30031 + case INIT_TREE:
30032 + reiser4_done_tree(&get_super_private(s)->tree);
30033 + case INIT_OID:
30034 + case KEY_CHECK:
30035 + case READ_SUPER:
30036 + case JOURNAL_REPLAY:
30037 + case INIT_STATUS:
30038 + reiser4_status_finish();
30039 + case INIT_JOURNAL_INFO:
30040 + reiser4_done_journal_info(s);
30041 + case FIND_A_SUPER:
30042 + case CONSULT_DISKMAP:
30043 + case NONE_DONE:
30044 + break;
30045 + default:
30046 + impossible("nikita-3457", "init stage: %i", stage);
30047 + }
30048 +
30049 + if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
30050 + return RETERR(-ENOSPC);
30051 +
30052 + return result;
30053 +}
30054 +
30055 +static void pack_format40_super(const struct super_block *s, char *data)
30056 +{
30057 + format40_disk_super_block *super_data =
30058 + (format40_disk_super_block *) data;
30059 +
30060 + reiser4_super_info_data *sbinfo = get_super_private(s);
30061 +
30062 + assert("zam-591", data != NULL);
30063 +
30064 + put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
30065 + &super_data->free_blocks);
30066 +
30067 + put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
30068 + &super_data->root_block);
30069 +
30070 + put_unaligned(cpu_to_le64(oid_next(s)),
30071 + &super_data->oid);
30072 +
30073 + put_unaligned(cpu_to_le64(oids_used(s)),
30074 + &super_data->file_count);
30075 +
30076 + put_unaligned(cpu_to_le16(sbinfo->tree.height),
30077 + &super_data->tree_height);
30078 +
30079 + if (update_disk_version(super_data)) {
30080 + __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
30081 +
30082 + put_unaligned(cpu_to_le32(version), &super_data->version);
30083 + }
30084 +}
30085 +
30086 +/* plugin->u.format.log_super
30087 + return a jnode which should be added to transaction when the super block
30088 + gets logged */
30089 +jnode *log_super_format40(struct super_block *s)
30090 +{
30091 + jnode *sb_jnode;
30092 +
30093 + sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30094 +
30095 + jload(sb_jnode);
30096 +
30097 + pack_format40_super(s, jdata(sb_jnode));
30098 +
30099 + jrelse(sb_jnode);
30100 +
30101 + return sb_jnode;
30102 +}
30103 +
30104 +/* plugin->u.format.release */
30105 +int release_format40(struct super_block *s)
30106 +{
30107 + int ret;
30108 + reiser4_super_info_data *sbinfo;
30109 +
30110 + sbinfo = get_super_private(s);
30111 + assert("zam-579", sbinfo != NULL);
30112 +
30113 + if (!rofs_super(s)) {
30114 + ret = reiser4_capture_super_block(s);
30115 + if (ret != 0)
30116 + warning("vs-898",
30117 + "reiser4_capture_super_block failed: %d",
30118 + ret);
30119 +
30120 + ret = txnmgr_force_commit_all(s, 1);
30121 + if (ret != 0)
30122 + warning("jmacd-74438", "txn_force failed: %d", ret);
30123 +
30124 + all_grabbed2free();
30125 + }
30126 +
30127 + sa_destroy_allocator(&sbinfo->space_allocator, s);
30128 + reiser4_done_journal_info(s);
30129 + done_super_jnode(s);
30130 +
30131 + rcu_barrier();
30132 + reiser4_done_tree(&sbinfo->tree);
30133 + /* call finish_rcu(), because some znodes were "released" in
30134 + * reiser4_done_tree(). */
30135 + rcu_barrier();
30136 +
30137 + return 0;
30138 +}
30139 +
30140 +#define FORMAT40_ROOT_LOCALITY 41
30141 +#define FORMAT40_ROOT_OBJECTID 42
30142 +
30143 +/* plugin->u.format.root_dir_key */
30144 +const reiser4_key *root_dir_key_format40(const struct super_block *super
30145 + UNUSED_ARG)
30146 +{
30147 + static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
30148 + .el = {
30149 + __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
30150 +#if REISER4_LARGE_KEY
30151 + ON_LARGE_KEY(0ull,)
30152 +#endif
30153 + __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
30154 + 0ull
30155 + }
30156 + };
30157 +
30158 + return &FORMAT40_ROOT_DIR_KEY;
30159 +}
30160 +
30161 +/* plugin->u.format.check_open.
30162 + Check the opened object for validity. For now it checks for the valid oid &
30163 + locality only; can be improved later and its work may depend on the mount
30164 + options. */
30165 +int check_open_format40(const struct inode *object)
30166 +{
30167 + oid_t max, oid;
30168 +
30169 + max = oid_next(object->i_sb) - 1;
30170 +
30171 + /* Check the oid. */
30172 + oid = get_inode_oid(object);
30173 + if (oid > max) {
30174 + warning("vpf-1360", "The object with the oid %llu "
30175 + "greater then the max used oid %llu found.",
30176 + (unsigned long long)oid, (unsigned long long)max);
30177 +
30178 + return RETERR(-EIO);
30179 + }
30180 +
30181 + /* Check the locality. */
30182 + oid = reiser4_inode_data(object)->locality_id;
30183 + if (oid > max) {
30184 + warning("vpf-1361", "The object with the locality %llu "
30185 + "greater then the max used oid %llu found.",
30186 + (unsigned long long)oid, (unsigned long long)max);
30187 +
30188 + return RETERR(-EIO);
30189 + }
30190 +
30191 + return 0;
30192 +}
30193 +
30194 +/* plugin->u.format.version_update.
30195 + Perform all version update operations from the on-disk
30196 + format40_disk_super_block.version on disk to FORMAT40_VERSION.
30197 + */
30198 +int version_update_format40(struct super_block *super) {
30199 + txn_handle * trans;
30200 + lock_handle lh;
30201 + txn_atom *atom;
30202 + int ret;
30203 +
30204 + /* Nothing to do if RO mount or the on-disk version is not less. */
30205 + if (super->s_flags & MS_RDONLY)
30206 + return 0;
30207 +
30208 + if (get_super_private(super)->version >= FORMAT40_VERSION)
30209 + return 0;
30210 +
30211 + printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
30212 + "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
30213 + "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
30214 +
30215 + /* Mark the uber znode dirty to call log_super on write_logs. */
30216 + init_lh(&lh);
30217 + ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
30218 + ZNODE_LOCK_HIPRI, &lh);
30219 + if (ret != 0)
30220 + return ret;
30221 +
30222 + znode_make_dirty(lh.node);
30223 + done_lh(&lh);
30224 +
30225 + /* Update the backup blocks. */
30226 +
30227 + /* Force write_logs immediately. */
30228 + trans = get_current_context()->trans;
30229 + atom = get_current_atom_locked();
30230 + assert("vpf-1906", atom != NULL);
30231 +
30232 + spin_lock_txnh(trans);
30233 + return force_commit_atom(trans);
30234 +}
30235 +
30236 +/* Make Linus happy.
30237 + Local variables:
30238 + c-indentation-style: "K&R"
30239 + mode-name: "LC"
30240 + c-basic-offset: 8
30241 + tab-width: 8
30242 + fill-column: 120
30243 + scroll-step: 1
30244 + End:
30245 +*/
30246 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.h
30247 --- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 03:00:00.000000000 +0300
30248 +++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.h 2007-05-06 14:50:43.762995722 +0400
30249 @@ -0,0 +1,109 @@
30250 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30251 +
30252 +/* this file contains:
30253 + - definition of ondisk super block of standard disk layout for
30254 + reiser 4.0 (layout 40)
30255 + - definition of layout 40 specific portion of in-core super block
30256 + - declarations of functions implementing methods of layout plugin
30257 + for layout 40
30258 + - declarations of functions used to get/set fields in layout 40 super block
30259 +*/
30260 +
30261 +#ifndef __DISK_FORMAT40_H__
30262 +#define __DISK_FORMAT40_H__
30263 +
30264 +/* magic for default reiser4 layout */
30265 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
30266 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
30267 +
30268 +#include "../../dformat.h"
30269 +
30270 +#include <linux/fs.h> /* for struct super_block */
30271 +
30272 +typedef enum {
30273 + FORMAT40_LARGE_KEYS
30274 +} format40_flags;
30275 +
30276 +/* ondisk super block for format 40. It is 512 bytes long */
30277 +typedef struct format40_disk_super_block {
30278 + /* 0 */ d64 block_count;
30279 + /* number of blocks in a filesystem */
30280 + /* 8 */ d64 free_blocks;
30281 + /* number of free blocks */
30282 + /* 16 */ d64 root_block;
30283 + /* filesystem tree root block */
30284 + /* 24 */ d64 oid;
30285 + /* smallest free objectid */
30286 + /* 32 */ d64 file_count;
30287 + /* number of files in a filesystem */
30288 + /* 40 */ d64 flushes;
30289 + /* number of times super block was
30290 + flushed. Needed if format 40
30291 + will have few super blocks */
30292 + /* 48 */ d32 mkfs_id;
30293 + /* unique identifier of fs */
30294 + /* 52 */ char magic[16];
30295 + /* magic string ReIsEr40FoRmAt */
30296 + /* 68 */ d16 tree_height;
30297 + /* height of filesystem tree */
30298 + /* 70 */ d16 formatting_policy;
30299 + /* not used anymore */
30300 + /* 72 */ d64 flags;
30301 + /* 80 */ d32 version;
30302 + /* on-disk format version number
30303 + initially assigned by mkfs as the greatest format40
30304 + version number supported by reiser4progs and updated
30305 + in mount time in accordance with the greatest format40
30306 + version number supported by kernel.
30307 + Is used by fsck to catch possible corruption and
30308 + for various compatibility issues */
30309 + /* 84 */ char not_used[428];
30310 +} format40_disk_super_block;
30311 +
30312 +/* format 40 specific part of reiser4_super_info_data */
30313 +typedef struct format40_super_info {
30314 +/* format40_disk_super_block actual_sb; */
30315 + jnode *sb_jnode;
30316 + struct {
30317 + reiser4_block_nr super;
30318 + } loc;
30319 +} format40_super_info;
30320 +
30321 +/* Defines for journal header and footer respectively. */
30322 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
30323 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
30324 +
30325 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
30326 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
30327 +
30328 +#define FORMAT40_STATUS_BLOCKNR \
30329 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
30330 +
30331 +/* Diskmap declarations */
30332 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
30333 +#define FORMAT40_SUPER 1
30334 +#define FORMAT40_JH 2
30335 +#define FORMAT40_JF 3
30336 +
30337 +/* declarations of functions implementing methods of layout plugin for
30338 + format 40. The functions themselves are in disk_format40.c */
30339 +extern int init_format_format40(struct super_block *, void *data);
30340 +extern const reiser4_key *root_dir_key_format40(const struct super_block *);
30341 +extern int release_format40(struct super_block *s);
30342 +extern jnode *log_super_format40(struct super_block *s);
30343 +extern int check_open_format40(const struct inode *object);
30344 +extern int version_update_format40(struct super_block *super);
30345 +
30346 +/* __DISK_FORMAT40_H__ */
30347 +#endif
30348 +
30349 +/* Make Linus happy.
30350 + Local variables:
30351 + c-indentation-style: "K&R"
30352 + mode-name: "LC"
30353 + c-basic-offset: 8
30354 + tab-width: 8
30355 + fill-column: 120
30356 + scroll-step: 1
30357 + End:
30358 +*/
30359 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.c
30360 --- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 03:00:00.000000000 +0300
30361 +++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.c 2007-05-06 14:50:43.762995722 +0400
30362 @@ -0,0 +1,38 @@
30363 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30364 +
30365 +#include "../../debug.h"
30366 +#include "../plugin_header.h"
30367 +#include "disk_format40.h"
30368 +#include "disk_format.h"
30369 +#include "../plugin.h"
30370 +
30371 +/* initialization of disk layout plugins */
30372 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
30373 + [FORMAT40_ID] = {
30374 + .h = {
30375 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
30376 + .id = FORMAT40_ID,
30377 + .pops = NULL,
30378 + .label = "reiser40",
30379 + .desc = "standard disk layout for reiser40",
30380 + .linkage = {NULL, NULL}
30381 + },
30382 + .init_format = init_format_format40,
30383 + .root_dir_key = root_dir_key_format40,
30384 + .release = release_format40,
30385 + .log_super = log_super_format40,
30386 + .check_open = check_open_format40,
30387 + .version_update = version_update_format40
30388 + }
30389 +};
30390 +
30391 +/* Make Linus happy.
30392 + Local variables:
30393 + c-indentation-style: "K&R"
30394 + mode-name: "LC"
30395 + c-basic-offset: 8
30396 + tab-width: 8
30397 + fill-column: 120
30398 + scroll-step: 1
30399 + End:
30400 +*/
30401 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.h
30402 --- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 03:00:00.000000000 +0300
30403 +++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.h 2007-05-06 14:50:43.762995722 +0400
30404 @@ -0,0 +1,27 @@
30405 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30406 +
30407 +/* identifiers for disk layouts, they are also used as indexes in array of disk
30408 + plugins */
30409 +
30410 +#if !defined( __REISER4_DISK_FORMAT_H__ )
30411 +#define __REISER4_DISK_FORMAT_H__
30412 +
30413 +typedef enum {
30414 + /* standard reiser4 disk layout plugin id */
30415 + FORMAT40_ID,
30416 + LAST_FORMAT_ID
30417 +} disk_format_id;
30418 +
30419 +/* __REISER4_DISK_FORMAT_H__ */
30420 +#endif
30421 +
30422 +/* Make Linus happy.
30423 + Local variables:
30424 + c-indentation-style: "K&R"
30425 + mode-name: "LC"
30426 + c-basic-offset: 8
30427 + tab-width: 8
30428 + fill-column: 120
30429 + scroll-step: 1
30430 + End:
30431 +*/
30432 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.20/fs/reiser4/plugin/disk_format/Makefile
30433 --- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 03:00:00.000000000 +0300
30434 +++ linux-2.6.20/fs/reiser4/plugin/disk_format/Makefile 2007-05-06 14:50:43.762995722 +0400
30435 @@ -0,0 +1,5 @@
30436 +obj-$(CONFIG_REISER4_FS) += df_plugins.o
30437 +
30438 +df_plugins-objs := \
30439 + disk_format40.o \
30440 + disk_format.o
30441 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/fibration.c linux-2.6.20/fs/reiser4/plugin/fibration.c
30442 --- linux-2.6.20.orig/fs/reiser4/plugin/fibration.c 1970-01-01 03:00:00.000000000 +0300
30443 +++ linux-2.6.20/fs/reiser4/plugin/fibration.c 2007-05-06 14:50:43.762995722 +0400
30444 @@ -0,0 +1,175 @@
30445 +/* Copyright 2004 by Hans Reiser, licensing governed by
30446 + * reiser4/README */
30447 +
30448 +/* Directory fibrations */
30449 +
30450 +/*
30451 + * Suppose we have a directory tree with sources of some project. During
30452 + * compilation .o files are created within this tree. This makes access
30453 + * to the original source files less efficient, because source files are
30454 + * now "diluted" by object files: default directory plugin uses prefix
30455 + * of a file name as a part of the key for directory entry (and this
30456 + * part is also inherited by the key of file body). This means that
30457 + * foo.o will be located close to foo.c and foo.h in the tree.
30458 + *
30459 + * To avoid this effect directory plugin fill highest 7 (unused
30460 + * originally) bits of the second component of the directory entry key
30461 + * by bit-pattern depending on the file name (see
30462 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
30463 + * "fibre". Fibre of the file name key is inherited by key of stat data
30464 + * and keys of file body (in the case of REISER4_LARGE_KEY).
30465 + *
30466 + * Fibre for a given file is chosen by per-directory fibration
30467 + * plugin. Names within given fibre are ordered lexicographically.
30468 + */
30469 +
30470 +#include "../debug.h"
30471 +#include "plugin_header.h"
30472 +#include "plugin.h"
30473 +#include "../super.h"
30474 +#include "../inode.h"
30475 +
30476 +#include <linux/types.h>
30477 +
30478 +static const int fibre_shift = 57;
30479 +
30480 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
30481 +
30482 +/*
30483 + * Trivial fibration: all files of directory are just ordered
30484 + * lexicographically.
30485 + */
30486 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
30487 +{
30488 + return FIBRE_NO(0);
30489 +}
30490 +
30491 +/*
30492 + * dot-o fibration: place .o files after all others.
30493 + */
30494 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
30495 +{
30496 + /* special treatment for .*\.o */
30497 + if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
30498 + return FIBRE_NO(1);
30499 + else
30500 + return FIBRE_NO(0);
30501 +}
30502 +
30503 +/*
30504 + * ext.1 fibration: subdivide directory into 128 fibrations one for each
30505 + * 7bit extension character (file "foo.h" goes into fibre "h"), plus
30506 + * default fibre for the rest.
30507 + */
30508 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
30509 +{
30510 + if (len > 2 && name[len - 2] == '.')
30511 + return FIBRE_NO(name[len - 1]);
30512 + else
30513 + return FIBRE_NO(0);
30514 +}
30515 +
30516 +/*
30517 + * ext.3 fibration: try to separate files with different 3-character
30518 + * extensions from each other.
30519 + */
30520 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
30521 +{
30522 + if (len > 4 && name[len - 4] == '.')
30523 + return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
30524 + else
30525 + return FIBRE_NO(0);
30526 +}
30527 +
30528 +static int change_fibration(struct inode *inode,
30529 + reiser4_plugin * plugin,
30530 + pset_member memb)
30531 +{
30532 + int result;
30533 +
30534 + assert("nikita-3503", inode != NULL);
30535 + assert("nikita-3504", plugin != NULL);
30536 +
30537 + assert("nikita-3505", is_reiser4_inode(inode));
30538 + assert("nikita-3506", inode_dir_plugin(inode) != NULL);
30539 + assert("nikita-3507",
30540 + plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
30541 +
30542 + result = 0;
30543 + if (inode_fibration_plugin(inode) == NULL ||
30544 + inode_fibration_plugin(inode)->h.id != plugin->h.id) {
30545 + if (is_dir_empty(inode) == 0)
30546 + result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
30547 + PSET_FIBRATION, plugin);
30548 + else
30549 + result = RETERR(-ENOTEMPTY);
30550 +
30551 + }
30552 + return result;
30553 +}
30554 +
30555 +static reiser4_plugin_ops fibration_plugin_ops = {
30556 + .init = NULL,
30557 + .load = NULL,
30558 + .save_len = NULL,
30559 + .save = NULL,
30560 + .change = change_fibration
30561 +};
30562 +
30563 +/* fibration plugins */
30564 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
30565 + [FIBRATION_LEXICOGRAPHIC] = {
30566 + .h = {
30567 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30568 + .id = FIBRATION_LEXICOGRAPHIC,
30569 + .pops = &fibration_plugin_ops,
30570 + .label = "lexicographic",
30571 + .desc = "no fibration",
30572 + .linkage = {NULL, NULL}
30573 + },
30574 + .fibre = fibre_trivial
30575 + },
30576 + [FIBRATION_DOT_O] = {
30577 + .h = {
30578 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30579 + .id = FIBRATION_DOT_O,
30580 + .pops = &fibration_plugin_ops,
30581 + .label = "dot-o",
30582 + .desc = "fibrate .o files separately",
30583 + .linkage = {NULL, NULL}
30584 + },
30585 + .fibre = fibre_dot_o
30586 + },
30587 + [FIBRATION_EXT_1] = {
30588 + .h = {
30589 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30590 + .id = FIBRATION_EXT_1,
30591 + .pops = &fibration_plugin_ops,
30592 + .label = "ext-1",
30593 + .desc = "fibrate file by single character extension",
30594 + .linkage = {NULL, NULL}
30595 + },
30596 + .fibre = fibre_ext_1
30597 + },
30598 + [FIBRATION_EXT_3] = {
30599 + .h = {
30600 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30601 + .id = FIBRATION_EXT_3,
30602 + .pops = &fibration_plugin_ops,
30603 + .label = "ext-3",
30604 + .desc = "fibrate file by three character extension",
30605 + .linkage = {NULL, NULL}
30606 + },
30607 + .fibre = fibre_ext_3
30608 + }
30609 +};
30610 +
30611 +/*
30612 + * Local variables:
30613 + * c-indentation-style: "K&R"
30614 + * mode-name: "LC"
30615 + * c-basic-offset: 8
30616 + * tab-width: 8
30617 + * fill-column: 79
30618 + * End:
30619 + */
30620 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/fibration.h linux-2.6.20/fs/reiser4/plugin/fibration.h
30621 --- linux-2.6.20.orig/fs/reiser4/plugin/fibration.h 1970-01-01 03:00:00.000000000 +0300
30622 +++ linux-2.6.20/fs/reiser4/plugin/fibration.h 2007-05-06 14:50:43.762995722 +0400
30623 @@ -0,0 +1,37 @@
30624 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
30625 +
30626 +/* Fibration plugin used by hashed directory plugin to segment content
30627 + * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
30628 +
30629 +#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
30630 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
30631 +
30632 +#include "plugin_header.h"
30633 +
30634 +typedef struct fibration_plugin {
30635 + /* generic fields */
30636 + plugin_header h;
30637 +
30638 + __u64(*fibre) (const struct inode * dir, const char *name, int len);
30639 +} fibration_plugin;
30640 +
30641 +typedef enum {
30642 + FIBRATION_LEXICOGRAPHIC,
30643 + FIBRATION_DOT_O,
30644 + FIBRATION_EXT_1,
30645 + FIBRATION_EXT_3,
30646 + LAST_FIBRATION_ID
30647 +} reiser4_fibration_id;
30648 +
30649 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
30650 +#endif
30651 +
30652 +/* Make Linus happy.
30653 + Local variables:
30654 + c-indentation-style: "K&R"
30655 + mode-name: "LC"
30656 + c-basic-offset: 8
30657 + tab-width: 8
30658 + fill-column: 120
30659 + End:
30660 +*/
30661 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.c
30662 --- linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 03:00:00.000000000 +0300
30663 +++ linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.c 2007-05-06 14:50:43.770998222 +0400
30664 @@ -0,0 +1,3760 @@
30665 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
30666 + reiser4/README */
30667 +
30668 +/* This file contains implementations of inode/file/address_space/file plugin
30669 + * operations specific for cryptcompress file plugin which manages files with
30670 + * compressed and encrypted bodies. "Cryptcompress file" is built of items of
30671 + * CTAIL_ID (see http://www.namesys.com/cryptcompress_design.html for details).
30672 + */
30673 +
30674 +#include "../../inode.h"
30675 +#include "../cluster.h"
30676 +#include "../object.h"
30677 +#include "../../tree_walk.h"
30678 +#include "cryptcompress.h"
30679 +
30680 +#include <asm/scatterlist.h>
30681 +#include <linux/pagevec.h>
30682 +#include <asm/uaccess.h>
30683 +#include <linux/swap.h>
30684 +#include <linux/writeback.h>
30685 +#include <linux/random.h>
30686 +
30687 +/* get cryptcompress specific portion of inode */
30688 +cryptcompress_info_t *cryptcompress_inode_data(const struct inode *inode)
30689 +{
30690 + return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
30691 +}
30692 +
30693 +/* plugin->u.file.init_inode_data */
30694 +void init_inode_data_cryptcompress(struct inode *inode,
30695 + reiser4_object_create_data * crd,
30696 + int create)
30697 +{
30698 + cryptcompress_info_t *data;
30699 +
30700 + data = cryptcompress_inode_data(inode);
30701 + assert("edward-685", data != NULL);
30702 +
30703 + memset(data, 0, sizeof(*data));
30704 +
30705 + turn_on_compression(data);
30706 + set_lattice_factor(data, MIN_LATTICE_FACTOR);
30707 + init_inode_ordering(inode, crd, create);
30708 +}
30709 +
30710 +#if REISER4_DEBUG
30711 +int cryptcompress_inode_ok(struct inode *inode)
30712 +{
30713 + if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
30714 + return 0;
30715 + if (!cluster_shift_ok(inode_cluster_shift(inode)))
30716 + return 0;
30717 + return 1;
30718 +}
30719 +#endif
30720 +
30721 +/* The following is a part of reiser4 cipher key manager
30722 + which is called when opening/creating a cryptcompress file */
30723 +
30724 +/* get/set cipher key info */
30725 +crypto_stat_t * inode_crypto_stat (struct inode * inode)
30726 +{
30727 + assert("edward-90", inode != NULL);
30728 + assert("edward-91", reiser4_inode_data(inode) != NULL);
30729 + return cryptcompress_inode_data(inode)->crypt;
30730 +}
30731 +
30732 +static void set_inode_crypto_stat (struct inode * inode, crypto_stat_t * stat)
30733 +{
30734 + cryptcompress_inode_data(inode)->crypt = stat;
30735 +}
30736 +
30737 +/* allocate a cipher key info */
30738 +crypto_stat_t * reiser4_alloc_crypto_stat (struct inode * inode)
30739 +{
30740 + crypto_stat_t * info;
30741 + int fipsize;
30742 +
30743 + info = kmalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
30744 + if (!info)
30745 + return ERR_PTR(-ENOMEM);
30746 + memset(info, 0, sizeof (*info));
30747 + fipsize = inode_digest_plugin(inode)->fipsize;
30748 + info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
30749 + if (!info->keyid) {
30750 + kfree(info);
30751 + return ERR_PTR(-ENOMEM);
30752 + }
30753 + info->host = inode;
30754 + return info;
30755 +}
30756 +
30757 +#if 0
30758 +/* allocate/free low-level info for cipher and digest
30759 + transforms */
30760 +static int alloc_crypto_tfms(crypto_stat_t * info)
30761 +{
30762 + struct crypto_blkcipher * ctfm = NULL;
30763 + struct crypto_hash * dtfm = NULL;
30764 + cipher_plugin * cplug = inode_cipher_plugin(info->host);
30765 + digest_plugin * dplug = inode_digest_plugin(info->host);
30766 +
30767 + if (cplug->alloc) {
30768 + ctfm = cplug->alloc();
30769 + if (IS_ERR(ctfm)) {
30770 + warning("edward-1364",
30771 + "Can not allocate info for %s\n",
30772 + cplug->h.desc);
30773 + return RETERR(PTR_ERR(ctfm));
30774 + }
30775 + }
30776 + info_set_cipher(info, ctfm);
30777 + if (dplug->alloc) {
30778 + dtfm = dplug->alloc();
30779 + if (IS_ERR(dtfm)) {
30780 + warning("edward-1365",
30781 + "Can not allocate info for %s\n",
30782 + dplug->h.desc);
30783 + goto unhappy_with_digest;
30784 + }
30785 + }
30786 + info_set_digest(info, dtfm);
30787 + return 0;
30788 + unhappy_with_digest:
30789 + if (cplug->free) {
30790 + cplug->free(ctfm);
30791 + info_set_cipher(info, NULL);
30792 + }
30793 + return RETERR(PTR_ERR(dtfm));
30794 +}
30795 +#endif
30796 +
30797 +static void
30798 +free_crypto_tfms(crypto_stat_t * info)
30799 +{
30800 + assert("edward-1366", info != NULL);
30801 + if (!info_get_cipher(info)) {
30802 + assert("edward-1601", !info_get_digest(info));
30803 + return;
30804 + }
30805 + inode_cipher_plugin(info->host)->free(info_get_cipher(info));
30806 + info_set_cipher(info, NULL);
30807 + inode_digest_plugin(info->host)->free(info_get_digest(info));
30808 + info_set_digest(info, NULL);
30809 + return;
30810 +}
30811 +
30812 +#if 0
30813 +/* create a key fingerprint for disk stat-data */
30814 +static int create_keyid (crypto_stat_t * info, crypto_data_t * data)
30815 +{
30816 + int ret = -ENOMEM;
30817 + size_t blk, pad;
30818 + __u8 * dmem;
30819 + __u8 * cmem;
30820 + struct hash_desc ddesc;
30821 + struct blkcipher_desc cdesc;
30822 + struct scatterlist sg;
30823 +
30824 + assert("edward-1367", info != NULL);
30825 + assert("edward-1368", info->keyid != NULL);
30826 +
30827 + ddesc.tfm = info_get_digest(info);
30828 + ddesc.flags = 0;
30829 + cdesc.tfm = info_get_cipher(info);
30830 + cdesc.flags = 0;
30831 +
30832 + dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
30833 + reiser4_ctx_gfp_mask_get());
30834 + if (!dmem)
30835 + goto exit1;
30836 +
30837 + blk = crypto_blkcipher_blocksize(cdesc.tfm);
30838 +
30839 + pad = data->keyid_size % blk;
30840 + pad = (pad ? blk - pad : 0);
30841 +
30842 + cmem = kmalloc((size_t)data->keyid_size + pad,
30843 + reiser4_ctx_gfp_mask_get());
30844 + if (!cmem)
30845 + goto exit2;
30846 + memcpy(cmem, data->keyid, data->keyid_size);
30847 + memset(cmem + data->keyid_size, 0, pad);
30848 +
30849 + sg.page = virt_to_page(cmem);
30850 + sg.offset = offset_in_page(cmem);
30851 + sg.length = data->keyid_size + pad;
30852 +
30853 + ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
30854 + data->keyid_size + pad);
30855 + if (ret) {
30856 + warning("edward-1369",
30857 + "encryption failed flags=%x\n", cdesc.flags);
30858 + goto exit3;
30859 + }
30860 + ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
30861 + if (ret) {
30862 + warning("edward-1602",
30863 + "digest failed flags=%x\n", ddesc.flags);
30864 + goto exit3;
30865 + }
30866 + memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
30867 + exit3:
30868 + kfree(cmem);
30869 + exit2:
30870 + kfree(dmem);
30871 + exit1:
30872 + return ret;
30873 +}
30874 +#endif
30875 +
30876 +static void destroy_keyid(crypto_stat_t * info)
30877 +{
30878 + assert("edward-1370", info != NULL);
30879 + assert("edward-1371", info->keyid != NULL);
30880 + kfree(info->keyid);
30881 + return;
30882 +}
30883 +
30884 +static void __free_crypto_stat (struct inode * inode)
30885 +{
30886 + crypto_stat_t * info = inode_crypto_stat(inode);
30887 + assert("edward-1372", info != NULL);
30888 +
30889 + free_crypto_tfms(info);
30890 + destroy_keyid(info);
30891 + kfree(info);
30892 +}
30893 +
30894 +#if 0
30895 +static void instantiate_crypto_stat(crypto_stat_t * info)
30896 +{
30897 + assert("edward-1373", info != NULL);
30898 + assert("edward-1374", info->inst == 0);
30899 + info->inst = 1;
30900 +}
30901 +#endif
30902 +
30903 +static void uninstantiate_crypto_stat(crypto_stat_t * info)
30904 +{
30905 + assert("edward-1375", info != NULL);
30906 + info->inst = 0;
30907 +}
30908 +
30909 +static int crypto_stat_instantiated(crypto_stat_t * info)
30910 +{
30911 + return info->inst;
30912 +}
30913 +
30914 +static int inode_has_cipher_key(struct inode * inode)
30915 +{
30916 + assert("edward-1376", inode != NULL);
30917 + return inode_crypto_stat(inode) &&
30918 + crypto_stat_instantiated(inode_crypto_stat(inode));
30919 +}
30920 +
30921 +static void free_crypto_stat (struct inode * inode)
30922 +{
30923 + uninstantiate_crypto_stat(inode_crypto_stat(inode));
30924 + __free_crypto_stat(inode);
30925 +}
30926 +
30927 +static int need_cipher(struct inode * inode)
30928 +{
30929 + return inode_cipher_plugin(inode) !=
30930 + cipher_plugin_by_id(NONE_CIPHER_ID);
30931 +}
30932 +
30933 +/* Create a crypto-stat and attach result to the @object.
30934 + If success is returned, then low-level cipher info contains
30935 + an instantiated key */
30936 +#if 0
30937 +crypto_stat_t *
30938 +create_crypto_stat(struct inode * object,
30939 + crypto_data_t * data /* this contains a (uninstantiated)
30940 + cipher key imported from user
30941 + space */)
30942 +{
30943 + int ret;
30944 + crypto_stat_t * info;
30945 +
30946 + assert("edward-1377", data != NULL);
30947 + assert("edward-1378", need_cipher(object));
30948 +
30949 + if (inode_file_plugin(object) !=
30950 + file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
30951 + return ERR_PTR(-EINVAL);
30952 +
30953 + info = reiser4_alloc_crypto_stat(object);
30954 + if (IS_ERR(info))
30955 + return info;
30956 + ret = alloc_crypto_tfms(info);
30957 + if (ret)
30958 + goto err;
30959 + /* instantiating a key */
30960 + ret = crypto_blkcipher_setkey(info_get_cipher(info),
30961 + data->key,
30962 + data->keysize);
30963 + if (ret) {
30964 + warning("edward-1379",
30965 + "setkey failed flags=%x\n",
30966 + crypto_blkcipher_get_flags(info_get_cipher(info)));
30967 + goto err;
30968 + }
30969 + info->keysize = data->keysize;
30970 + ret = create_keyid(info, data);
30971 + if (ret)
30972 + goto err;
30973 + instantiate_crypto_stat(info);
30974 + return info;
30975 + err:
30976 + __free_crypto_stat(object);
30977 + return ERR_PTR(ret);
30978 +}
30979 +#endif
30980 +
30981 +/* increment/decrement a load counter when
30982 + attaching/detaching the crypto-stat to any object */
30983 +static void load_crypto_stat(crypto_stat_t * info)
30984 +{
30985 + assert("edward-1380", info != NULL);
30986 + inc_keyload_count(info);
30987 +}
30988 +
30989 +static void unload_crypto_stat(struct inode * inode)
30990 +{
30991 + crypto_stat_t * info = inode_crypto_stat(inode);
30992 + assert("edward-1381", info->keyload_count > 0);
30993 +
30994 + dec_keyload_count(inode_crypto_stat(inode));
30995 + if (info->keyload_count == 0)
30996 + /* final release */
30997 + free_crypto_stat(inode);
30998 +}
30999 +
31000 +/* attach/detach an existing crypto-stat */
31001 +void reiser4_attach_crypto_stat(struct inode * inode, crypto_stat_t * info)
31002 +{
31003 + assert("edward-1382", inode != NULL);
31004 + assert("edward-1383", info != NULL);
31005 + assert("edward-1384", inode_crypto_stat(inode) == NULL);
31006 +
31007 + set_inode_crypto_stat(inode, info);
31008 + load_crypto_stat(info);
31009 +}
31010 +
31011 +/* returns true, if crypto stat can be attached to the @host */
31012 +#if REISER4_DEBUG
31013 +static int host_allows_crypto_stat(struct inode * host)
31014 +{
31015 + int ret;
31016 + file_plugin * fplug = inode_file_plugin(host);
31017 +
31018 + switch (fplug->h.id) {
31019 + case CRYPTCOMPRESS_FILE_PLUGIN_ID:
31020 + ret = 1;
31021 + break;
31022 + default:
31023 + ret = 0;
31024 + }
31025 + return ret;
31026 +}
31027 +#endif /* REISER4_DEBUG */
31028 +
31029 +static void reiser4_detach_crypto_stat(struct inode * inode)
31030 +{
31031 + assert("edward-1385", inode != NULL);
31032 + assert("edward-1386", host_allows_crypto_stat(inode));
31033 +
31034 + if (inode_crypto_stat(inode))
31035 + unload_crypto_stat(inode);
31036 + set_inode_crypto_stat(inode, NULL);
31037 +}
31038 +
31039 +#if 0
31040 +
31041 +/* compare fingerprints of @child and @parent */
31042 +static int keyid_eq(crypto_stat_t * child, crypto_stat_t * parent)
31043 +{
31044 + return !memcmp(child->keyid, parent->keyid, info_digest_plugin(parent)->fipsize);
31045 +}
31046 +
31047 +/* check if a crypto-stat (which is bound to @parent) can be inherited */
31048 +int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
31049 +{
31050 + if (!need_cipher(child))
31051 + return 0;
31052 + /* the child is created */
31053 + if (!inode_crypto_stat(child))
31054 + return 1;
31055 + /* the child is looked up */
31056 + if (!inode_crypto_stat(parent))
31057 + return 0;
31058 + return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
31059 + inode_digest_plugin(child) == inode_digest_plugin(parent) &&
31060 + inode_crypto_stat(child)->keysize == inode_crypto_stat(parent)->keysize &&
31061 + keyid_eq(inode_crypto_stat(child), inode_crypto_stat(parent)));
31062 +}
31063 +#endif
31064 +
31065 +/* helper functions for ->create() method of the cryptcompress plugin */
31066 +static int inode_set_crypto(struct inode * object)
31067 +{
31068 + reiser4_inode * info;
31069 + if (!inode_crypto_stat(object)) {
31070 + if (need_cipher(object))
31071 + return RETERR(-EINVAL);
31072 + /* the file is not to be encrypted */
31073 + return 0;
31074 + }
31075 + info = reiser4_inode_data(object);
31076 + info->extmask |= (1 << CRYPTO_STAT);
31077 + return 0;
31078 +}
31079 +
31080 +static int inode_init_compression(struct inode * object)
31081 +{
31082 + int result = 0;
31083 + assert("edward-1461", object != NULL);
31084 + if (inode_compression_plugin(object)->init)
31085 + result = inode_compression_plugin(object)->init();
31086 + return result;
31087 +}
31088 +
31089 +static int inode_check_cluster(struct inode * object)
31090 +{
31091 + assert("edward-696", object != NULL);
31092 +
31093 + if (inode_cluster_size(object) < PAGE_CACHE_SIZE) {
31094 + warning("edward-1320", "Can not support '%s' "
31095 + "logical clusters (less then page size)",
31096 + inode_cluster_plugin(object)->h.label);
31097 + return RETERR(-EINVAL);
31098 + }
31099 + return 0;
31100 +}
31101 +
31102 +/* ->destroy_inode() method of the cryptcompress plugin */
31103 +void destroy_inode_cryptcompress(struct inode * inode)
31104 +{
31105 + assert("edward-23", cryptcompress_inode_data(inode)->pgcount == 0);
31106 + reiser4_detach_crypto_stat(inode);
31107 + return;
31108 +}
31109 +
31110 +/* ->create() method of the cryptcompress plugin
31111 +
31112 +. install plugins
31113 +. attach crypto info if specified
31114 +. attach compression info if specified
31115 +. attach cluster info
31116 +*/
31117 +int
31118 +create_cryptcompress(struct inode *object, struct inode *parent,
31119 + reiser4_object_create_data * data)
31120 +{
31121 + int result;
31122 + reiser4_inode *info;
31123 +
31124 + assert("edward-23", object != NULL);
31125 + assert("edward-24", parent != NULL);
31126 + assert("edward-30", data != NULL);
31127 + assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
31128 + assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
31129 +
31130 + info = reiser4_inode_data(object);
31131 +
31132 + assert("edward-29", info != NULL);
31133 +
31134 + /* set file bit */
31135 + info->plugin_mask |= (1 << PSET_FILE);
31136 +
31137 + /* set crypto */
31138 + result = inode_set_crypto(object);
31139 + if (result)
31140 + goto error;
31141 + /* set compression */
31142 + result = inode_init_compression(object);
31143 + if (result)
31144 + goto error;
31145 + /* set cluster */
31146 + result = inode_check_cluster(object);
31147 + if (result)
31148 + goto error;
31149 +
31150 + /* save everything in disk stat-data */
31151 + result = write_sd_by_inode_common(object);
31152 + if (!result)
31153 + return 0;
31154 + error:
31155 + reiser4_detach_crypto_stat(object);
31156 + return result;
31157 +}
31158 +
31159 +/* ->open() method of the cryptcompress plugin */
31160 +int open_object_cryptcompress(struct inode * inode, struct file * file)
31161 +{
31162 + int result;
31163 + struct inode * parent;
31164 +
31165 + assert("edward-1394", inode != NULL);
31166 + assert("edward-1395", file != NULL);
31167 + assert("edward-1396", file != NULL);
31168 + assert("edward-1397", file->f_dentry->d_inode == inode);
31169 + assert("edward-1398", file->f_dentry->d_parent != NULL);
31170 + assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL);
31171 + assert("edward-698",
31172 + inode_file_plugin(inode) ==
31173 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
31174 + result = inode_check_cluster(inode);
31175 + if (result)
31176 + return result;
31177 + result = inode_init_compression(inode);
31178 + if (result)
31179 + return result;
31180 + if (!need_cipher(inode))
31181 + /* the file is not to be ciphered */
31182 + return 0;
31183 + parent = file->f_dentry->d_parent->d_inode;
31184 + if (!inode_has_cipher_key(inode))
31185 + return RETERR(-EINVAL);
31186 + return 0;
31187 +}
31188 +
31189 +/* returns a blocksize, the attribute of a cipher algorithm */
31190 +static unsigned int
31191 +cipher_blocksize(struct inode * inode)
31192 +{
31193 + assert("edward-758", need_cipher(inode));
31194 + assert("edward-1400", inode_crypto_stat(inode) != NULL);
31195 + return crypto_blkcipher_blocksize
31196 + (info_get_cipher(inode_crypto_stat(inode)));
31197 +}
31198 +
31199 +/* returns offset translated by scale factor of the crypto-algorithm */
31200 +static loff_t inode_scaled_offset (struct inode * inode,
31201 + const loff_t src_off /* input offset */)
31202 +{
31203 + assert("edward-97", inode != NULL);
31204 +
31205 + if (!need_cipher(inode) ||
31206 + src_off == get_key_offset(reiser4_min_key()) ||
31207 + src_off == get_key_offset(reiser4_max_key()))
31208 + return src_off;
31209 +
31210 + return inode_cipher_plugin(inode)->scale(inode,
31211 + cipher_blocksize(inode),
31212 + src_off);
31213 +}
31214 +
31215 +/* returns disk cluster size */
31216 +size_t inode_scaled_cluster_size(struct inode * inode)
31217 +{
31218 + assert("edward-110", inode != NULL);
31219 +
31220 + return inode_scaled_offset(inode, inode_cluster_size(inode));
31221 +}
31222 +
31223 +static int new_cluster(reiser4_cluster_t * clust, struct inode *inode)
31224 +{
31225 + return (clust_to_off(clust->index, inode) >= inode->i_size);
31226 +}
31227 +
31228 +/* set number of cluster pages */
31229 +static void set_cluster_nrpages(reiser4_cluster_t * clust, struct inode *inode)
31230 +{
31231 + reiser4_slide_t *win;
31232 +
31233 + assert("edward-180", clust != NULL);
31234 + assert("edward-1040", inode != NULL);
31235 +
31236 + win = clust->win;
31237 + if (!win) {
31238 + /* NOTE-EDWARD: i_size should be protected */
31239 + clust->nr_pages =
31240 + count_to_nrpages(fsize_to_count(clust, inode));
31241 + return;
31242 + }
31243 + assert("edward-1176", clust->op != PCL_UNKNOWN);
31244 + assert("edward-1064", win->off + win->count + win->delta != 0);
31245 +
31246 + if (win->stat == HOLE_WINDOW &&
31247 + win->off == 0 && win->count == inode_cluster_size(inode)) {
31248 + /* special case: we start write hole from fake cluster */
31249 + clust->nr_pages = 0;
31250 + return;
31251 + }
31252 + clust->nr_pages =
31253 + count_to_nrpages(max_count(win->off + win->count + win->delta,
31254 + fsize_to_count(clust, inode)));
31255 + return;
31256 +}
31257 +
31258 +/* ->key_by_inode() method of the cryptcompress plugin */
31259 +/* see plugin/plugin.h for details */
31260 +int
31261 +key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key)
31262 +{
31263 + loff_t clust_off;
31264 +
31265 + assert("edward-64", inode != 0);
31266 + // assert("edward-112", ergo(off != get_key_offset(reiser4_max_key()), !off_to_cloff(off, inode)));
31267 + /* don't come here with other offsets */
31268 +
31269 + clust_off =
31270 + (off ==
31271 + get_key_offset(reiser4_max_key())? get_key_offset(reiser4_max_key()) :
31272 + off_to_clust_to_off(off, inode));
31273 +
31274 + key_by_inode_and_offset_common(inode, 0, key);
31275 + set_key_offset(key,
31276 + (__u64) (!inode_crypto_stat(inode) ? clust_off :
31277 + inode_scaled_offset(inode, clust_off)));
31278 + return 0;
31279 +}
31280 +
31281 +/* plugin->flow_by_inode */
31282 +int
31283 +flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ ,
31284 + const char __user *buf /* user level buffer */ ,
31285 + int user /* 1 if @buf is of user space, 0 - if it is
31286 + kernel space */ ,
31287 + loff_t size /* buffer size */ ,
31288 + loff_t off /* offset to start io from */ ,
31289 + rw_op op /* READ or WRITE */ ,
31290 + flow_t * f /* resulting flow */ )
31291 +{
31292 + assert("edward-436", f != NULL);
31293 + assert("edward-149", inode != NULL);
31294 + assert("edward-150", inode_file_plugin(inode) != NULL);
31295 +
31296 + f->length = size;
31297 + memcpy(&f->data, &buf, sizeof(buf));
31298 + f->user = user;
31299 + f->op = op;
31300 +
31301 + if (op == WRITE_OP && user == 1)
31302 + return 0;
31303 + return key_by_inode_cryptcompress(inode, off, &f->key);
31304 +}
31305 +
31306 +static int
31307 +cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
31308 + znode_lock_mode lock_mode)
31309 +{
31310 + coord_t *coord;
31311 +
31312 + assert("edward-704", hint != NULL);
31313 + assert("edward-1089", !hint_is_valid(hint));
31314 + assert("edward-706", hint->lh.owner == NULL);
31315 +
31316 + coord = &hint->ext_coord.coord;
31317 +
31318 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
31319 + /* hint either not set or set by different operation */
31320 + return RETERR(-E_REPEAT);
31321 +
31322 + if (get_key_offset(key) != hint->offset)
31323 + /* hint is set for different key */
31324 + return RETERR(-E_REPEAT);
31325 +
31326 + assert("edward-707", reiser4_schedulable());
31327 +
31328 + return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
31329 + key, &hint->lh, lock_mode,
31330 + ZNODE_LOCK_LOPRI);
31331 +}
31332 +
31333 +/* reserve disk space when writing a logical cluster */
31334 +static int reserve4cluster(struct inode *inode, reiser4_cluster_t *clust)
31335 +{
31336 + int result = 0;
31337 +
31338 + assert("edward-965", reiser4_schedulable());
31339 + assert("edward-439", inode != NULL);
31340 + assert("edward-440", clust != NULL);
31341 + assert("edward-441", clust->pages != NULL);
31342 +
31343 + if (clust->nr_pages == 0) {
31344 + assert("edward-1152", clust->win != NULL);
31345 + assert("edward-1153", clust->win->stat == HOLE_WINDOW);
31346 + /* don't reserve space for fake disk clusteer */
31347 + return 0;
31348 + }
31349 + assert("edward-442", jprivate(clust->pages[0]) != NULL);
31350 +
31351 + result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
31352 + estimate_update_cluster(inode),
31353 + BA_CAN_COMMIT);
31354 + if (result)
31355 + return result;
31356 + clust->reserved = 1;
31357 + grabbed2cluster_reserved(estimate_insert_cluster(inode) +
31358 + estimate_update_cluster(inode));
31359 +#if REISER4_DEBUG
31360 + clust->reserved_prepped = estimate_update_cluster(inode);
31361 + clust->reserved_unprepped = estimate_insert_cluster(inode);
31362 +#endif
31363 + /* there can be space grabbed by txnmgr_force_commit_all */
31364 + return 0;
31365 +}
31366 +
31367 +/* free reserved disk space if writing a logical cluster fails */
31368 +static void
31369 +free_reserved4cluster(struct inode *inode, reiser4_cluster_t * clust, int count)
31370 +{
31371 + assert("edward-967", clust->reserved == 1);
31372 +
31373 + cluster_reserved2free(count);
31374 + clust->reserved = 0;
31375 +}
31376 +
31377 +/* The core search procedure of the cryptcompress plugin.
31378 + If returned value is not cbk_errored, then current znode is locked */
31379 +static int find_cluster_item(hint_t * hint,
31380 + const reiser4_key * key, /* key of the item we are
31381 + looking for */
31382 + znode_lock_mode lock_mode /* which lock */ ,
31383 + ra_info_t * ra_info, lookup_bias bias, __u32 flags)
31384 +{
31385 + int result;
31386 + reiser4_key ikey;
31387 + int went_right = 0;
31388 + coord_t *coord = &hint->ext_coord.coord;
31389 + coord_t orig = *coord;
31390 +
31391 + assert("edward-152", hint != NULL);
31392 +
31393 + if (!hint_is_valid(hint)) {
31394 + result = cryptcompress_hint_validate(hint, key, lock_mode);
31395 + if (result == -E_REPEAT)
31396 + goto traverse_tree;
31397 + else if (result) {
31398 + assert("edward-1216", 0);
31399 + return result;
31400 + }
31401 + hint_set_valid(hint);
31402 + }
31403 + assert("edward-709", znode_is_any_locked(coord->node));
31404 +
31405 + /* In-place lookup is going here, it means we just need to
31406 + check if next item of the @coord match to the @keyhint) */
31407 +
31408 + if (equal_to_rdk(coord->node, key)) {
31409 + result = goto_right_neighbor(coord, &hint->lh);
31410 + if (result == -E_NO_NEIGHBOR) {
31411 + assert("edward-1217", 0);
31412 + return RETERR(-EIO);
31413 + }
31414 + if (result)
31415 + return result;
31416 + assert("edward-1218", equal_to_ldk(coord->node, key));
31417 + went_right = 1;
31418 + } else {
31419 + coord->item_pos++;
31420 + coord->unit_pos = 0;
31421 + coord->between = AT_UNIT;
31422 + }
31423 + result = zload(coord->node);
31424 + if (result)
31425 + return result;
31426 + assert("edward-1219", !node_is_empty(coord->node));
31427 +
31428 + if (!coord_is_existing_item(coord)) {
31429 + zrelse(coord->node);
31430 + goto not_found;
31431 + }
31432 + item_key_by_coord(coord, &ikey);
31433 + zrelse(coord->node);
31434 + if (!keyeq(key, &ikey))
31435 + goto not_found;
31436 + /* Ok, item is found, update node counts */
31437 + if (went_right)
31438 + dclust_inc_extension_ncount(hint);
31439 + return CBK_COORD_FOUND;
31440 +
31441 + not_found:
31442 + assert("edward-1220", coord->item_pos > 0);
31443 + //coord->item_pos--;
31444 + /* roll back */
31445 + *coord = orig;
31446 + ON_DEBUG(coord_update_v(coord));
31447 + return CBK_COORD_NOTFOUND;
31448 +
31449 + traverse_tree:
31450 + assert("edward-713", hint->lh.owner == NULL);
31451 + assert("edward-714", reiser4_schedulable());
31452 +
31453 + reiser4_unset_hint(hint);
31454 + dclust_init_extension(hint);
31455 + coord_init_zero(coord);
31456 + result = coord_by_key(current_tree, key, coord, &hint->lh,
31457 + lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
31458 + CBK_UNIQUE | flags, ra_info);
31459 + if (cbk_errored(result))
31460 + return result;
31461 + if(result == CBK_COORD_FOUND)
31462 + dclust_inc_extension_ncount(hint);
31463 + hint_set_valid(hint);
31464 + return result;
31465 +}
31466 +
31467 +/* This function is called by deflate[inflate] manager when
31468 + creating a transformed/plain stream to check if we should
31469 + create/cut some overhead. If this returns true, then @oh
31470 + contains the size of this overhead.
31471 + */
31472 +static int
31473 +need_cut_or_align(struct inode * inode, reiser4_cluster_t * clust,
31474 + rw_op rw, int * oh)
31475 +{
31476 + tfm_cluster_t * tc = &clust->tc;
31477 + switch (rw) {
31478 + case WRITE_OP: /* estimate align */
31479 + *oh = tc->len % cipher_blocksize(inode);
31480 + if (*oh != 0)
31481 + return 1;
31482 + break;
31483 + case READ_OP: /* estimate cut */
31484 + *oh = *(tfm_output_data(clust) + tc->len - 1);
31485 + break;
31486 + default:
31487 + impossible("edward-1401", "bad option");
31488 + }
31489 + return (tc->len != tc->lsize);
31490 +}
31491 +
31492 +/* create/cut an overhead of transformed/plain stream */
31493 +static void
31494 +align_or_cut_overhead(struct inode * inode, reiser4_cluster_t * clust, rw_op rw)
31495 +{
31496 + int oh;
31497 + cipher_plugin * cplug = inode_cipher_plugin(inode);
31498 +
31499 + assert("edward-1402", need_cipher(inode));
31500 +
31501 + if (!need_cut_or_align(inode, clust, rw, &oh))
31502 + return;
31503 + switch (rw) {
31504 + case WRITE_OP: /* do align */
31505 + clust->tc.len +=
31506 + cplug->align_stream(tfm_input_data(clust) +
31507 + clust->tc.len, clust->tc.len,
31508 + cipher_blocksize(inode));
31509 + *(tfm_input_data(clust) + clust->tc.len - 1) =
31510 + cipher_blocksize(inode) - oh;
31511 + break;
31512 + case READ_OP: /* do cut */
31513 + assert("edward-1403", oh <= cipher_blocksize(inode));
31514 + clust->tc.len -= oh;
31515 + break;
31516 + default:
31517 + impossible("edward-1404", "bad option");
31518 + }
31519 + return;
31520 +}
31521 +
31522 +/* the following two functions are to evaluate results
31523 + of compression transform */
31524 +static unsigned
31525 +max_cipher_overhead(struct inode * inode)
31526 +{
31527 + if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
31528 + return 0;
31529 + return cipher_blocksize(inode);
31530 +}
31531 +
31532 +static int deflate_overhead(struct inode *inode)
31533 +{
31534 + return (inode_compression_plugin(inode)->
31535 + checksum ? DC_CHECKSUM_SIZE : 0);
31536 +}
31537 +
31538 +static unsigned deflate_overrun(struct inode * inode, int ilen)
31539 +{
31540 + return coa_overrun(inode_compression_plugin(inode), ilen);
31541 +}
31542 +
31543 +/* Estimating compressibility of a logical cluster by various
31544 + policies represented by compression mode plugin.
31545 + If this returns false, then compressor won't be called for
31546 + the cluster of index @index.
31547 +*/
31548 +static int should_compress(tfm_cluster_t * tc, cloff_t index,
31549 + struct inode *inode)
31550 +{
31551 + compression_plugin *cplug = inode_compression_plugin(inode);
31552 + compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
31553 +
31554 + assert("edward-1321", tc->len != 0);
31555 + assert("edward-1322", cplug != NULL);
31556 + assert("edward-1323", mplug != NULL);
31557 +
31558 + return /* estimate by size */
31559 + (cplug->min_size_deflate ?
31560 + tc->len >= cplug->min_size_deflate() :
31561 + 1) &&
31562 + /* estimate by compression mode plugin */
31563 + (mplug->should_deflate ?
31564 + mplug->should_deflate(inode, index) :
31565 + 1);
31566 +}
31567 +
31568 +/* Evaluating results of compression transform.
31569 + Returns true, if we need to accept this results */
31570 +static int
31571 +save_compressed(int size_before, int size_after, struct inode * inode)
31572 +{
31573 + return (size_after + deflate_overhead(inode) +
31574 + max_cipher_overhead(inode) < size_before);
31575 +}
31576 +
31577 +/* Guess result of the evaluation above */
31578 +static int
31579 +need_inflate(reiser4_cluster_t * clust, struct inode *inode,
31580 + int encrypted /* is cluster encrypted */ )
31581 +{
31582 + tfm_cluster_t *tc = &clust->tc;
31583 +
31584 + assert("edward-142", tc != 0);
31585 + assert("edward-143", inode != NULL);
31586 +
31587 + return tc->len <
31588 + (encrypted ?
31589 + inode_scaled_offset(inode, tc->lsize) :
31590 + tc->lsize);
31591 +}
31592 +
31593 +/* If results of compression were accepted, then we add
31594 + a checksum to catch possible disk cluster corruption.
31595 + The following is a format of the data stored in disk clusters:
31596 +
31597 + data This is (transformed) logical cluster.
31598 + cipher_overhead This is created by ->align() method
31599 + of cipher plugin. May be absent.
31600 + checksum (4) This is created by ->checksum method
31601 + of compression plugin to check
31602 + integrity. May be absent.
31603 +
31604 + Crypto overhead format:
31605 +
31606 + data
31607 + control_byte (1) contains aligned overhead size:
31608 + 1 <= overhead <= cipher_blksize
31609 +*/
31610 +/* Append a checksum at the end of a transformed stream */
31611 +static void dc_set_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
31612 +{
31613 + __u32 checksum;
31614 +
31615 + assert("edward-1309", tc != NULL);
31616 + assert("edward-1310", tc->len > 0);
31617 + assert("edward-1311", cplug->checksum != NULL);
31618 +
31619 + checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
31620 + put_unaligned(cpu_to_le32(checksum),
31621 + (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
31622 + tc->len += (int)DC_CHECKSUM_SIZE;
31623 +}
31624 +
31625 +/* Check a disk cluster checksum.
31626 + Returns 0 if checksum is correct, otherwise returns 1 */
31627 +static int dc_check_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
31628 +{
31629 + assert("edward-1312", tc != NULL);
31630 + assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
31631 + assert("edward-1314", cplug->checksum != NULL);
31632 +
31633 + if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
31634 + tc->len - (int)DC_CHECKSUM_SIZE) !=
31635 + le32_to_cpu(get_unaligned((d32 *)
31636 + (tfm_stream_data(tc, INPUT_STREAM)
31637 + + tc->len - (int)DC_CHECKSUM_SIZE)))) {
31638 + warning("edward-156",
31639 + "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
31640 + (int)le32_to_cpu
31641 + (get_unaligned((d32 *)
31642 + (tfm_stream_data(tc, INPUT_STREAM) +
31643 + tc->len - (int)DC_CHECKSUM_SIZE))),
31644 + (int)cplug->checksum
31645 + (tfm_stream_data(tc, INPUT_STREAM),
31646 + tc->len - (int)DC_CHECKSUM_SIZE));
31647 + return 1;
31648 + }
31649 + tc->len -= (int)DC_CHECKSUM_SIZE;
31650 + return 0;
31651 +}
31652 +
31653 +/* get input/output stream for some transform action */
31654 +int grab_tfm_stream(struct inode * inode, tfm_cluster_t * tc,
31655 + tfm_stream_id id)
31656 +{
31657 + size_t size = inode_scaled_cluster_size(inode);
31658 +
31659 + assert("edward-901", tc != NULL);
31660 + assert("edward-1027", inode_compression_plugin(inode) != NULL);
31661 +
31662 + if (cluster_get_tfm_act(tc) == TFMA_WRITE)
31663 + size += deflate_overrun(inode, inode_cluster_size(inode));
31664 +
31665 + if (!tfm_stream(tc, id) && id == INPUT_STREAM)
31666 + alternate_streams(tc);
31667 + if (!tfm_stream(tc, id))
31668 + return alloc_tfm_stream(tc, size, id);
31669 +
31670 + assert("edward-902", tfm_stream_is_set(tc, id));
31671 +
31672 + if (tfm_stream_size(tc, id) < size)
31673 + return realloc_tfm_stream(tc, size, id);
31674 + return 0;
31675 +}
31676 +
31677 +/* Common deflate manager */
31678 +int reiser4_deflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
31679 +{
31680 + int result = 0;
31681 + int compressed = 0;
31682 + int encrypted = 0;
31683 + tfm_cluster_t * tc = &clust->tc;
31684 + compression_plugin * coplug;
31685 +
31686 + assert("edward-401", inode != NULL);
31687 + assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
31688 + assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
31689 + assert("edward-498", !tfm_cluster_is_uptodate(tc));
31690 +
31691 + coplug = inode_compression_plugin(inode);
31692 + if (should_compress(tc, clust->index, inode)) {
31693 + /* try to compress, discard bad results */
31694 + __u32 dst_len;
31695 + compression_mode_plugin * mplug =
31696 + inode_compression_mode_plugin(inode);
31697 + assert("edward-602", coplug != NULL);
31698 + assert("edward-1423", coplug->compress != NULL);
31699 +
31700 + result = grab_coa(tc, coplug);
31701 + if (result) {
31702 + warning("edward-1424",
31703 + "alloc_coa failed with ret=%d, skipped compression",
31704 + result);
31705 + goto cipher;
31706 + }
31707 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31708 + if (result) {
31709 + warning("edward-1425",
31710 + "alloc stream failed with ret=%d, skipped compression",
31711 + result);
31712 + goto cipher;
31713 + }
31714 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
31715 + coplug->compress(get_coa(tc, coplug->h.id, tc->act),
31716 + tfm_input_data(clust), tc->len,
31717 + tfm_output_data(clust), &dst_len);
31718 + /* make sure we didn't overwrite extra bytes */
31719 + assert("edward-603",
31720 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
31721 +
31722 + /* evaluate results of compression transform */
31723 + if (save_compressed(tc->len, dst_len, inode)) {
31724 + /* good result, accept */
31725 + tc->len = dst_len;
31726 + if (mplug->accept_hook != NULL) {
31727 + result = mplug->accept_hook(inode, clust->index);
31728 + if (result)
31729 + warning("edward-1426",
31730 + "accept_hook failed with ret=%d",
31731 + result);
31732 + }
31733 + compressed = 1;
31734 + }
31735 + else {
31736 + /* bad result, discard */
31737 +#if REISER4_DEBUG
31738 + if (cluster_is_complete(clust, inode))
31739 + warning("edward-1338",
31740 + "incompressible cluster %lu (inode %llu)",
31741 + clust->index,
31742 + (unsigned long long)get_inode_oid(inode));
31743 +#endif
31744 + if (mplug->discard_hook != NULL &&
31745 + cluster_is_complete(clust, inode)) {
31746 + result = mplug->discard_hook(inode,
31747 + clust->index);
31748 + if (result)
31749 + warning("edward-1427",
31750 + "discard_hook failed with ret=%d",
31751 + result);
31752 + }
31753 + }
31754 + }
31755 + cipher:
31756 + if (need_cipher(inode)) {
31757 + cipher_plugin * ciplug;
31758 + struct blkcipher_desc desc;
31759 + struct scatterlist src;
31760 + struct scatterlist dst;
31761 +
31762 + ciplug = inode_cipher_plugin(inode);
31763 + desc.tfm = info_get_cipher(inode_crypto_stat(inode));
31764 + desc.flags = 0;
31765 + if (compressed)
31766 + alternate_streams(tc);
31767 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31768 + if (result)
31769 + return result;
31770 +
31771 + align_or_cut_overhead(inode, clust, WRITE_OP);
31772 + src.page = virt_to_page(tfm_input_data(clust));
31773 + src.offset = offset_in_page(tfm_input_data(clust));
31774 + src.length = tc->len;
31775 +
31776 + dst.page = virt_to_page(tfm_output_data(clust));
31777 + dst.offset = offset_in_page(tfm_output_data(clust));
31778 + dst.length = tc->len;
31779 +
31780 + result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
31781 + if (result) {
31782 + warning("edward-1405",
31783 + "encryption failed flags=%x\n", desc.flags);
31784 + return result;
31785 + }
31786 + encrypted = 1;
31787 + }
31788 + if (compressed && coplug->checksum != NULL)
31789 + dc_set_checksum(coplug, tc);
31790 + if (!compressed && !encrypted)
31791 + alternate_streams(tc);
31792 + return result;
31793 +}
31794 +
31795 +/* Common inflate manager. */
31796 +int reiser4_inflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
31797 +{
31798 + int result = 0;
31799 + int transformed = 0;
31800 + tfm_cluster_t * tc = &clust->tc;
31801 + compression_plugin * coplug;
31802 +
31803 + assert("edward-905", inode != NULL);
31804 + assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
31805 + assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
31806 + assert("edward-1349", tc->act == TFMA_READ);
31807 + assert("edward-907", !tfm_cluster_is_uptodate(tc));
31808 +
31809 + /* Handle a checksum (if any) */
31810 + coplug = inode_compression_plugin(inode);
31811 + if (need_inflate(clust, inode, need_cipher(inode)) &&
31812 + coplug->checksum != NULL) {
31813 + result = dc_check_checksum(coplug, tc);
31814 + if (unlikely(result)) {
31815 + warning("edward-1460",
31816 + "Inode %llu: disk cluster %lu looks corrupted",
31817 + (unsigned long long)get_inode_oid(inode),
31818 + clust->index);
31819 + return RETERR(-EIO);
31820 + }
31821 + }
31822 + if (need_cipher(inode)) {
31823 + cipher_plugin * ciplug;
31824 + struct blkcipher_desc desc;
31825 + struct scatterlist src;
31826 + struct scatterlist dst;
31827 +
31828 + ciplug = inode_cipher_plugin(inode);
31829 + desc.tfm = info_get_cipher(inode_crypto_stat(inode));
31830 + desc.flags = 0;
31831 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31832 + if (result)
31833 + return result;
31834 + assert("edward-909", tfm_cluster_is_set(tc));
31835 +
31836 + src.page = virt_to_page(tfm_input_data(clust));
31837 + src.offset = offset_in_page(tfm_input_data(clust));
31838 + src.length = tc->len;
31839 +
31840 + dst.page = virt_to_page(tfm_output_data(clust));
31841 + dst.offset = offset_in_page(tfm_output_data(clust));
31842 + dst.length = tc->len;
31843 +
31844 + result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
31845 + if (result) {
31846 + warning("edward-1600", "decrypt failed flags=%x\n",
31847 + desc.flags);
31848 + return result;
31849 + }
31850 + align_or_cut_overhead(inode, clust, READ_OP);
31851 + transformed = 1;
31852 + }
31853 + if (need_inflate(clust, inode, 0)) {
31854 + unsigned dst_len = inode_cluster_size(inode);
31855 + if(transformed)
31856 + alternate_streams(tc);
31857 +
31858 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31859 + if (result)
31860 + return result;
31861 + assert("edward-1305", coplug->decompress != NULL);
31862 + assert("edward-910", tfm_cluster_is_set(tc));
31863 +
31864 + coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
31865 + tfm_input_data(clust), tc->len,
31866 + tfm_output_data(clust), &dst_len);
31867 + /* check length */
31868 + tc->len = dst_len;
31869 + assert("edward-157", dst_len == tc->lsize);
31870 + transformed = 1;
31871 + }
31872 + if (!transformed)
31873 + alternate_streams(tc);
31874 + return result;
31875 +}
31876 +
31877 +/* This is implementation of readpage method of struct
31878 + address_space_operations for cryptcompress plugin. */
31879 +int readpage_cryptcompress(struct file *file, struct page *page)
31880 +{
31881 + reiser4_context *ctx;
31882 + reiser4_cluster_t clust;
31883 + item_plugin *iplug;
31884 + int result;
31885 +
31886 + assert("edward-88", PageLocked(page));
31887 + assert("vs-976", !PageUptodate(page));
31888 + assert("edward-89", page->mapping && page->mapping->host);
31889 +
31890 + ctx = reiser4_init_context(page->mapping->host->i_sb);
31891 + if (IS_ERR(ctx)) {
31892 + unlock_page(page);
31893 + return PTR_ERR(ctx);
31894 + }
31895 + assert("edward-113",
31896 + ergo(file != NULL,
31897 + page->mapping == file->f_dentry->d_inode->i_mapping));
31898 +
31899 + if (PageUptodate(page)) {
31900 + warning("edward-1338", "page is already uptodate\n");
31901 + unlock_page(page);
31902 + reiser4_exit_context(ctx);
31903 + return 0;
31904 + }
31905 + cluster_init_read(&clust, NULL);
31906 + clust.file = file;
31907 + iplug = item_plugin_by_id(CTAIL_ID);
31908 + if (!iplug->s.file.readpage) {
31909 + unlock_page(page);
31910 + put_cluster_handle(&clust);
31911 + reiser4_exit_context(ctx);
31912 + return -EINVAL;
31913 + }
31914 + result = iplug->s.file.readpage(&clust, page);
31915 +
31916 + assert("edward-1459", !PageLocked(page));
31917 + assert("edward-64", ergo(result == 0, PageUptodate(page)));
31918 + put_cluster_handle(&clust);
31919 + reiser4_exit_context(ctx);
31920 + return result;
31921 +}
31922 +
31923 +/* how much pages will be captured */
31924 +static int cluster_nrpages_to_capture(reiser4_cluster_t * clust)
31925 +{
31926 + switch (clust->op) {
31927 + case PCL_APPEND:
31928 + return clust->nr_pages;
31929 + case PCL_TRUNCATE:
31930 + assert("edward-1179", clust->win != NULL);
31931 + return count_to_nrpages(clust->win->off + clust->win->count);
31932 + default:
31933 + impossible("edward-1180", "bad page cluster option");
31934 + return 0;
31935 + }
31936 +}
31937 +
31938 +static void set_cluster_pages_dirty(reiser4_cluster_t * clust)
31939 +{
31940 + int i;
31941 + struct page *pg;
31942 + int nrpages = cluster_nrpages_to_capture(clust);
31943 +
31944 + for (i = 0; i < nrpages; i++) {
31945 +
31946 + pg = clust->pages[i];
31947 + assert("edward-968", pg != NULL);
31948 + lock_page(pg);
31949 + assert("edward-1065", PageUptodate(pg));
31950 + reiser4_set_page_dirty_internal(pg);
31951 + unlock_page(pg);
31952 + mark_page_accessed(pg);
31953 + }
31954 +}
31955 +
31956 +static void clear_cluster_pages_dirty(reiser4_cluster_t * clust)
31957 +{
31958 + int i;
31959 + assert("edward-1275", clust != NULL);
31960 +
31961 + for (i = 0; i < clust->nr_pages; i++) {
31962 + assert("edward-1276", clust->pages[i] != NULL);
31963 +
31964 + lock_page(clust->pages[i]);
31965 + if (PageDirty(clust->pages[i])) {
31966 + assert("edward-1277", PageUptodate(clust->pages[i]));
31967 + cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
31968 + }
31969 +#if REISER4_DEBUG
31970 + else
31971 + /* Race between flush and write:
31972 + some pages became clean when write() (or another
31973 + process which modifies data) capture the cluster. */
31974 + warning("edward-985", "Page of index %lu (inode %llu)"
31975 + " is not dirty\n", clust->pages[i]->index,
31976 + (unsigned long long)get_inode_oid(clust->
31977 + pages[i]->
31978 + mapping->
31979 + host));
31980 +#endif
31981 + unlock_page(clust->pages[i]);
31982 + }
31983 +}
31984 +
31985 +/* update i_size by window */
31986 +static void inode_set_new_size(reiser4_cluster_t * clust, struct inode *inode)
31987 +{
31988 + loff_t size;
31989 + reiser4_slide_t *win;
31990 +
31991 + assert("edward-1181", clust != NULL);
31992 + assert("edward-1182", inode != NULL);
31993 +
31994 + win = clust->win;
31995 + assert("edward-1183", win != NULL);
31996 + assert("edward-1183", win->count != 0);
31997 +
31998 + size = clust_to_off(clust->index, inode) + win->off;
31999 +
32000 + switch (clust->op) {
32001 + case PCL_APPEND:
32002 + if (size + win->count <= inode->i_size)
32003 + /* overwrite only */
32004 + return;
32005 + size += win->count;
32006 + break;
32007 + case PCL_TRUNCATE:
32008 + break;
32009 + default:
32010 + impossible("edward-1184", "bad page cluster option");
32011 + break;
32012 + }
32013 + inode_check_scale_nolock(inode, inode->i_size, size);
32014 + inode->i_size = size;
32015 + return;
32016 +}
32017 +
32018 +/* Check in page cluster modifications.
32019 + . Make jnode dirty, if it wasn't;
32020 + . Reserve space for a disk cluster update by flush algorithm, if needed;
32021 + . Clean up old references (if any).
32022 + . Put pages (grabbed in this thread) which will be truncated
32023 +*/
32024 +static void
32025 +make_cluster_jnode_dirty_locked(reiser4_cluster_t * clust, jnode * node,
32026 + loff_t * old_isize, struct inode *inode)
32027 +{
32028 + int i;
32029 + int old_nrpages;
32030 + int new_nrpages = cluster_nrpages_to_capture(clust);
32031 +
32032 + assert("edward-973", new_nrpages > 0);
32033 + assert("edward-221", node != NULL);
32034 + assert("edward-971", clust->reserved == 1);
32035 + assert_spin_locked(&(node->guard));
32036 + assert("edward-972", node->page_count <= cluster_nrpages(inode));
32037 + assert("edward-1263",
32038 + clust->reserved_prepped == estimate_update_cluster(inode));
32039 + assert("edward-1264", clust->reserved_unprepped == 0);
32040 +
32041 + if (JF_ISSET(node, JNODE_DIRTY)) {
32042 + /* someone has modified this cluster, but
32043 + the modifications are not committed yet */
32044 + old_nrpages =
32045 + count_to_nrpages(cnt_to_clcnt(*old_isize,
32046 + clust->index, inode));
32047 + /* free space which is already reserved */
32048 + free_reserved4cluster(inode, clust,
32049 + estimate_update_cluster(inode));
32050 + /* put old references */
32051 + for (i = 0; i < old_nrpages; i++) {
32052 + assert("edward-975", clust->pages[i]);
32053 + assert("edward-1185", PageUptodate(clust->pages[i]));
32054 +
32055 + page_cache_release(clust->pages[i]);
32056 +#if REISER4_DEBUG
32057 + cryptcompress_inode_data(inode)->pgcount --;
32058 +#endif
32059 + }
32060 + } else {
32061 + /* no captured pages */
32062 + assert("edward-1043", node->page_count == 0);
32063 + jnode_make_dirty_locked(node);
32064 + clust->reserved = 0;
32065 + }
32066 + /* put pages that will be truncated (if any) */
32067 + for (i = new_nrpages; i < clust->nr_pages; i++) {
32068 + assert("edward-1433", clust->pages[i]);
32069 + assert("edward-1434", PageUptodate(clust->pages[i]));
32070 + page_cache_release(clust->pages[i]);
32071 +#if REISER4_DEBUG
32072 + cryptcompress_inode_data(inode)->pgcount --;
32073 +#endif
32074 + }
32075 +#if REISER4_DEBUG
32076 + clust->reserved_prepped -= estimate_update_cluster(inode);
32077 + node->page_count = new_nrpages;
32078 +#endif
32079 + return;
32080 +}
32081 +
32082 +/* This function spawns a transaction and
32083 + is called by any thread as a final step in page cluster modification.
32084 +*/
32085 +static int try_capture_cluster(reiser4_cluster_t * clust, struct inode *inode)
32086 +{
32087 + int result = 0;
32088 + loff_t old_size;
32089 + jnode *node;
32090 +
32091 + assert("edward-1029", clust != NULL);
32092 + assert("edward-1030", clust->reserved == 1);
32093 + assert("edward-1031", clust->nr_pages != 0);
32094 + assert("edward-1032", clust->pages != NULL);
32095 + assert("edward-1033", clust->pages[0] != NULL);
32096 +
32097 + node = jprivate(clust->pages[0]);
32098 + assert("edward-1035", node != NULL);
32099 + assert("edward-1446", jnode_is_cluster_page(node));
32100 +
32101 + spin_lock_jnode(node);
32102 +
32103 + old_size = inode->i_size;
32104 + if (clust->win)
32105 + inode_set_new_size(clust, inode);
32106 +
32107 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
32108 + if (result)
32109 + goto exit;
32110 + make_cluster_jnode_dirty_locked(clust, node, &old_size, inode);
32111 + exit:
32112 + spin_unlock_jnode(node);
32113 + jput(node);
32114 + return result;
32115 +}
32116 +
32117 +/* Collect unlocked cluster pages for any modifications and attach a jnode.
32118 + We allocate only one jnode per cluster, this jnode is binded to the first
32119 + page of this cluster, so we have an extra-reference that will exist with
32120 + this jnode, other references will be cleaned up in flush time.
32121 +*/
32122 +static int
32123 +grab_cluster_pages_jnode(struct inode *inode, reiser4_cluster_t * clust)
32124 +{
32125 + int i;
32126 + int result = 0;
32127 + jnode *node = NULL;
32128 +
32129 + assert("edward-182", clust != NULL);
32130 + assert("edward-183", clust->pages != NULL);
32131 + assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
32132 +
32133 + if (clust->nr_pages == 0)
32134 + return 0;
32135 +
32136 + for (i = 0; i < clust->nr_pages; i++) {
32137 +
32138 + assert("edward-1044", clust->pages[i] == NULL);
32139 +
32140 + clust->pages[i] =
32141 + find_or_create_page(inode->i_mapping,
32142 + clust_to_pg(clust->index, inode) + i,
32143 + reiser4_ctx_gfp_mask_get());
32144 + if (!clust->pages[i]) {
32145 + result = RETERR(-ENOMEM);
32146 + break;
32147 + }
32148 + if (i == 0) {
32149 + node = jnode_of_page(clust->pages[i]);
32150 + if (IS_ERR(node)) {
32151 + result = PTR_ERR(node);
32152 + unlock_page(clust->pages[i]);
32153 + break;
32154 + }
32155 + JF_SET(node, JNODE_CLUSTER_PAGE);
32156 + unlock_page(clust->pages[i]);
32157 + assert("edward-919", node);
32158 + continue;
32159 + }
32160 + unlock_page(clust->pages[i]);
32161 + }
32162 + if (result) {
32163 + while (i)
32164 + page_cache_release(clust->pages[--i]);
32165 + if (node && !IS_ERR(node))
32166 + jput(node);
32167 + return result;
32168 + }
32169 + assert("edward-920", jprivate(clust->pages[0]));
32170 +#if REISER4_DEBUG
32171 + cryptcompress_inode_data(inode)->pgcount += clust->nr_pages;
32172 +#endif
32173 + return 0;
32174 +}
32175 +
32176 +/* Collect unlocked cluster pages only for read (not to modify) */
32177 +int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
32178 +{
32179 + int i;
32180 + int result = 0;
32181 +
32182 + assert("edward-1428", inode != NULL);
32183 + assert("edward-1429", inode->i_mapping != NULL);
32184 + assert("edward-787", clust != NULL);
32185 + assert("edward-788", clust->pages != NULL);
32186 + assert("edward-789", clust->nr_pages != 0);
32187 + assert("edward-790", clust->nr_pages <= cluster_nrpages(inode));
32188 +
32189 + for (i = 0; i < clust->nr_pages; i++) {
32190 + clust->pages[i] =
32191 + find_or_create_page(inode->i_mapping,
32192 + clust_to_pg(clust->index, inode) + i,
32193 + reiser4_ctx_gfp_mask_get());
32194 + if (!clust->pages[i]) {
32195 + result = RETERR(-ENOMEM);
32196 + break;
32197 + }
32198 + unlock_page(clust->pages[i]);
32199 + }
32200 + if (result)
32201 + while (i)
32202 + page_cache_release(clust->pages[--i]);
32203 + return result;
32204 +}
32205 +
32206 +/* @node might be attached by reiser4_writepage(), not by
32207 + cryptcompress plugin code, but emergency flush should
32208 + understand that pages of cryptcompress files are not
32209 + flushable.
32210 +*/
32211 +#if 0
32212 +int jnode_of_cluster(const jnode * node, struct page * page)
32213 +{
32214 + assert("edward-1339", node != NULL);
32215 + assert("edward-1340", page != NULL);
32216 + assert("edward-1341", page->mapping != NULL);
32217 + assert("edward-1342", page->mapping->host != NULL);
32218 + assert("edward-1343",
32219 + ergo(jnode_is_unformatted(node),
32220 + get_inode_oid(page->mapping->host) ==
32221 + node->key.j.objectid));
32222 + if (inode_file_plugin(page->mapping->host) ==
32223 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) {
32224 +#if REISER4_DEBUG
32225 + if (!jnode_is_cluster_page(node))
32226 + warning("edward-1345",
32227 + "inode %llu: cluster page of index %lu became private",
32228 + (unsigned long long)get_inode_oid(page->mapping->host),
32229 + page->index);
32230 +#endif
32231 + return 1;
32232 + }
32233 + return 0;
32234 +}
32235 +#endif /* 0 */
32236 +
32237 +/* put cluster pages */
32238 +void reiser4_release_cluster_pages(reiser4_cluster_t * clust)
32239 +{
32240 + int i;
32241 +
32242 + assert("edward-447", clust != NULL);
32243 + for (i = 0; i < clust->nr_pages; i++) {
32244 +
32245 + assert("edward-449", clust->pages[i] != NULL);
32246 +
32247 + page_cache_release(clust->pages[i]);
32248 + }
32249 +}
32250 +
32251 +/* this is called when something is failed */
32252 +static void reiser4_release_cluster_pages_and_jnode(reiser4_cluster_t * clust)
32253 +{
32254 + jnode *node;
32255 +
32256 + assert("edward-445", clust != NULL);
32257 + assert("edward-922", clust->pages != NULL);
32258 + assert("edward-446", clust->pages[0] != NULL);
32259 +
32260 + node = jprivate(clust->pages[0]);
32261 +
32262 + assert("edward-447", node != NULL);
32263 +
32264 + reiser4_release_cluster_pages(clust);
32265 + jput(node);
32266 +}
32267 +
32268 +#if REISER4_DEBUG
32269 +static int window_ok(reiser4_slide_t * win, struct inode *inode)
32270 +{
32271 + assert("edward-1115", win != NULL);
32272 + assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
32273 +
32274 + return (win->off != inode_cluster_size(inode)) &&
32275 + (win->off + win->count + win->delta <= inode_cluster_size(inode));
32276 +}
32277 +
32278 +static int cluster_ok(reiser4_cluster_t * clust, struct inode *inode)
32279 +{
32280 + assert("edward-279", clust != NULL);
32281 +
32282 + if (!clust->pages)
32283 + return 0;
32284 + return (clust->win ? window_ok(clust->win, inode) : 1);
32285 +}
32286 +#endif
32287 +
32288 +/* guess next window stat */
32289 +static inline window_stat next_window_stat(reiser4_slide_t * win)
32290 +{
32291 + assert("edward-1130", win != NULL);
32292 + return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
32293 + HOLE_WINDOW : DATA_WINDOW);
32294 +}
32295 +
32296 +/* guess next cluster index and window params */
32297 +static void
32298 +update_cluster(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
32299 + loff_t to_file)
32300 +{
32301 + reiser4_slide_t *win;
32302 +
32303 + assert("edward-185", clust != NULL);
32304 + assert("edward-438", clust->pages != NULL);
32305 + assert("edward-281", cluster_ok(clust, inode));
32306 +
32307 + win = clust->win;
32308 + if (!win)
32309 + return;
32310 +
32311 + switch (win->stat) {
32312 + case DATA_WINDOW:
32313 + /* increment window position */
32314 + clust->index++;
32315 + win->stat = DATA_WINDOW;
32316 + win->off = 0;
32317 + win->count = min_count(inode_cluster_size(inode), to_file);
32318 + break;
32319 + case HOLE_WINDOW:
32320 + switch (next_window_stat(win)) {
32321 + case HOLE_WINDOW:
32322 + /* set window to fit the offset we start write from */
32323 + clust->index = off_to_clust(file_off, inode);
32324 + win->stat = HOLE_WINDOW;
32325 + win->off = 0;
32326 + win->count = off_to_cloff(file_off, inode);
32327 + win->delta =
32328 + min_count(inode_cluster_size(inode) - win->count,
32329 + to_file);
32330 + break;
32331 + case DATA_WINDOW:
32332 + /* do not move the window, just change its state,
32333 + off+count+delta=inv */
32334 + win->stat = DATA_WINDOW;
32335 + win->off = win->off + win->count;
32336 + win->count = win->delta;
32337 + win->delta = 0;
32338 + break;
32339 + default:
32340 + impossible("edward-282", "wrong next window state");
32341 + }
32342 + break;
32343 + default:
32344 + impossible("edward-283", "wrong current window state");
32345 + }
32346 + assert("edward-1068", cluster_ok(clust, inode));
32347 +}
32348 +
32349 +static int update_sd_cryptcompress(struct inode *inode)
32350 +{
32351 + int result = 0;
32352 +
32353 + assert("edward-978", reiser4_schedulable());
32354 +
32355 + result = reiser4_grab_space_force( /* one for stat data update */
32356 + estimate_update_common(inode),
32357 + BA_CAN_COMMIT);
32358 + if (result)
32359 + return result;
32360 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
32361 + result = reiser4_update_sd(inode);
32362 +
32363 + return result;
32364 +}
32365 +
32366 +/* NOTE-Edward: this is too similar to reiser4/txnmgr.c:uncapture_jnode() */
32367 +static void uncapture_cluster_jnode(jnode * node)
32368 +{
32369 + txn_atom *atom;
32370 +
32371 + assert_spin_locked(&(node->guard));
32372 +
32373 + /*jnode_make_clean(node); */
32374 + atom = jnode_get_atom(node);
32375 + if (atom == NULL) {
32376 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
32377 + spin_unlock_jnode(node);
32378 + return;
32379 + }
32380 +
32381 + reiser4_uncapture_block(node);
32382 + spin_unlock_atom(atom);
32383 + jput(node);
32384 +}
32385 +
32386 +static void forget_cluster_pages(struct page **pages, int nr)
32387 +{
32388 + int i;
32389 + for (i = 0; i < nr; i++) {
32390 +
32391 + assert("edward-1045", pages[i] != NULL);
32392 + page_cache_release(pages[i]);
32393 + }
32394 +}
32395 +
32396 +/* Check out last modifications we are about to commit,
32397 + and prepare input stream for transform operations.
32398 +*/
32399 +int
32400 +flush_cluster_pages(reiser4_cluster_t * clust, jnode * node,
32401 + struct inode *inode)
32402 +{
32403 + int result = 0;
32404 + int i;
32405 + int nr_pages = 0;
32406 + tfm_cluster_t *tc = &clust->tc;
32407 +#if REISER4_DEBUG
32408 + int node_pgcount;
32409 +#endif
32410 + assert("edward-980", node != NULL);
32411 + assert("edward-236", inode != NULL);
32412 + assert("edward-237", clust != NULL);
32413 + assert("edward-240", !clust->win);
32414 + assert("edward-241", reiser4_schedulable());
32415 + assert("edward-718", cryptcompress_inode_ok(inode));
32416 +
32417 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
32418 + if (result) {
32419 + warning("edward-1430",
32420 + "alloc stream failed with ret=%d", result);
32421 + return result;
32422 + }
32423 + spin_lock_jnode(node);
32424 +#if REISER4_DEBUG
32425 + node_pgcount = node->page_count;
32426 +#endif
32427 + if (!JF_ISSET(node, JNODE_DIRTY)) {
32428 + /* race with another flush */
32429 +#if REISER4_DEBUG
32430 + assert("edward-981", node_pgcount == 0);
32431 + warning("edward-982", "flush_cluster_pages: jnode is not dirty "
32432 + "clust %lu, inode %llu\n",
32433 + clust->index, (unsigned long long)get_inode_oid(inode));
32434 +#endif
32435 + spin_unlock_jnode(node);
32436 + return RETERR(-E_REPEAT);
32437 + }
32438 + /* Check out a size of logical cluster and
32439 + set a number of cluster pages to commit. */
32440 + tc->len = tc->lsize = fsize_to_count(clust, inode);
32441 + clust->nr_pages = count_to_nrpages(tc->len);
32442 +
32443 +#if REISER4_DEBUG
32444 + node->page_count = 0;
32445 +#endif
32446 + cluster_reserved2grabbed(estimate_update_cluster(inode));
32447 + uncapture_cluster_jnode(node);
32448 +
32449 + assert("edward-1224", reiser4_schedulable());
32450 + /* Check out page cluster for commit */
32451 + nr_pages =
32452 + find_get_pages(inode->i_mapping, clust_to_pg(clust->index, inode),
32453 + clust->nr_pages, clust->pages);
32454 + if (nr_pages != clust->nr_pages)
32455 + goto checkout_failed;
32456 +
32457 + /* Try to construct input stream from the checked out pages */
32458 + for (i = 0; i < clust->nr_pages; i++) {
32459 + char *data;
32460 +
32461 + assert("edward-242", clust->pages[i] != NULL);
32462 + if (clust->pages[i]->index !=
32463 + clust_to_pg(clust->index, inode) + i)
32464 + goto checkout_failed;
32465 + BUG_ON(!PageUptodate(clust->pages[i]));
32466 +
32467 + /* flush the page into input transform stream */
32468 + lock_page(clust->pages[i]);
32469 + data = kmap(clust->pages[i]);
32470 +
32471 + assert("edward-986", cnt_to_pgcnt(tc->len, i) != 0);
32472 +
32473 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
32474 + data, cnt_to_pgcnt(tc->len, i));
32475 + kunmap(clust->pages[i]);
32476 + unlock_page(clust->pages[i]);
32477 + }
32478 + /* page cluster flushed successfully */
32479 +
32480 + clear_cluster_pages_dirty(clust);
32481 + reiser4_release_cluster_pages(clust);
32482 +#if REISER4_DEBUG
32483 + cryptcompress_inode_data(inode)->pgcount -= clust->nr_pages;
32484 +#endif
32485 + goto out;
32486 + checkout_failed:
32487 +#if REISER4_DEBUG
32488 + assert("edward-1282", node_pgcount == 0);
32489 + warning("edward-1435", "Inode %llu : checkout page cluster"
32490 + "of index %lu failed\n",
32491 + (unsigned long long)get_inode_oid(inode), clust->index);
32492 +#endif /* REISER4_DEBUG */
32493 + result = RETERR(-E_REPEAT);
32494 + out:
32495 + /* put pages that were found here */
32496 + forget_cluster_pages(clust->pages, nr_pages);
32497 + return result;
32498 +}
32499 +
32500 +/* set hint for the cluster of the index @index */
32501 +static void set_hint_cluster(struct inode *inode, hint_t * hint,
32502 + cloff_t index, znode_lock_mode mode)
32503 +{
32504 + reiser4_key key;
32505 + assert("edward-722", cryptcompress_inode_ok(inode));
32506 + assert("edward-723",
32507 + inode_file_plugin(inode) ==
32508 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
32509 +
32510 + inode_file_plugin(inode)->key_by_inode(inode,
32511 + clust_to_off(index, inode),
32512 + &key);
32513 +
32514 + reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
32515 + hint->offset = get_key_offset(&key);
32516 + hint->mode = mode;
32517 +}
32518 +
32519 +void invalidate_hint_cluster(reiser4_cluster_t * clust)
32520 +{
32521 + assert("edward-1291", clust != NULL);
32522 + assert("edward-1292", clust->hint != NULL);
32523 +
32524 + done_lh(&clust->hint->lh);
32525 + hint_clr_valid(clust->hint);
32526 +}
32527 +
32528 +void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
32529 + znode_lock_mode mode)
32530 +{
32531 + assert("edward-1286", clust != NULL);
32532 + assert("edward-1287", clust->hint != NULL);
32533 +
32534 + set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
32535 + invalidate_hint_cluster(clust);
32536 +}
32537 +
32538 +static int
32539 +balance_dirty_page_cluster(reiser4_cluster_t * clust, struct inode *inode,
32540 + loff_t off, loff_t to_file)
32541 +{
32542 + int result;
32543 +
32544 + assert("edward-724", inode != NULL);
32545 + assert("edward-725", cryptcompress_inode_ok(inode));
32546 +
32547 + /* set next window params */
32548 + update_cluster(inode, clust, off, to_file);
32549 +
32550 + result = update_sd_cryptcompress(inode);
32551 + if (result)
32552 + return result;
32553 + assert("edward-726", clust->hint->lh.owner == NULL);
32554 +
32555 + reiser4_throttle_write(inode);
32556 + return 0;
32557 +}
32558 +
32559 +/* set zeroes to the cluster, update it, and maybe, try to capture its pages */
32560 +static int
32561 +write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
32562 + loff_t to_file)
32563 +{
32564 + char *data;
32565 + int result = 0;
32566 + unsigned cl_off, cl_count = 0;
32567 + unsigned to_pg, pg_off;
32568 + reiser4_slide_t *win;
32569 +
32570 + assert("edward-190", clust != NULL);
32571 + assert("edward-1069", clust->win != NULL);
32572 + assert("edward-191", inode != NULL);
32573 + assert("edward-727", cryptcompress_inode_ok(inode));
32574 + assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
32575 + assert("edward-1154",
32576 + ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
32577 +
32578 + win = clust->win;
32579 +
32580 + assert("edward-1070", win != NULL);
32581 + assert("edward-201", win->stat == HOLE_WINDOW);
32582 + assert("edward-192", cluster_ok(clust, inode));
32583 +
32584 + if (win->off == 0 && win->count == inode_cluster_size(inode)) {
32585 + /* the hole will be represented by fake disk cluster */
32586 + update_cluster(inode, clust, file_off, to_file);
32587 + return 0;
32588 + }
32589 + cl_count = win->count; /* number of zeroes to write */
32590 + cl_off = win->off;
32591 + pg_off = off_to_pgoff(win->off);
32592 +
32593 + while (cl_count) {
32594 + struct page *page;
32595 + page = clust->pages[off_to_pg(cl_off)];
32596 +
32597 + assert("edward-284", page != NULL);
32598 +
32599 + to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count);
32600 + lock_page(page);
32601 + data = kmap_atomic(page, KM_USER0);
32602 + memset(data + pg_off, 0, to_pg);
32603 + flush_dcache_page(page);
32604 + kunmap_atomic(data, KM_USER0);
32605 + SetPageUptodate(page);
32606 + unlock_page(page);
32607 +
32608 + cl_off += to_pg;
32609 + cl_count -= to_pg;
32610 + pg_off = 0;
32611 + }
32612 + if (!win->delta) {
32613 + /* only zeroes, try to capture */
32614 +
32615 + set_cluster_pages_dirty(clust);
32616 + result = try_capture_cluster(clust, inode);
32617 + if (result)
32618 + return result;
32619 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
32620 + result =
32621 + balance_dirty_page_cluster(clust, inode, file_off, to_file);
32622 + } else
32623 + update_cluster(inode, clust, file_off, to_file);
32624 + return result;
32625 +}
32626 +
32627 +/*
32628 + The main disk search procedure for cryptcompress plugins, which
32629 + . scans all items of disk cluster with the lock mode @mode
32630 + . maybe reads each one (if @read)
32631 + . maybe makes its znode dirty (if write lock mode was specified)
32632 +
32633 + NOTE-EDWARD: Callers should handle the case when disk cluster
32634 + is incomplete (-EIO)
32635 +*/
32636 +int find_disk_cluster(reiser4_cluster_t * clust,
32637 + struct inode *inode, int read, znode_lock_mode mode)
32638 +{
32639 + flow_t f;
32640 + hint_t *hint;
32641 + int result = 0;
32642 + unsigned long cl_idx;
32643 + ra_info_t ra_info;
32644 + file_plugin *fplug;
32645 + item_plugin *iplug;
32646 + tfm_cluster_t *tc;
32647 + int was_grabbed;
32648 +
32649 + assert("edward-138", clust != NULL);
32650 + assert("edward-728", clust->hint != NULL);
32651 + assert("edward-226", reiser4_schedulable());
32652 + assert("edward-137", inode != NULL);
32653 + assert("edward-729", cryptcompress_inode_ok(inode));
32654 +
32655 + hint = clust->hint;
32656 + cl_idx = clust->index;
32657 + fplug = inode_file_plugin(inode);
32658 + was_grabbed = get_current_context()->grabbed_blocks;
32659 + tc = &clust->tc;
32660 +
32661 + assert("edward-462", !tfm_cluster_is_uptodate(tc));
32662 + assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
32663 +
32664 + dclust_init_extension(hint);
32665 +
32666 + /* set key of the first disk cluster item */
32667 + fplug->flow_by_inode(inode,
32668 + (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
32669 + 0 /* kernel space */ ,
32670 + inode_scaled_cluster_size(inode),
32671 + clust_to_off(cl_idx, inode), READ_OP, &f);
32672 + if (mode == ZNODE_WRITE_LOCK) {
32673 + /* reserve for flush to make dirty all the leaf nodes
32674 + which contain disk cluster */
32675 + result =
32676 + reiser4_grab_space_force(estimate_dirty_cluster(inode),
32677 + BA_CAN_COMMIT);
32678 + if (result)
32679 + goto out;
32680 + }
32681 +
32682 + ra_info.key_to_stop = f.key;
32683 + set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
32684 +
32685 + while (f.length) {
32686 + result = find_cluster_item(hint, &f.key, mode,
32687 + NULL, FIND_EXACT,
32688 + (mode == ZNODE_WRITE_LOCK ?
32689 + CBK_FOR_INSERT : 0));
32690 + switch (result) {
32691 + case CBK_COORD_NOTFOUND:
32692 + result = 0;
32693 + if (inode_scaled_offset
32694 + (inode,
32695 + clust_to_off(cl_idx,
32696 + inode)) == get_key_offset(&f.key)) {
32697 + /* first item not found, this is treated
32698 + as disk cluster is absent */
32699 + clust->dstat = FAKE_DISK_CLUSTER;
32700 + goto out;
32701 + }
32702 + /* we are outside the cluster, stop search here */
32703 + assert("edward-146",
32704 + f.length != inode_scaled_cluster_size(inode));
32705 + goto ok;
32706 + case CBK_COORD_FOUND:
32707 + assert("edward-148",
32708 + hint->ext_coord.coord.between == AT_UNIT);
32709 + assert("edward-460",
32710 + hint->ext_coord.coord.unit_pos == 0);
32711 +
32712 + coord_clear_iplug(&hint->ext_coord.coord);
32713 + result = zload_ra(hint->ext_coord.coord.node, &ra_info);
32714 + if (unlikely(result))
32715 + goto out;
32716 + iplug = item_plugin_by_coord(&hint->ext_coord.coord);
32717 + assert("edward-147",
32718 + item_id_by_coord(&hint->ext_coord.coord) ==
32719 + CTAIL_ID);
32720 +
32721 + result = iplug->s.file.read(NULL, &f, hint);
32722 + if (result) {
32723 + zrelse(hint->ext_coord.coord.node);
32724 + goto out;
32725 + }
32726 + if (mode == ZNODE_WRITE_LOCK) {
32727 + /* Don't make dirty more nodes then it was
32728 + estimated (see comments before
32729 + estimate_dirty_cluster). Missed nodes will be
32730 + read up in flush time if they are evicted from
32731 + memory */
32732 + if (dclust_get_extension_ncount(hint) <=
32733 + estimate_dirty_cluster(inode))
32734 + znode_make_dirty(hint->ext_coord.coord.node);
32735 +
32736 + znode_set_convertible(hint->ext_coord.coord.
32737 + node);
32738 + }
32739 + zrelse(hint->ext_coord.coord.node);
32740 + break;
32741 + default:
32742 + goto out;
32743 + }
32744 + }
32745 + ok:
32746 + /* at least one item was found */
32747 + /* NOTE-EDWARD: Callers should handle the case
32748 + when disk cluster is incomplete (-EIO) */
32749 + tc->len = inode_scaled_cluster_size(inode) - f.length;
32750 + tc->lsize = fsize_to_count(clust, inode);
32751 + assert("edward-1196", tc->len > 0);
32752 + assert("edward-1406", tc->lsize > 0);
32753 +
32754 + if (hint_is_unprepped_dclust(clust->hint))
32755 + clust->dstat = UNPR_DISK_CLUSTER;
32756 + else {
32757 + dclust_set_extension_dsize(clust->hint, tc->len);
32758 + clust->dstat = PREP_DISK_CLUSTER;
32759 + }
32760 + out:
32761 + assert("edward-1339",
32762 + get_current_context()->grabbed_blocks >= was_grabbed);
32763 + grabbed2free(get_current_context(),
32764 + get_current_super_private(),
32765 + get_current_context()->grabbed_blocks - was_grabbed);
32766 + return result;
32767 +}
32768 +
32769 +int
32770 +get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
32771 + znode_lock_mode lock_mode)
32772 +{
32773 + reiser4_key key;
32774 + ra_info_t ra_info;
32775 +
32776 + assert("edward-730", reiser4_schedulable());
32777 + assert("edward-731", clust != NULL);
32778 + assert("edward-732", inode != NULL);
32779 +
32780 + if (hint_is_valid(clust->hint)) {
32781 + assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
32782 + assert("edward-1294",
32783 + znode_is_write_locked(clust->hint->lh.node));
32784 + /* already have a valid locked position */
32785 + return (clust->dstat ==
32786 + FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
32787 + CBK_COORD_FOUND);
32788 + }
32789 + key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
32790 + &key);
32791 + ra_info.key_to_stop = key;
32792 + set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
32793 +
32794 + return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
32795 + CBK_FOR_INSERT);
32796 +}
32797 +
32798 +/* Read needed cluster pages before modifying.
32799 + If success, @clust->hint contains locked position in the tree.
32800 + Also:
32801 + . find and set disk cluster state
32802 + . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
32803 +*/
32804 +static int
32805 +read_some_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
32806 +{
32807 + int i;
32808 + int result = 0;
32809 + item_plugin *iplug;
32810 + reiser4_slide_t *win = clust->win;
32811 + znode_lock_mode mode = ZNODE_WRITE_LOCK;
32812 +
32813 + iplug = item_plugin_by_id(CTAIL_ID);
32814 +
32815 + assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
32816 +
32817 +#if REISER4_DEBUG
32818 + if (clust->nr_pages == 0) {
32819 + /* start write hole from fake disk cluster */
32820 + assert("edward-1117", win != NULL);
32821 + assert("edward-1118", win->stat == HOLE_WINDOW);
32822 + assert("edward-1119", new_cluster(clust, inode));
32823 + }
32824 +#endif
32825 + if (new_cluster(clust, inode)) {
32826 + /*
32827 + new page cluster is about to be written, nothing to read,
32828 + */
32829 + assert("edward-734", reiser4_schedulable());
32830 + assert("edward-735", clust->hint->lh.owner == NULL);
32831 +
32832 + if (clust->nr_pages) {
32833 + int off;
32834 + char *data;
32835 + struct page * pg;
32836 + assert("edward-1419", clust->pages != NULL);
32837 + pg = clust->pages[clust->nr_pages - 1];
32838 + assert("edward-1420", pg != NULL);
32839 + off = off_to_pgoff(win->off+win->count+win->delta);
32840 + if (off) {
32841 + lock_page(pg);
32842 + data = kmap_atomic(pg, KM_USER0);
32843 + memset(data + off, 0, PAGE_CACHE_SIZE - off);
32844 + flush_dcache_page(pg);
32845 + kunmap_atomic(data, KM_USER0);
32846 + unlock_page(pg);
32847 + }
32848 + }
32849 + clust->dstat = FAKE_DISK_CLUSTER;
32850 + return 0;
32851 + }
32852 + /*
32853 + Here we should search for disk cluster to figure out its real state.
32854 + Also there is one more important reason to do disk search: we need
32855 + to make disk cluster _dirty_ if it exists
32856 + */
32857 +
32858 + /* if windows is specified, read the only pages
32859 + that will be modified partially */
32860 +
32861 + for (i = 0; i < clust->nr_pages; i++) {
32862 + struct page *pg = clust->pages[i];
32863 +
32864 + lock_page(pg);
32865 + if (PageUptodate(pg)) {
32866 + unlock_page(pg);
32867 + continue;
32868 + }
32869 + unlock_page(pg);
32870 +
32871 + if (win &&
32872 + i >= count_to_nrpages(win->off) &&
32873 + i < off_to_pg(win->off + win->count + win->delta))
32874 + /* page will be completely overwritten */
32875 + continue;
32876 +
32877 + if (win && (i == clust->nr_pages - 1) &&
32878 + /* the last page is
32879 + partially modified,
32880 + not uptodate .. */
32881 + (count_to_nrpages(inode->i_size) <= pg->index)) {
32882 + /* .. and appended,
32883 + so set zeroes to the rest */
32884 + char *data;
32885 + int offset;
32886 + lock_page(pg);
32887 + data = kmap_atomic(pg, KM_USER0);
32888 +
32889 + assert("edward-1260",
32890 + count_to_nrpages(win->off + win->count +
32891 + win->delta) - 1 == i);
32892 +
32893 + offset =
32894 + off_to_pgoff(win->off + win->count + win->delta);
32895 + memset(data + offset, 0, PAGE_CACHE_SIZE - offset);
32896 + flush_dcache_page(pg);
32897 + kunmap_atomic(data, KM_USER0);
32898 + unlock_page(pg);
32899 + /* still not uptodate */
32900 + break;
32901 + }
32902 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
32903 + result = ctail_read_disk_cluster(clust, inode, mode);
32904 + if (result)
32905 + goto out;
32906 + assert("edward-925",
32907 + tfm_cluster_is_uptodate(&clust->tc));
32908 + }
32909 + lock_page(pg);
32910 + result = do_readpage_ctail(inode, clust, pg, mode);
32911 + unlock_page(pg);
32912 + if (result) {
32913 + impossible("edward-219",
32914 + "do_readpage_ctail returned crap");
32915 + goto out;
32916 + }
32917 + }
32918 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
32919 + /* disk cluster unclaimed, but we need to make its znodes dirty
32920 + to make flush update convert its content */
32921 + result = find_disk_cluster(clust, inode, 0 /* do not read items */,
32922 + mode);
32923 + }
32924 + out:
32925 + tfm_cluster_clr_uptodate(&clust->tc);
32926 + return result;
32927 +}
32928 +
32929 +static int
32930 +should_create_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
32931 +{
32932 + assert("edward-737", clust != NULL);
32933 +
32934 + switch (clust->dstat) {
32935 + case PREP_DISK_CLUSTER:
32936 + case UNPR_DISK_CLUSTER:
32937 + return 0;
32938 + case FAKE_DISK_CLUSTER:
32939 + if (clust->win &&
32940 + clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
32941 + assert("edward-1172", new_cluster(clust, inode));
32942 + return 0;
32943 + }
32944 + return 1;
32945 + default:
32946 + impossible("edward-1173", "bad disk cluster state");
32947 + return 0;
32948 + }
32949 +}
32950 +
32951 +static int
32952 +cryptcompress_make_unprepped_cluster(reiser4_cluster_t * clust,
32953 + struct inode *inode)
32954 +{
32955 + int result;
32956 +
32957 + assert("edward-1123", reiser4_schedulable());
32958 + assert("edward-737", clust != NULL);
32959 + assert("edward-738", inode != NULL);
32960 + assert("edward-739", cryptcompress_inode_ok(inode));
32961 + assert("edward-1053", clust->hint != NULL);
32962 +
32963 + if (!should_create_unprepped_cluster(clust, inode)) {
32964 + if (clust->reserved) {
32965 + cluster_reserved2free(estimate_insert_cluster(inode));
32966 +#if REISER4_DEBUG
32967 + assert("edward-1267",
32968 + clust->reserved_unprepped ==
32969 + estimate_insert_cluster(inode));
32970 + clust->reserved_unprepped -=
32971 + estimate_insert_cluster(inode);
32972 +#endif
32973 + }
32974 + return 0;
32975 + }
32976 + assert("edward-1268", clust->reserved);
32977 + cluster_reserved2grabbed(estimate_insert_cluster(inode));
32978 +#if REISER4_DEBUG
32979 + assert("edward-1441",
32980 + clust->reserved_unprepped == estimate_insert_cluster(inode));
32981 + clust->reserved_unprepped -= estimate_insert_cluster(inode);
32982 +#endif
32983 + result = ctail_insert_unprepped_cluster(clust, inode);
32984 + if (result)
32985 + return result;
32986 +
32987 + inode_add_bytes(inode, inode_cluster_size(inode));
32988 +
32989 + assert("edward-743", cryptcompress_inode_ok(inode));
32990 + assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
32991 +
32992 + clust->dstat = UNPR_DISK_CLUSTER;
32993 + return 0;
32994 +}
32995 +
32996 +#if REISER4_DEBUG
32997 +static int jnode_truncate_ok(struct inode *inode, cloff_t index)
32998 +{
32999 + jnode *node;
33000 + node =
33001 + jlookup(current_tree, get_inode_oid(inode),
33002 + clust_to_pg(index, inode));
33003 + if (likely(!node))
33004 + return 1;
33005 + /* someone got this jnode */
33006 + warning("edward-1315", "jnode %p is untruncated\n", node);
33007 + jput(node);
33008 + return (atomic_read(&node->x_count));
33009 +}
33010 +#endif
33011 +
33012 +/* Collect unlocked cluster pages and jnode (the last is in the
33013 + case when the page cluster will be modified and captured) */
33014 +int
33015 +prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
33016 + int capture)
33017 +{
33018 + assert("edward-177", inode != NULL);
33019 + assert("edward-741", cryptcompress_inode_ok(inode));
33020 + assert("edward-740", clust->pages != NULL);
33021 +
33022 + set_cluster_nrpages(clust, inode);
33023 + reset_cluster_pgset(clust, cluster_nrpages(inode));
33024 + return (capture ?
33025 + grab_cluster_pages_jnode(inode, clust) :
33026 + grab_cluster_pages(inode, clust));
33027 +}
33028 +
33029 +/* Truncate all pages of the cluster of index @index.
33030 + This is called by ->kill_hook() method of item plugin */
33031 +void truncate_page_cluster_cryptcompress(struct inode *inode, cloff_t index,
33032 + int even_cows)
33033 +{
33034 + int i;
33035 + int found = 0;
33036 + int nr_pages;
33037 + jnode *node;
33038 + struct page *pages[MAX_CLUSTER_NRPAGES];
33039 +
33040 + node =
33041 + jlookup(current_tree, get_inode_oid(inode),
33042 + clust_to_pg(index, inode));
33043 + /* jnode is absent, just drop pages which can not
33044 + acquire jnode because of exclusive access */
33045 + if (!node)
33046 + goto truncate;
33047 + /* jnode is present and may be dirty */
33048 + nr_pages = count_to_nrpages(cnt_to_clcnt(inode->i_size, index, inode));
33049 +
33050 + found = find_get_pages(inode->i_mapping, clust_to_pg(index, inode),
33051 + nr_pages, pages);
33052 + spin_lock_jnode(node);
33053 +
33054 + if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
33055 + && index == 0)
33056 + /* converting to unix_file in progress */
33057 + JF_CLR(node, JNODE_CLUSTER_PAGE);
33058 + if (JF_ISSET(node, JNODE_DIRTY)) {
33059 + /* someone has done modifications which are not
33060 + yet committed, so we need to release some resources */
33061 +
33062 + /* free disk space grabbed for disk cluster converting */
33063 + cluster_reserved2grabbed(estimate_update_cluster(inode));
33064 + grabbed2free(get_current_context(),
33065 + get_current_super_private(),
33066 + estimate_update_cluster(inode));
33067 +
33068 + assert("edward-1198", found == nr_pages);
33069 + assert("edward-1199", node->page_count == nr_pages);
33070 +#if REISER4_DEBUG
33071 + node->page_count = 0;
33072 +#endif
33073 + /* This will clear dirty bit */
33074 + uncapture_cluster_jnode(node);
33075 +
33076 + /* put pages grabbed for last uncommitted modifications */
33077 + for (i = 0; i < nr_pages; i++) {
33078 + assert("edward-1200", PageUptodate(pages[i]));
33079 + page_cache_release(pages[i]);
33080 +#if REISER4_DEBUG
33081 + cryptcompress_inode_data(inode)->pgcount --;
33082 +#endif
33083 + }
33084 + } else
33085 + spin_unlock_jnode(node);
33086 + /* FIXME-EDWARD: Use truncate_complete_page in the loop above instead */
33087 +
33088 + jput(node);
33089 + /* put pages found here */
33090 + forget_cluster_pages(pages, found);
33091 + truncate:
33092 + if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
33093 + index == 0)
33094 + return;
33095 + reiser4_invalidate_pages(inode->i_mapping,
33096 + clust_to_pg(index, inode),
33097 + cluster_nrpages(inode),
33098 + even_cows);
33099 + assert("edward-1201",
33100 + ergo(!reiser4_inode_get_flag(inode,
33101 + REISER4_FILE_CONV_IN_PROGRESS),
33102 + jnode_truncate_ok(inode, index)));
33103 + return;
33104 +}
33105 +
33106 +/* Prepare cluster handle before(after) modifications
33107 + which are supposed to be committed.
33108 +
33109 + . grab cluster pages;
33110 + . reserve disk space;
33111 + . maybe read pages from disk and set the disk cluster dirty;
33112 + . maybe write hole;
33113 + . maybe create 'unprepped' disk cluster if the last one is fake
33114 + (i.e. is not represenred by any items)
33115 +*/
33116 +
33117 +static int
33118 +prepare_cluster(struct inode *inode,
33119 + loff_t file_off /* write position in the file */ ,
33120 + loff_t to_file, /* bytes of users data to write to the file */
33121 + reiser4_cluster_t * clust, page_cluster_op op)
33122 +{
33123 + int result = 0;
33124 + reiser4_slide_t *win = clust->win;
33125 +
33126 + reset_cluster_params(clust);
33127 + cluster_set_tfm_act(&clust->tc, TFMA_READ);
33128 +#if REISER4_DEBUG
33129 + clust->ctx = get_current_context();
33130 +#endif
33131 + assert("edward-1190", op != PCL_UNKNOWN);
33132 +
33133 + clust->op = op;
33134 +
33135 + result = prepare_page_cluster(inode, clust, 1);
33136 + if (result)
33137 + return result;
33138 + assert("edward-1447",
33139 + ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
33140 + assert("edward-1448",
33141 + ergo(clust->nr_pages != 0,
33142 + jnode_is_cluster_page(jprivate(clust->pages[0]))));
33143 +
33144 + result = reserve4cluster(inode, clust);
33145 + if (result)
33146 + goto err1;
33147 + result = read_some_cluster_pages(inode, clust);
33148 + if (result) {
33149 + free_reserved4cluster(inode,
33150 + clust,
33151 + estimate_update_cluster(inode) +
33152 + estimate_insert_cluster(inode));
33153 + goto err1;
33154 + }
33155 + assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
33156 +
33157 + result = cryptcompress_make_unprepped_cluster(clust, inode);
33158 + if (result)
33159 + goto err2;
33160 + if (win && win->stat == HOLE_WINDOW) {
33161 + result = write_hole(inode, clust, file_off, to_file);
33162 + if (result)
33163 + goto err2;
33164 + }
33165 + return 0;
33166 + err2:
33167 + free_reserved4cluster(inode, clust,
33168 + estimate_update_cluster(inode));
33169 + err1:
33170 + reiser4_release_cluster_pages_and_jnode(clust);
33171 + assert("edward-1125", result == -ENOSPC);
33172 + return result;
33173 +}
33174 +
33175 +/* set window by two offsets */
33176 +static void
33177 +set_window(reiser4_cluster_t * clust, reiser4_slide_t * win,
33178 + struct inode *inode, loff_t o1, loff_t o2)
33179 +{
33180 + assert("edward-295", clust != NULL);
33181 + assert("edward-296", inode != NULL);
33182 + assert("edward-1071", win != NULL);
33183 + assert("edward-297", o1 <= o2);
33184 +
33185 + clust->index = off_to_clust(o1, inode);
33186 +
33187 + win->off = off_to_cloff(o1, inode);
33188 + win->count = min_count(inode_cluster_size(inode) - win->off, o2 - o1);
33189 + win->delta = 0;
33190 +
33191 + clust->win = win;
33192 +}
33193 +
33194 +static int
33195 +set_cluster_by_window(struct inode *inode, reiser4_cluster_t * clust,
33196 + reiser4_slide_t * win, flow_t * f, loff_t file_off)
33197 +{
33198 + int result;
33199 +
33200 + assert("edward-197", clust != NULL);
33201 + assert("edward-1072", win != NULL);
33202 + assert("edward-198", inode != NULL);
33203 +
33204 + result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
33205 + if (result)
33206 + return result;
33207 +
33208 + if (file_off > inode->i_size) {
33209 + /* Uhmm, hole in cryptcompress file... */
33210 + loff_t hole_size;
33211 + hole_size = file_off - inode->i_size;
33212 +
33213 + set_window(clust, win, inode, inode->i_size, file_off);
33214 + win->stat = HOLE_WINDOW;
33215 + if (win->off + hole_size < inode_cluster_size(inode))
33216 + /* there is also user's data to append to the hole */
33217 + win->delta =
33218 + min_count(inode_cluster_size(inode) -
33219 + (win->off + win->count), f->length);
33220 + return 0;
33221 + }
33222 + set_window(clust, win, inode, file_off, file_off + f->length);
33223 + win->stat = DATA_WINDOW;
33224 + return 0;
33225 +}
33226 +
33227 +int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
33228 + int count)
33229 +{
33230 + int result = 0;
33231 + int (*setting_actor)(reiser4_cluster_t * clust, int count);
33232 +
33233 + assert("edward-1358", clust != NULL);
33234 + assert("edward-1359", page != NULL);
33235 + assert("edward-1360", page->mapping != NULL);
33236 + assert("edward-1361", page->mapping->host != NULL);
33237 +
33238 + setting_actor = (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
33239 + result = setting_actor(clust, count);
33240 + clust->index = pg_to_clust(page->index, page->mapping->host);
33241 + return result;
33242 +}
33243 +
33244 +/* reset all the params that not get updated */
33245 +void reset_cluster_params(reiser4_cluster_t * clust)
33246 +{
33247 + assert("edward-197", clust != NULL);
33248 +
33249 + clust->dstat = INVAL_DISK_CLUSTER;
33250 + clust->tc.uptodate = 0;
33251 + clust->tc.len = 0;
33252 +}
33253 +
33254 +/* Core write procedure of cryptcompress plugin, which slices user's
33255 + flow into logical clusters, maps the last ones to the appropriate
33256 + page clusters, and tries to capture them.
33257 + If @buf != NULL, returns number of successfully written bytes,
33258 + otherwise returns error
33259 +*/
33260 +static loff_t
33261 +write_cryptcompress_flow(struct file *file, struct inode *inode,
33262 + const char __user *buf, size_t count, loff_t pos,
33263 + int *conv_occured)
33264 +{
33265 + int i;
33266 + flow_t f;
33267 + hint_t *hint;
33268 + int result = 0;
33269 + size_t to_write = 0;
33270 + loff_t file_off;
33271 + reiser4_slide_t win;
33272 + reiser4_cluster_t clust;
33273 +
33274 + assert("edward-161", reiser4_schedulable());
33275 + assert("edward-748", cryptcompress_inode_ok(inode));
33276 + assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
33277 + assert("edward-1274", get_current_context()->grabbed_blocks == 0);
33278 +
33279 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33280 + if (hint == NULL)
33281 + return RETERR(-ENOMEM);
33282 +
33283 + result = load_file_hint(file, hint);
33284 + if (result) {
33285 + kfree(hint);
33286 + return result;
33287 + }
33288 +
33289 + result =
33290 + flow_by_inode_cryptcompress(inode, buf, 1 /* user space */ ,
33291 + count, pos, WRITE_OP, &f);
33292 + if (result)
33293 + goto out;
33294 + to_write = f.length;
33295 +
33296 + /* current write position in file */
33297 + file_off = pos;
33298 + reiser4_slide_init(&win);
33299 + cluster_init_read(&clust, &win);
33300 + clust.hint = hint;
33301 +
33302 + result = set_cluster_by_window(inode, &clust, &win, &f, file_off);
33303 + if (result)
33304 + goto out;
33305 +
33306 + if (next_window_stat(&win) == HOLE_WINDOW) {
33307 + result = write_conversion_hook(file, inode, pos, &clust, NULL);
33308 + if (result)
33309 + goto out;
33310 + result =
33311 + prepare_cluster(inode, file_off, f.length, &clust,
33312 + PCL_APPEND);
33313 + if (result)
33314 + goto out;
33315 + }
33316 + do {
33317 + char *src;
33318 + unsigned page_off, page_count;
33319 +
33320 + assert("edward-750", reiser4_schedulable());
33321 +
33322 + result = write_conversion_hook(file, inode, pos, &clust,
33323 + conv_occured);
33324 + if (result || *conv_occured)
33325 + goto out;
33326 + result =
33327 + prepare_cluster(inode, file_off, f.length, &clust,
33328 + PCL_APPEND);
33329 + if (result)
33330 + goto out;
33331 +
33332 + assert("edward-751", cryptcompress_inode_ok(inode));
33333 + assert("edward-204", win.stat == DATA_WINDOW);
33334 + assert("edward-1288", hint_is_valid(clust.hint));
33335 + assert("edward-752",
33336 + znode_is_write_locked(hint->ext_coord.coord.node));
33337 +
33338 + put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
33339 +
33340 + /* set write position in page */
33341 + page_off = off_to_pgoff(win.off);
33342 +
33343 + /* copy user's data to cluster pages */
33344 + for (i = off_to_pg(win.off), src = f.data;
33345 + i < count_to_nrpages(win.off + win.count);
33346 + i++, src += page_count) {
33347 + page_count =
33348 + cnt_to_pgcnt(win.off + win.count, i) - page_off;
33349 +
33350 + assert("edward-1039",
33351 + page_off + page_count <= PAGE_CACHE_SIZE);
33352 + assert("edward-287", clust.pages[i] != NULL);
33353 +
33354 + lock_page(clust.pages[i]);
33355 + result =
33356 + __copy_from_user((char *)kmap(clust.pages[i]) +
33357 + page_off, (char __user *)src, page_count);
33358 + kunmap(clust.pages[i]);
33359 + if (unlikely(result)) {
33360 + unlock_page(clust.pages[i]);
33361 + result = -EFAULT;
33362 + goto err2;
33363 + }
33364 + SetPageUptodate(clust.pages[i]);
33365 + unlock_page(clust.pages[i]);
33366 + page_off = 0;
33367 + }
33368 + assert("edward-753", cryptcompress_inode_ok(inode));
33369 +
33370 + set_cluster_pages_dirty(&clust);
33371 +
33372 + result = try_capture_cluster(&clust, inode);
33373 + if (result)
33374 + goto err2;
33375 +
33376 + assert("edward-998", f.user == 1);
33377 +
33378 + move_flow_forward(&f, win.count);
33379 +
33380 + /* disk cluster may be already clean at this point */
33381 +
33382 + /* . update cluster
33383 + . set hint for new offset
33384 + . unlock znode
33385 + . update inode
33386 + . balance dirty pages
33387 + */
33388 + result = balance_dirty_page_cluster(&clust, inode, 0, f.length);
33389 + if (result)
33390 + goto err1;
33391 + assert("edward-755", hint->lh.owner == NULL);
33392 + reset_cluster_params(&clust);
33393 + continue;
33394 + err2:
33395 + reiser4_release_cluster_pages_and_jnode(&clust);
33396 + err1:
33397 + if (clust.reserved)
33398 + free_reserved4cluster(inode,
33399 + &clust,
33400 + estimate_update_cluster(inode));
33401 + break;
33402 + } while (f.length);
33403 + out:
33404 + done_lh(&hint->lh);
33405 + if (result == -EEXIST)
33406 + warning("edward-1407", "write returns EEXIST!\n");
33407 +
33408 + put_cluster_handle(&clust);
33409 + save_file_hint(file, hint);
33410 + kfree(hint);
33411 + if (buf) {
33412 + /* if nothing were written - there must be an error */
33413 + assert("edward-195", ergo((to_write == f.length),
33414 + (result < 0 || *conv_occured)));
33415 + return (to_write - f.length) ? (to_write - f.length) : result;
33416 + }
33417 + return result;
33418 +}
33419 +
33420 +/**
33421 + * write_cryptcompress - write of struct file_operations
33422 + * @file: file to write to
33423 + * @buf: address of user-space buffer
33424 + * @read_amount: number of bytes to write
33425 + * @off: position in file to write to
33426 + *
33427 + * This is implementation of vfs's write method of struct file_operations for
33428 + * cryptcompress plugin.
33429 + */
33430 +ssize_t write_cryptcompress(struct file *file, const char __user *buf,
33431 + size_t count, loff_t *off, int *conv)
33432 +{
33433 + ssize_t result;
33434 + struct inode *inode;
33435 + reiser4_context *ctx;
33436 + loff_t pos = *off;
33437 + cryptcompress_info_t *info;
33438 +
33439 + assert("edward-1449", *conv == 0);
33440 +
33441 + inode = file->f_dentry->d_inode;
33442 + assert("edward-196", cryptcompress_inode_ok(inode));
33443 +
33444 + info = cryptcompress_inode_data(inode);
33445 +
33446 + ctx = reiser4_init_context(inode->i_sb);
33447 + if (IS_ERR(ctx))
33448 + return PTR_ERR(ctx);
33449 +
33450 + mutex_lock(&inode->i_mutex);
33451 +
33452 + result = generic_write_checks(file, &pos, &count, 0);
33453 + if (unlikely(result != 0))
33454 + goto out;
33455 + if (unlikely(count == 0))
33456 + goto out;
33457 + result = remove_suid(file->f_dentry);
33458 + if (unlikely(result != 0))
33459 + goto out;
33460 + /* remove_suid might create a transaction */
33461 + reiser4_txn_restart(ctx);
33462 +
33463 + result = write_cryptcompress_flow(file, inode, buf, count, pos, conv);
33464 +
33465 + if (result < 0)
33466 + goto out;
33467 + /* update position in a file */
33468 + *off = pos + result;
33469 + out:
33470 + mutex_unlock(&inode->i_mutex);
33471 +
33472 + context_set_commit_async(ctx);
33473 + reiser4_exit_context(ctx);
33474 + return result;
33475 +}
33476 +
33477 +int readpages_cryptcompress(struct file *file, struct address_space *mapping,
33478 + struct list_head *pages, unsigned nr_pages)
33479 +{
33480 + reiser4_context * ctx;
33481 + int ret;
33482 +
33483 + ctx = reiser4_init_context(mapping->host->i_sb);
33484 + if (IS_ERR(ctx)) {
33485 + ret = PTR_ERR(ctx);
33486 + goto err;
33487 + }
33488 + /* crc files can be built of ctail items only */
33489 + ret = readpages_ctail(file, mapping, pages);
33490 + reiser4_exit_context(ctx);
33491 + if (ret) {
33492 +err:
33493 + put_pages_list(pages);
33494 + }
33495 + return ret;
33496 +}
33497 +
33498 +static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
33499 +{
33500 + /* reserve one block to update stat data item */
33501 + assert("edward-1193",
33502 + inode_file_plugin(inode)->estimate.update ==
33503 + estimate_update_common);
33504 + return estimate_update_common(inode);
33505 +}
33506 +
33507 +/**
33508 + * read_cryptcompress - read of struct file_operations
33509 + * @file: file to read from
33510 + * @buf: address of user-space buffer
33511 + * @read_amount: number of bytes to read
33512 + * @off: position in file to read from
33513 + *
33514 + * This is implementation of vfs's read method of struct file_operations for
33515 + * cryptcompress plugin.
33516 + */
33517 +ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
33518 + loff_t * off)
33519 +{
33520 + ssize_t result;
33521 + struct inode *inode;
33522 + reiser4_context *ctx;
33523 + cryptcompress_info_t *info;
33524 + reiser4_block_nr needed;
33525 +
33526 + inode = file->f_dentry->d_inode;
33527 + assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
33528 +
33529 + ctx = reiser4_init_context(inode->i_sb);
33530 + if (IS_ERR(ctx))
33531 + return PTR_ERR(ctx);
33532 +
33533 + info = cryptcompress_inode_data(inode);
33534 + needed = cryptcompress_estimate_read(inode);
33535 +
33536 + result = reiser4_grab_space(needed, BA_CAN_COMMIT);
33537 + if (result != 0) {
33538 + reiser4_exit_context(ctx);
33539 + return result;
33540 + }
33541 +
33542 + LOCK_CNT_INC(inode_sem_r);
33543 +
33544 + result = do_sync_read(file, buf, size, off);
33545 +
33546 + LOCK_CNT_DEC(inode_sem_r);
33547 +
33548 + context_set_commit_async(ctx);
33549 + reiser4_exit_context(ctx);
33550 +
33551 + return result;
33552 +}
33553 +
33554 +/* If @index > 0, find real disk cluster of the index (@index - 1),
33555 + If @index == 0 find the real disk cluster of the object of maximal index.
33556 + Keep incremented index of the result in @found.
33557 + It succes was returned:
33558 + (@index == 0 && @found == 0) means that the object doesn't have real disk
33559 + clusters.
33560 + (@index != 0 && @found == 0) means that disk cluster of (@index -1) doesn't
33561 + exist.
33562 +*/
33563 +static int
33564 +find_real_disk_cluster(struct inode *inode, cloff_t * found, cloff_t index)
33565 +{
33566 + int result;
33567 + reiser4_key key;
33568 + loff_t offset;
33569 + hint_t *hint;
33570 + lock_handle *lh;
33571 + lookup_bias bias;
33572 + coord_t *coord;
33573 + item_plugin *iplug;
33574 +
33575 + assert("edward-1131", inode != NULL);
33576 + assert("edward-95", cryptcompress_inode_ok(inode));
33577 +
33578 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33579 + if (hint == NULL)
33580 + return RETERR(-ENOMEM);
33581 + hint_init_zero(hint);
33582 + lh = &hint->lh;
33583 +
33584 + bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
33585 + offset =
33586 + (index ? clust_to_off(index, inode) -
33587 + 1 : get_key_offset(reiser4_max_key()));
33588 +
33589 + key_by_inode_cryptcompress(inode, offset, &key);
33590 +
33591 + /* find the last item of this object */
33592 + result =
33593 + find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
33594 + bias, 0);
33595 + if (cbk_errored(result)) {
33596 + done_lh(lh);
33597 + kfree(hint);
33598 + return result;
33599 + }
33600 + if (result == CBK_COORD_NOTFOUND) {
33601 + /* no real disk clusters */
33602 + done_lh(lh);
33603 + kfree(hint);
33604 + *found = 0;
33605 + return 0;
33606 + }
33607 + /* disk cluster is found */
33608 + coord = &hint->ext_coord.coord;
33609 + coord_clear_iplug(coord);
33610 + result = zload(coord->node);
33611 + if (unlikely(result)) {
33612 + done_lh(lh);
33613 + kfree(hint);
33614 + return result;
33615 + }
33616 + iplug = item_plugin_by_coord(coord);
33617 + assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
33618 + assert("edward-1202", ctail_ok(coord));
33619 +
33620 + item_key_by_coord(coord, &key);
33621 + *found = off_to_clust(get_key_offset(&key), inode) + 1;
33622 +
33623 + assert("edward-1132", ergo(index, index == *found));
33624 +
33625 + zrelse(coord->node);
33626 + done_lh(lh);
33627 + kfree(hint);
33628 + return 0;
33629 +}
33630 +
33631 +static int find_fake_appended(struct inode *inode, cloff_t * index)
33632 +{
33633 + return find_real_disk_cluster(inode, index,
33634 + 0 /* find last real one */ );
33635 +}
33636 +
33637 +/* Set left coord when unit is not found after node_lookup()
33638 + This takes into account that there can be holes in a sequence
33639 + of disk clusters */
33640 +
33641 +static void adjust_left_coord(coord_t * left_coord)
33642 +{
33643 + switch (left_coord->between) {
33644 + case AFTER_UNIT:
33645 + left_coord->between = AFTER_ITEM;
33646 + case AFTER_ITEM:
33647 + case BEFORE_UNIT:
33648 + break;
33649 + default:
33650 + impossible("edward-1204", "bad left coord to cut");
33651 + }
33652 + return;
33653 +}
33654 +
33655 +#define CRC_CUT_TREE_MIN_ITERATIONS 64
33656 +int
33657 +cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
33658 + const reiser4_key * to_key,
33659 + reiser4_key * smallest_removed,
33660 + struct inode *object, int truncate, int *progress)
33661 +{
33662 + lock_handle next_node_lock;
33663 + coord_t left_coord;
33664 + int result;
33665 +
33666 + assert("edward-1158", tap->coord->node != NULL);
33667 + assert("edward-1159", znode_is_write_locked(tap->coord->node));
33668 + assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
33669 +
33670 + *progress = 0;
33671 + init_lh(&next_node_lock);
33672 +
33673 + while (1) {
33674 + znode *node; /* node from which items are cut */
33675 + node_plugin *nplug; /* node plugin for @node */
33676 +
33677 + node = tap->coord->node;
33678 +
33679 + /* Move next_node_lock to the next node on the left. */
33680 + result =
33681 + reiser4_get_left_neighbor(&next_node_lock, node,
33682 + ZNODE_WRITE_LOCK,
33683 + GN_CAN_USE_UPPER_LEVELS);
33684 + if (result != 0 && result != -E_NO_NEIGHBOR)
33685 + break;
33686 + /* FIXME-EDWARD: Check can we delete the node as a whole. */
33687 + result = reiser4_tap_load(tap);
33688 + if (result)
33689 + return result;
33690 +
33691 + /* Prepare the second (right) point for cut_node() */
33692 + if (*progress)
33693 + coord_init_last_unit(tap->coord, node);
33694 +
33695 + else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
33696 + /* set rightmost unit for the items without lookup method */
33697 + tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
33698 +
33699 + nplug = node->nplug;
33700 +
33701 + assert("edward-1161", nplug);
33702 + assert("edward-1162", nplug->lookup);
33703 +
33704 + /* left_coord is leftmost unit cut from @node */
33705 + result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
33706 +
33707 + if (IS_CBKERR(result))
33708 + break;
33709 +
33710 + if (result == CBK_COORD_NOTFOUND)
33711 + adjust_left_coord(&left_coord);
33712 +
33713 + /* adjust coordinates so that they are set to existing units */
33714 + if (coord_set_to_right(&left_coord)
33715 + || coord_set_to_left(tap->coord)) {
33716 + result = 0;
33717 + break;
33718 + }
33719 +
33720 + if (coord_compare(&left_coord, tap->coord) ==
33721 + COORD_CMP_ON_RIGHT) {
33722 + /* keys from @from_key to @to_key are not in the tree */
33723 + result = 0;
33724 + break;
33725 + }
33726 +
33727 + /* cut data from one node */
33728 + *smallest_removed = *reiser4_min_key();
33729 + result = kill_node_content(&left_coord,
33730 + tap->coord,
33731 + from_key,
33732 + to_key,
33733 + smallest_removed,
33734 + next_node_lock.node,
33735 + object, truncate);
33736 +#if REISER4_DEBUG
33737 + /*node_check(node, ~0U); */
33738 +#endif
33739 + reiser4_tap_relse(tap);
33740 +
33741 + if (result)
33742 + break;
33743 +
33744 + ++(*progress);
33745 +
33746 + /* Check whether all items with keys >= from_key were removed
33747 + * from the tree. */
33748 + if (keyle(smallest_removed, from_key))
33749 + /* result = 0; */
33750 + break;
33751 +
33752 + if (next_node_lock.node == NULL)
33753 + break;
33754 +
33755 + result = reiser4_tap_move(tap, &next_node_lock);
33756 + done_lh(&next_node_lock);
33757 + if (result)
33758 + break;
33759 +
33760 + /* Break long cut_tree operation (deletion of a large file) if
33761 + * atom requires commit. */
33762 + if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
33763 + && current_atom_should_commit()) {
33764 + result = -E_REPEAT;
33765 + break;
33766 + }
33767 + }
33768 + done_lh(&next_node_lock);
33769 + return result;
33770 +}
33771 +
33772 +/* Append or expand hole in two steps (exclusive access should be aquired!)
33773 + 1) write zeroes to the current real cluster,
33774 + 2) expand hole via fake clusters (just increase i_size) */
33775 +static int
33776 +cryptcompress_append_hole(struct inode *inode /*contains old i_size */ ,
33777 + loff_t new_size)
33778 +{
33779 + int result = 0;
33780 + hint_t *hint;
33781 + lock_handle *lh;
33782 + loff_t hole_size;
33783 + int nr_zeroes;
33784 + reiser4_slide_t win;
33785 + reiser4_cluster_t clust;
33786 +
33787 + assert("edward-1133", inode->i_size < new_size);
33788 + assert("edward-1134", reiser4_schedulable());
33789 + assert("edward-1135", cryptcompress_inode_ok(inode));
33790 + assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
33791 + assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
33792 +
33793 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33794 + if (hint == NULL)
33795 + return RETERR(-ENOMEM);
33796 + hint_init_zero(hint);
33797 + lh = &hint->lh;
33798 +
33799 + reiser4_slide_init(&win);
33800 + cluster_init_read(&clust, &win);
33801 + clust.hint = hint;
33802 +
33803 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
33804 + if (result)
33805 + goto out;
33806 + if (off_to_cloff(inode->i_size, inode) == 0)
33807 + goto fake_append;
33808 + hole_size = new_size - inode->i_size;
33809 + nr_zeroes =
33810 + inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
33811 + if (hole_size < nr_zeroes)
33812 + nr_zeroes = hole_size;
33813 + set_window(&clust, &win, inode, inode->i_size,
33814 + inode->i_size + nr_zeroes);
33815 + win.stat = HOLE_WINDOW;
33816 +
33817 + assert("edward-1137",
33818 + clust.index == off_to_clust(inode->i_size, inode));
33819 +
33820 + result = prepare_cluster(inode, 0, 0, &clust, PCL_APPEND);
33821 +
33822 + assert("edward-1271", !result || result == -ENOSPC);
33823 + if (result)
33824 + goto out;
33825 + assert("edward-1139",
33826 + clust.dstat == PREP_DISK_CLUSTER ||
33827 + clust.dstat == UNPR_DISK_CLUSTER);
33828 +
33829 + assert("edward-1431", hole_size >= nr_zeroes);
33830 + if (hole_size == nr_zeroes)
33831 + /* nothing to append anymore */
33832 + goto out;
33833 + fake_append:
33834 + INODE_SET_FIELD(inode, i_size, new_size);
33835 + out:
33836 + done_lh(lh);
33837 + kfree(hint);
33838 + put_cluster_handle(&clust);
33839 + return result;
33840 +}
33841 +
33842 +#if REISER4_DEBUG
33843 +static int
33844 +pages_truncate_ok(struct inode *inode, loff_t old_size, pgoff_t start)
33845 +{
33846 + struct pagevec pvec;
33847 + int i;
33848 + int count;
33849 + int rest;
33850 +
33851 + rest = count_to_nrpages(old_size) - start;
33852 +
33853 + pagevec_init(&pvec, 0);
33854 + count = min_count(pagevec_space(&pvec), rest);
33855 +
33856 + while (rest) {
33857 + count = min_count(pagevec_space(&pvec), rest);
33858 + pvec.nr = find_get_pages(inode->i_mapping, start,
33859 + count, pvec.pages);
33860 + for (i = 0; i < pagevec_count(&pvec); i++) {
33861 + if (PageUptodate(pvec.pages[i])) {
33862 + warning("edward-1205",
33863 + "truncated page of index %lu is uptodate",
33864 + pvec.pages[i]->index);
33865 + return 0;
33866 + }
33867 + }
33868 + start += count;
33869 + rest -= count;
33870 + pagevec_release(&pvec);
33871 + }
33872 + return 1;
33873 +}
33874 +
33875 +static int body_truncate_ok(struct inode *inode, cloff_t aidx)
33876 +{
33877 + int result;
33878 + cloff_t raidx;
33879 +
33880 + result = find_fake_appended(inode, &raidx);
33881 + return !result && (aidx == raidx);
33882 +}
33883 +#endif
33884 +
33885 +static int
33886 +update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
33887 +{
33888 + return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
33889 + ? 0 : reiser4_update_file_size(inode, key, update_sd));
33890 +}
33891 +
33892 +/* prune cryptcompress file in two steps (exclusive access should be acquired!)
33893 + 1) cut all disk clusters but the last one partially truncated,
33894 + 2) set zeroes and capture last partially truncated page cluster if the last
33895 + one exists, otherwise truncate via prune fake cluster (just decrease i_size)
33896 +*/
33897 +static int
33898 +prune_cryptcompress(struct inode *inode, loff_t new_size, int update_sd,
33899 + cloff_t aidx)
33900 +{
33901 + int result = 0;
33902 + unsigned nr_zeroes;
33903 + loff_t to_prune;
33904 + loff_t old_size;
33905 + cloff_t ridx;
33906 +
33907 + hint_t *hint;
33908 + lock_handle *lh;
33909 + reiser4_slide_t win;
33910 + reiser4_cluster_t clust;
33911 +
33912 + assert("edward-1140", inode->i_size >= new_size);
33913 + assert("edward-1141", reiser4_schedulable());
33914 + assert("edward-1142", cryptcompress_inode_ok(inode));
33915 + assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
33916 +
33917 + old_size = inode->i_size;
33918 +
33919 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33920 + if (hint == NULL)
33921 + return RETERR(-ENOMEM);
33922 + hint_init_zero(hint);
33923 + lh = &hint->lh;
33924 +
33925 + reiser4_slide_init(&win);
33926 + cluster_init_read(&clust, &win);
33927 + clust.hint = hint;
33928 +
33929 + /* rightmost completely truncated cluster */
33930 + ridx = count_to_nrclust(new_size, inode);
33931 +
33932 + assert("edward-1174", ridx <= aidx);
33933 + old_size = inode->i_size;
33934 + if (ridx != aidx) {
33935 + result = cut_file_items(inode,
33936 + clust_to_off(ridx, inode),
33937 + update_sd,
33938 + clust_to_off(aidx, inode),
33939 + update_cryptcompress_size);
33940 + if (result)
33941 + goto out;
33942 + }
33943 + if (!off_to_cloff(new_size, inode)) {
33944 + /* no partially truncated clusters */
33945 + assert("edward-1145", inode->i_size == new_size);
33946 + goto finish;
33947 + }
33948 + assert("edward-1146", new_size < inode->i_size);
33949 +
33950 + to_prune = inode->i_size - new_size;
33951 +
33952 + /* partial truncate of leftmost cluster,
33953 + first check if it is fake */
33954 + result = find_real_disk_cluster(inode, &aidx, ridx);
33955 + if (result)
33956 + goto out;
33957 + if (!aidx)
33958 + /* yup, this is fake one */
33959 + goto finish;
33960 +
33961 + assert("edward-1148", aidx == ridx);
33962 +
33963 + /* do partial truncate of the leftmost page cluster,
33964 + then try to capture this one */
33965 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
33966 + if (result)
33967 + goto out;
33968 + nr_zeroes = (off_to_pgoff(new_size) ?
33969 + PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
33970 + set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
33971 + win.stat = HOLE_WINDOW;
33972 +
33973 + assert("edward-1149", clust.index == ridx - 1);
33974 +
33975 + result = prepare_cluster(inode, 0, 0, &clust, PCL_TRUNCATE);
33976 + if (result)
33977 + goto out;
33978 + assert("edward-1151",
33979 + clust.dstat == PREP_DISK_CLUSTER ||
33980 + clust.dstat == UNPR_DISK_CLUSTER);
33981 +
33982 + assert("edward-1191", inode->i_size == new_size);
33983 + assert("edward-1206", body_truncate_ok(inode, ridx));
33984 + finish:
33985 + /* drop all the pages that don't have jnodes (i.e. pages
33986 + which can not be truncated by cut_file_items() because
33987 + of holes represented by fake disk clusters) including
33988 + the pages of partially truncated cluster which was
33989 + released by prepare_cluster() */
33990 + truncate_inode_pages(inode->i_mapping, new_size);
33991 + INODE_SET_FIELD(inode, i_size, new_size);
33992 + out:
33993 + assert("edward-1334", !result || result == -ENOSPC);
33994 + assert("edward-1209",
33995 + pages_truncate_ok(inode, old_size, count_to_nrpages(new_size)));
33996 + done_lh(lh);
33997 + kfree(hint);
33998 + put_cluster_handle(&clust);
33999 + return result;
34000 +}
34001 +
34002 +/* Prepare cryptcompress file for truncate:
34003 + prune or append rightmost fake logical clusters (if any)
34004 +*/
34005 +static int
34006 +start_truncate_fake(struct inode *inode, cloff_t aidx, loff_t new_size,
34007 + int update_sd)
34008 +{
34009 + int result = 0;
34010 + int bytes;
34011 +
34012 + if (new_size > inode->i_size) {
34013 + /* append */
34014 + if (inode->i_size < clust_to_off(aidx, inode))
34015 + /* no fake bytes */
34016 + return 0;
34017 + bytes = new_size - inode->i_size;
34018 + INODE_SET_FIELD(inode, i_size, inode->i_size + bytes);
34019 + } else {
34020 + /* prune */
34021 + if (inode->i_size <= clust_to_off(aidx, inode))
34022 + /* no fake bytes */
34023 + return 0;
34024 + bytes =
34025 + inode->i_size - max_count(new_size,
34026 + clust_to_off(aidx, inode));
34027 + if (!bytes)
34028 + return 0;
34029 + INODE_SET_FIELD(inode, i_size, inode->i_size - bytes);
34030 + /* In the case of fake prune we need to drop page cluster.
34031 + There are only 2 cases for partially truncated page:
34032 + 1. If it is dirty, therefore it is anonymous
34033 + (was dirtied via mmap), and will be captured
34034 + later via ->capture().
34035 + 2. If it is clean, therefore it is filled by zeroes.
34036 + In both cases we don't need to make it dirty and
34037 + capture here.
34038 + */
34039 + truncate_inode_pages(inode->i_mapping, inode->i_size);
34040 + }
34041 + if (update_sd)
34042 + result = update_sd_cryptcompress(inode);
34043 + return result;
34044 +}
34045 +
34046 +/* This is called in setattr_cryptcompress when it is used to truncate,
34047 + and in delete_cryptcompress */
34048 +static int cryptcompress_truncate(struct inode *inode, /* old size */
34049 + loff_t new_size, /* new size */
34050 + int update_sd)
34051 +{
34052 + int result;
34053 + cloff_t aidx;
34054 +
34055 + result = find_fake_appended(inode, &aidx);
34056 + if (result)
34057 + return result;
34058 + assert("edward-1208",
34059 + ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
34060 +
34061 + result = start_truncate_fake(inode, aidx, new_size, update_sd);
34062 + if (result)
34063 + return result;
34064 + if (inode->i_size == new_size)
34065 + /* nothing to truncate anymore */
34066 + return 0;
34067 + result = (inode->i_size < new_size ?
34068 + cryptcompress_append_hole(inode, new_size) :
34069 + prune_cryptcompress(inode, new_size, update_sd, aidx));
34070 + if (!result && update_sd)
34071 + result = update_sd_cryptcompress(inode);
34072 + return result;
34073 +}
34074 +
34075 +static void clear_moved_tag_cluster(struct address_space * mapping,
34076 + reiser4_cluster_t * clust)
34077 +{
34078 + int i;
34079 + void * ret;
34080 + read_lock_irq(&mapping->tree_lock);
34081 + for (i = 0; i < clust->nr_pages; i++) {
34082 + assert("edward-1438", clust->pages[i] != NULL);
34083 + ret = radix_tree_tag_clear(&mapping->page_tree,
34084 + clust->pages[i]->index,
34085 + PAGECACHE_TAG_REISER4_MOVED);
34086 + assert("edward-1439", ret == clust->pages[i]);
34087 + }
34088 + read_unlock_irq(&mapping->tree_lock);
34089 +}
34090 +
34091 +/* Capture an anonymous page cluster. (Page cluster is
34092 + anonymous if it contains at least one anonymous page) */
34093 +static int
34094 +capture_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
34095 +{
34096 + int result;
34097 +
34098 + assert("edward-1073", clust != NULL);
34099 + assert("edward-1074", inode != NULL);
34100 + assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
34101 +
34102 + result = prepare_cluster(inode, 0, 0, clust, PCL_APPEND);
34103 + if (result)
34104 + return result;
34105 + set_cluster_pages_dirty(clust);
34106 + clear_moved_tag_cluster(inode->i_mapping, clust);
34107 +
34108 + result = try_capture_cluster(clust, inode);
34109 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
34110 + if (unlikely(result)) {
34111 + /* set cleared tag back, so it will be
34112 + possible to capture it again later */
34113 + read_lock_irq(&inode->i_mapping->tree_lock);
34114 + radix_tree_tag_set(&inode->i_mapping->page_tree,
34115 + clust_to_pg(clust->index, inode),
34116 + PAGECACHE_TAG_REISER4_MOVED);
34117 + read_unlock_irq(&inode->i_mapping->tree_lock);
34118 +
34119 + reiser4_release_cluster_pages_and_jnode(clust);
34120 + }
34121 + return result;
34122 +}
34123 +
34124 +#define MAX_CLUSTERS_TO_CAPTURE(inode) (1024 >> cluster_nrpages_shift(inode))
34125 +
34126 +/* read lock should be acquired */
34127 +static int
34128 +capture_anonymous_clusters(struct address_space *mapping, pgoff_t * index,
34129 + int to_capture)
34130 +{
34131 + int result = 0;
34132 + int found;
34133 + struct page *page = NULL;
34134 + hint_t *hint;
34135 + lock_handle *lh;
34136 + reiser4_cluster_t clust;
34137 +
34138 + assert("edward-1127", mapping != NULL);
34139 + assert("edward-1128", mapping->host != NULL);
34140 + assert("edward-1440", mapping->host->i_mapping == mapping);
34141 +
34142 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34143 + if (hint == NULL)
34144 + return RETERR(-ENOMEM);
34145 + hint_init_zero(hint);
34146 + lh = &hint->lh;
34147 +
34148 + cluster_init_read(&clust, NULL);
34149 + clust.hint = hint;
34150 +
34151 + result = alloc_cluster_pgset(&clust, cluster_nrpages(mapping->host));
34152 + if (result)
34153 + goto out;
34154 +
34155 + while (to_capture > 0) {
34156 + found =
34157 + find_get_pages_tag(mapping, index,
34158 + PAGECACHE_TAG_REISER4_MOVED, 1, &page);
34159 + if (!found) {
34160 + *index = (pgoff_t) - 1;
34161 + break;
34162 + }
34163 + assert("edward-1109", page != NULL);
34164 +
34165 + move_cluster_forward(&clust, mapping->host, page->index);
34166 + result = capture_page_cluster(&clust, mapping->host);
34167 + page_cache_release(page);
34168 + if (result)
34169 + break;
34170 + to_capture -= clust.nr_pages;
34171 + }
34172 + if (result) {
34173 + warning("edward-1077",
34174 + "Cannot capture anon pages: result=%i (captured=%d)\n",
34175 + result,
34176 + ((__u32) MAX_CLUSTERS_TO_CAPTURE(mapping->host)) -
34177 + to_capture);
34178 + } else {
34179 + /* something had to be found */
34180 + assert("edward-1078",
34181 + to_capture <= MAX_CLUSTERS_TO_CAPTURE(mapping->host));
34182 + if (to_capture <= 0)
34183 + /* there may be left more pages */
34184 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
34185 + }
34186 + out:
34187 + done_lh(lh);
34188 + kfree(hint);
34189 + put_cluster_handle(&clust);
34190 + return result;
34191 +}
34192 +
34193 +/* Check mapping for existence of not captured dirty pages.
34194 + This returns !0 if the page tree contains pages tagged
34195 + PAGECACHE_TAG_REISER4_MOVED */
34196 +static int cryptcompress_inode_has_anon_pages(struct inode *inode)
34197 +{
34198 + return mapping_tagged(inode->i_mapping, PAGECACHE_TAG_REISER4_MOVED);
34199 +}
34200 +
34201 +/* this is implementation of vfs's writepages method of struct
34202 + address_space_operations */
34203 +int
34204 +writepages_cryptcompress(struct address_space *mapping,
34205 + struct writeback_control *wbc)
34206 +{
34207 + int result;
34208 + int to_capture;
34209 + pgoff_t nrpages;
34210 + pgoff_t index = 0;
34211 + cryptcompress_info_t *info;
34212 + struct inode *inode;
34213 +
34214 + inode = mapping->host;
34215 + if (!cryptcompress_inode_has_anon_pages(inode)) {
34216 + result = 0;
34217 + goto end;
34218 + }
34219 +
34220 + info = cryptcompress_inode_data(inode);
34221 + nrpages = count_to_nrpages(i_size_read(inode));
34222 +
34223 + if (wbc->sync_mode != WB_SYNC_ALL)
34224 + to_capture =
34225 + min_count(wbc->nr_to_write, MAX_CLUSTERS_TO_CAPTURE(inode));
34226 + else
34227 + to_capture = MAX_CLUSTERS_TO_CAPTURE(inode);
34228 + do {
34229 + reiser4_context *ctx;
34230 +
34231 + ctx = reiser4_init_context(inode->i_sb);
34232 + if (IS_ERR(ctx)) {
34233 + result = PTR_ERR(ctx);
34234 + break;
34235 + }
34236 + ctx->nobalance = 1;
34237 +
34238 + assert("edward-1079",
34239 + lock_stack_isclean(get_current_lock_stack()));
34240 +
34241 + LOCK_CNT_INC(inode_sem_r);
34242 +
34243 + result =
34244 + capture_anonymous_clusters(inode->i_mapping, &index,
34245 + to_capture);
34246 +
34247 + if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) {
34248 + reiser4_exit_context(ctx);
34249 + break;
34250 + }
34251 + result = txnmgr_force_commit_all(inode->i_sb, 0);
34252 + reiser4_exit_context(ctx);
34253 + } while (result == 0 && index < nrpages);
34254 +
34255 + end:
34256 + if (is_in_reiser4_context()) {
34257 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34258 + /* there are already pages to flush, flush them out, do
34259 + not delay until end of reiser4_sync_inodes */
34260 + reiser4_writeout(inode->i_sb, wbc);
34261 + get_current_context()->nr_captured = 0;
34262 + }
34263 + }
34264 + return result;
34265 +}
34266 +
34267 +/* plugin->u.file.mmap */
34268 +int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
34269 +{
34270 + int result;
34271 + struct inode *inode;
34272 + reiser4_context *ctx;
34273 +
34274 + inode = file->f_dentry->d_inode;
34275 + ctx = reiser4_init_context(inode->i_sb);
34276 + if (IS_ERR(ctx))
34277 + return PTR_ERR(ctx);
34278 + /*
34279 + * generic_file_mmap will do update_atime. Grab space for stat data
34280 + * update.
34281 + */
34282 + result = reiser4_grab_space_force
34283 + (inode_file_plugin(inode)->estimate.update(inode),
34284 + BA_CAN_COMMIT);
34285 + if (result) {
34286 + reiser4_exit_context(ctx);
34287 + return result;
34288 + }
34289 + result = generic_file_mmap(file, vma);
34290 + reiser4_exit_context(ctx);
34291 + return result;
34292 +}
34293 +
34294 +/* plugin->u.file.release */
34295 +/* plugin->u.file.get_block */
34296 +
34297 +/* this is implementation of delete method of file plugin for
34298 + cryptcompress objects */
34299 +int delete_object_cryptcompress(struct inode *inode)
34300 +{
34301 + int result;
34302 +
34303 + assert("edward-429", inode->i_nlink == 0);
34304 +
34305 + reiser4_txn_restart_current();
34306 +
34307 + result = cryptcompress_truncate(inode, 0, 0);
34308 + if (result) {
34309 + warning("edward-430",
34310 + "cannot truncate cryptcompress file %lli: %i",
34311 + (unsigned long long)get_inode_oid(inode),
34312 + result);
34313 + }
34314 + truncate_inode_pages(inode->i_mapping, 0);
34315 + /* and remove stat data */
34316 + return reiser4_delete_object_common(inode);
34317 +}
34318 +
34319 +/* plugin->u.file.setattr method
34320 + This implements actual truncate (see comments in reiser4/page_cache.c) */
34321 +int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
34322 +{
34323 + int result;
34324 + struct inode *inode;
34325 +
34326 + inode = dentry->d_inode;
34327 + if (attr->ia_valid & ATTR_SIZE) {
34328 + if (inode->i_size != attr->ia_size) {
34329 + reiser4_context *ctx;
34330 + loff_t old_size;
34331 +
34332 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
34333 + if (IS_ERR(ctx))
34334 + return PTR_ERR(ctx);
34335 +
34336 + inode_check_scale(inode, inode->i_size, attr->ia_size);
34337 +
34338 + old_size = inode->i_size;
34339 +
34340 + result =
34341 + cryptcompress_truncate(inode, attr->ia_size,
34342 + 1 /* update stat data */ );
34343 + if (result) {
34344 + warning("edward-1192",
34345 + "truncate_cryptcompress failed: oid %lli, "
34346 + "old size %lld, new size %lld, retval %d",
34347 + (unsigned long long)
34348 + get_inode_oid(inode), old_size,
34349 + attr->ia_size, result);
34350 + }
34351 + context_set_commit_async(ctx);
34352 + reiser4_exit_context(ctx);
34353 + } else
34354 + result = 0;
34355 + } else
34356 + result = reiser4_setattr_common(dentry, attr);
34357 + return result;
34358 +}
34359 +
34360 +/* sendfile_cryptcompress - sendfile of struct file_operations */
34361 +ssize_t
34362 +sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
34363 + read_actor_t actor, void *target)
34364 +{
34365 + reiser4_context *ctx;
34366 + ssize_t result;
34367 + struct inode *inode;
34368 + cryptcompress_info_t *info;
34369 +
34370 + inode = file->f_dentry->d_inode;
34371 + ctx = reiser4_init_context(inode->i_sb);
34372 + if (IS_ERR(ctx))
34373 + return PTR_ERR(ctx);
34374 + /*
34375 + * generic_file_sendfile may want to call update_atime. Grab space for
34376 + * stat data update
34377 + */
34378 + result = reiser4_grab_space(estimate_update_common(inode),
34379 + BA_CAN_COMMIT);
34380 + if (result)
34381 + goto exit;
34382 + info = cryptcompress_inode_data(inode);
34383 +
34384 + result = generic_file_sendfile(file, ppos, count, actor, target);
34385 + exit:
34386 + reiser4_exit_context(ctx);
34387 + return result;
34388 +}
34389 +
34390 +/*
34391 + * release_cryptcompress - release of struct file_operations
34392 + * @inode: inode of released file
34393 + * @file: file to release
34394 + */
34395 +int release_cryptcompress(struct inode *inode, struct file *file)
34396 +{
34397 + reiser4_context *ctx = reiser4_init_context(inode->i_sb);
34398 +
34399 + if (IS_ERR(ctx))
34400 + return PTR_ERR(ctx);
34401 + reiser4_free_file_fsdata(file);
34402 + reiser4_exit_context(ctx);
34403 + return 0;
34404 +}
34405 +
34406 +#if 0
34407 +int prepare_write_cryptcompress(struct file *file, struct page *page,
34408 + unsigned from, unsigned to)
34409 +{
34410 + return prepare_write_common(file, page, from, to);
34411 +}
34412 +#endif /* 0 */
34413 +
34414 +
34415 +/*
34416 + Local variables:
34417 + c-indentation-style: "K&R"
34418 + mode-name: "LC"
34419 + c-basic-offset: 8
34420 + tab-width: 8
34421 + fill-column: 80
34422 + scroll-step: 1
34423 + End:
34424 +*/
34425 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.h
34426 --- linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 03:00:00.000000000 +0300
34427 +++ linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.h 2007-05-06 14:50:43.774999471 +0400
34428 @@ -0,0 +1,554 @@
34429 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
34430 +/* See http://www.namesys.com/cryptcompress_design.html */
34431 +
34432 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
34433 +#define __FS_REISER4_CRYPTCOMPRESS_H__
34434 +
34435 +#include "../../page_cache.h"
34436 +#include "../compress/compress.h"
34437 +#include "../crypto/cipher.h"
34438 +
34439 +#include <linux/pagemap.h>
34440 +
34441 +#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
34442 +#define MAX_CLUSTER_SHIFT 16
34443 +#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
34444 +#define DC_CHECKSUM_SIZE 4
34445 +
34446 +#define MIN_LATTICE_FACTOR 1
34447 +#define MAX_LATTICE_FACTOR 32
34448 +
34449 +/* this mask contains all non-standard plugins that might
34450 + be present in reiser4-specific part of inode managed by
34451 + cryptcompress file plugin */
34452 +#define cryptcompress_mask \
34453 + ((1 << PSET_FILE) | \
34454 + (1 << PSET_CLUSTER) | \
34455 + (1 << PSET_CIPHER) | \
34456 + (1 << PSET_DIGEST) | \
34457 + (1 << PSET_COMPRESSION) | \
34458 + (1 << PSET_COMPRESSION_MODE))
34459 +
34460 +static inline loff_t min_count(loff_t a, loff_t b)
34461 +{
34462 + return (a < b ? a : b);
34463 +}
34464 +
34465 +static inline loff_t max_count(loff_t a, loff_t b)
34466 +{
34467 + return (a > b ? a : b);
34468 +}
34469 +
34470 +#if REISER4_DEBUG
34471 +static inline int cluster_shift_ok(int shift)
34472 +{
34473 + return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
34474 +}
34475 +#endif
34476 +
34477 +typedef struct tfm_stream {
34478 + __u8 *data;
34479 + size_t size;
34480 +} tfm_stream_t;
34481 +
34482 +typedef enum {
34483 + INPUT_STREAM,
34484 + OUTPUT_STREAM,
34485 + LAST_STREAM
34486 +} tfm_stream_id;
34487 +
34488 +typedef tfm_stream_t *tfm_unit[LAST_STREAM];
34489 +
34490 +static inline __u8 *ts_data(tfm_stream_t * stm)
34491 +{
34492 + assert("edward-928", stm != NULL);
34493 + return stm->data;
34494 +}
34495 +
34496 +static inline size_t ts_size(tfm_stream_t * stm)
34497 +{
34498 + assert("edward-929", stm != NULL);
34499 + return stm->size;
34500 +}
34501 +
34502 +static inline void set_ts_size(tfm_stream_t * stm, size_t size)
34503 +{
34504 + assert("edward-930", stm != NULL);
34505 +
34506 + stm->size = size;
34507 +}
34508 +
34509 +static inline int alloc_ts(tfm_stream_t ** stm)
34510 +{
34511 + assert("edward-931", stm);
34512 + assert("edward-932", *stm == NULL);
34513 +
34514 + *stm = kmalloc(sizeof **stm, reiser4_ctx_gfp_mask_get());
34515 + if (*stm == NULL)
34516 + return -ENOMEM;
34517 + memset(*stm, 0, sizeof **stm);
34518 + return 0;
34519 +}
34520 +
34521 +static inline void free_ts(tfm_stream_t * stm)
34522 +{
34523 + assert("edward-933", !ts_data(stm));
34524 + assert("edward-934", !ts_size(stm));
34525 +
34526 + kfree(stm);
34527 +}
34528 +
34529 +static inline int alloc_ts_data(tfm_stream_t * stm, size_t size)
34530 +{
34531 + assert("edward-935", !ts_data(stm));
34532 + assert("edward-936", !ts_size(stm));
34533 + assert("edward-937", size != 0);
34534 +
34535 + stm->data = reiser4_vmalloc(size);
34536 + if (!stm->data)
34537 + return -ENOMEM;
34538 + set_ts_size(stm, size);
34539 + return 0;
34540 +}
34541 +
34542 +static inline void free_ts_data(tfm_stream_t * stm)
34543 +{
34544 + assert("edward-938", equi(ts_data(stm), ts_size(stm)));
34545 +
34546 + if (ts_data(stm))
34547 + vfree(ts_data(stm));
34548 + memset(stm, 0, sizeof *stm);
34549 +}
34550 +
34551 +/* Write modes for item conversion in flush convert phase */
34552 +typedef enum {
34553 + CRC_APPEND_ITEM = 1,
34554 + CRC_OVERWRITE_ITEM = 2,
34555 + CRC_CUT_ITEM = 3
34556 +} cryptcompress_write_mode_t;
34557 +
34558 +typedef enum {
34559 + PCL_UNKNOWN = 0, /* invalid option */
34560 + PCL_APPEND = 1, /* append and/or overwrite */
34561 + PCL_TRUNCATE = 2 /* truncate */
34562 +} page_cluster_op;
34563 +
34564 +/* Reiser4 file write/read transforms page cluster into disk cluster (and back)
34565 + using crypto/compression transforms implemented by reiser4 transform plugins.
34566 + Before each transform we allocate a pair of streams (tfm_unit) and assemble
34567 + page cluster into the input one. After transform we split output stream into
34568 + a set of items (disk cluster).
34569 +*/
34570 +typedef struct tfm_cluster {
34571 + coa_set coa;
34572 + tfm_unit tun;
34573 + tfm_action act;
34574 + int uptodate;
34575 + int lsize; /* size of the logical cluster */
34576 + int len; /* length of the transform stream */
34577 +} tfm_cluster_t;
34578 +
34579 +static inline coa_t get_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act)
34580 +{
34581 + return tc->coa[id][act];
34582 +}
34583 +
34584 +static inline void
34585 +set_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act, coa_t coa)
34586 +{
34587 + tc->coa[id][act] = coa;
34588 +}
34589 +
34590 +static inline int
34591 +alloc_coa(tfm_cluster_t * tc, compression_plugin * cplug)
34592 +{
34593 + coa_t coa;
34594 +
34595 + coa = cplug->alloc(tc->act);
34596 + if (IS_ERR(coa))
34597 + return PTR_ERR(coa);
34598 + set_coa(tc, cplug->h.id, tc->act, coa);
34599 + return 0;
34600 +}
34601 +
34602 +static inline int
34603 +grab_coa(tfm_cluster_t * tc, compression_plugin * cplug)
34604 +{
34605 + return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
34606 + alloc_coa(tc, cplug) : 0);
34607 +}
34608 +
34609 +static inline void free_coa_set(tfm_cluster_t * tc)
34610 +{
34611 + tfm_action j;
34612 + reiser4_compression_id i;
34613 + compression_plugin *cplug;
34614 +
34615 + assert("edward-810", tc != NULL);
34616 +
34617 + for (j = 0; j < TFMA_LAST; j++)
34618 + for (i = 0; i < LAST_COMPRESSION_ID; i++) {
34619 + if (!get_coa(tc, i, j))
34620 + continue;
34621 + cplug = compression_plugin_by_id(i);
34622 + assert("edward-812", cplug->free != NULL);
34623 + cplug->free(get_coa(tc, i, j), j);
34624 + set_coa(tc, i, j, 0);
34625 + }
34626 + return;
34627 +}
34628 +
34629 +static inline tfm_stream_t *tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
34630 +{
34631 + return tc->tun[id];
34632 +}
34633 +
34634 +static inline void
34635 +set_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id, tfm_stream_t * ts)
34636 +{
34637 + tc->tun[id] = ts;
34638 +}
34639 +
34640 +static inline __u8 *tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id)
34641 +{
34642 + return ts_data(tfm_stream(tc, id));
34643 +}
34644 +
34645 +static inline void
34646 +set_tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id, __u8 * data)
34647 +{
34648 + tfm_stream(tc, id)->data = data;
34649 +}
34650 +
34651 +static inline size_t tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id)
34652 +{
34653 + return ts_size(tfm_stream(tc, id));
34654 +}
34655 +
34656 +static inline void
34657 +set_tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id, size_t size)
34658 +{
34659 + tfm_stream(tc, id)->size = size;
34660 +}
34661 +
34662 +static inline int
34663 +alloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
34664 +{
34665 + assert("edward-939", tc != NULL);
34666 + assert("edward-940", !tfm_stream(tc, id));
34667 +
34668 + tc->tun[id] = kmalloc(sizeof(tfm_stream_t), reiser4_ctx_gfp_mask_get());
34669 + if (!tc->tun[id])
34670 + return -ENOMEM;
34671 + memset(tfm_stream(tc, id), 0, sizeof(tfm_stream_t));
34672 + return alloc_ts_data(tfm_stream(tc, id), size);
34673 +}
34674 +
34675 +static inline int
34676 +realloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
34677 +{
34678 + assert("edward-941", tfm_stream_size(tc, id) < size);
34679 + free_ts_data(tfm_stream(tc, id));
34680 + return alloc_ts_data(tfm_stream(tc, id), size);
34681 +}
34682 +
34683 +static inline void free_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
34684 +{
34685 + free_ts_data(tfm_stream(tc, id));
34686 + free_ts(tfm_stream(tc, id));
34687 + set_tfm_stream(tc, id, 0);
34688 +}
34689 +
34690 +static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
34691 +{
34692 + return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
34693 +}
34694 +
34695 +static inline void free_tfm_unit(tfm_cluster_t * tc)
34696 +{
34697 + tfm_stream_id id;
34698 + for (id = 0; id < LAST_STREAM; id++) {
34699 + if (!tfm_stream(tc, id))
34700 + continue;
34701 + free_tfm_stream(tc, id);
34702 + }
34703 +}
34704 +
34705 +static inline void put_tfm_cluster(tfm_cluster_t * tc)
34706 +{
34707 + assert("edward-942", tc != NULL);
34708 + free_coa_set(tc);
34709 + free_tfm_unit(tc);
34710 +}
34711 +
34712 +static inline int tfm_cluster_is_uptodate(tfm_cluster_t * tc)
34713 +{
34714 + assert("edward-943", tc != NULL);
34715 + assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
34716 + return (tc->uptodate == 1);
34717 +}
34718 +
34719 +static inline void tfm_cluster_set_uptodate(tfm_cluster_t * tc)
34720 +{
34721 + assert("edward-945", tc != NULL);
34722 + assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
34723 + tc->uptodate = 1;
34724 + return;
34725 +}
34726 +
34727 +static inline void tfm_cluster_clr_uptodate(tfm_cluster_t * tc)
34728 +{
34729 + assert("edward-947", tc != NULL);
34730 + assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
34731 + tc->uptodate = 0;
34732 + return;
34733 +}
34734 +
34735 +static inline int tfm_stream_is_set(tfm_cluster_t * tc, tfm_stream_id id)
34736 +{
34737 + return (tfm_stream(tc, id) &&
34738 + tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
34739 +}
34740 +
34741 +static inline int tfm_cluster_is_set(tfm_cluster_t * tc)
34742 +{
34743 + int i;
34744 + for (i = 0; i < LAST_STREAM; i++)
34745 + if (!tfm_stream_is_set(tc, i))
34746 + return 0;
34747 + return 1;
34748 +}
34749 +
34750 +static inline void alternate_streams(tfm_cluster_t * tc)
34751 +{
34752 + tfm_stream_t *tmp = tfm_stream(tc, INPUT_STREAM);
34753 +
34754 + set_tfm_stream(tc, INPUT_STREAM, tfm_stream(tc, OUTPUT_STREAM));
34755 + set_tfm_stream(tc, OUTPUT_STREAM, tmp);
34756 +}
34757 +
34758 +/* a kind of data that we can write to the window */
34759 +typedef enum {
34760 + DATA_WINDOW, /* the data we copy from user space */
34761 + HOLE_WINDOW /* zeroes if we write hole */
34762 +} window_stat;
34763 +
34764 +/* Sliding window of cluster size which should be set to the appropriate position
34765 + (defined by cluster index) in a file before page cluster modification by
34766 + file_write. Then we translate file size, offset to write from, number of
34767 + bytes to write, etc.. to the following configuration needed to estimate
34768 + number of pages to read before write, etc...
34769 +*/
34770 +typedef struct reiser4_slide {
34771 + unsigned off; /* offset we start to write/truncate from */
34772 + unsigned count; /* number of bytes (zeroes) to write/truncate */
34773 + unsigned delta; /* number of bytes to append to the hole */
34774 + window_stat stat; /* a kind of data to write to the window */
34775 +} reiser4_slide_t;
34776 +
34777 +/* The following is a set of possible disk cluster states */
34778 +typedef enum {
34779 + INVAL_DISK_CLUSTER, /* unknown state */
34780 + PREP_DISK_CLUSTER, /* disk cluster got converted by flush
34781 + at least 1 time */
34782 + UNPR_DISK_CLUSTER, /* disk cluster just created and should be
34783 + converted by flush */
34784 + FAKE_DISK_CLUSTER /* disk cluster doesn't exist neither in memory
34785 + nor on disk */
34786 +} disk_cluster_stat;
34787 +
34788 +/*
34789 + While implementing all transforms (from page to disk cluster, and back)
34790 + reiser4 cluster manager fills the following structure encapsulating pointers
34791 + to all the clusters for the same index including the sliding window above
34792 +*/
34793 +typedef struct reiser4_cluster {
34794 + tfm_cluster_t tc; /* transform cluster */
34795 + int nr_pages; /* number of pages */
34796 + struct page **pages; /* page cluster */
34797 + page_cluster_op op; /* page cluster operation */
34798 + struct file *file;
34799 + hint_t *hint; /* disk cluster item for traversal */
34800 + disk_cluster_stat dstat; /* state of the current disk cluster */
34801 + cloff_t index; /* offset in the units of cluster size */
34802 + int index_valid; /* to validate the index above, if needed */
34803 + reiser4_slide_t *win; /* sliding window of cluster size */
34804 + int reserved; /* this indicates that space for disk
34805 + cluster modification is reserved */
34806 +#if REISER4_DEBUG
34807 + reiser4_context *ctx;
34808 + int reserved_prepped;
34809 + int reserved_unprepped;
34810 +#endif
34811 +
34812 +} reiser4_cluster_t;
34813 +
34814 +static inline __u8 * tfm_input_data (reiser4_cluster_t * clust)
34815 +{
34816 + return tfm_stream_data(&clust->tc, INPUT_STREAM);
34817 +}
34818 +
34819 +static inline __u8 * tfm_output_data (reiser4_cluster_t * clust)
34820 +{
34821 + return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
34822 +}
34823 +
34824 +static inline int reset_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
34825 +{
34826 + assert("edward-1057", clust->pages != NULL);
34827 + memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
34828 + return 0;
34829 +}
34830 +
34831 +static inline int alloc_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
34832 +{
34833 + assert("edward-949", clust != NULL);
34834 + assert("edward-1362", clust->pages == NULL);
34835 + assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
34836 +
34837 + clust->pages =
34838 + kmalloc(sizeof(*clust->pages) * nrpages,
34839 + reiser4_ctx_gfp_mask_get());
34840 + if (!clust->pages)
34841 + return RETERR(-ENOMEM);
34842 + reset_cluster_pgset(clust, nrpages);
34843 + return 0;
34844 +}
34845 +
34846 +static inline void free_cluster_pgset(reiser4_cluster_t * clust)
34847 +{
34848 + assert("edward-951", clust->pages != NULL);
34849 + kfree(clust->pages);
34850 + clust->pages = NULL;
34851 +}
34852 +
34853 +static inline void put_cluster_handle(reiser4_cluster_t * clust)
34854 +{
34855 + assert("edward-435", clust != NULL);
34856 +
34857 + put_tfm_cluster(&clust->tc);
34858 + if (clust->pages)
34859 + free_cluster_pgset(clust);
34860 + memset(clust, 0, sizeof *clust);
34861 +}
34862 +
34863 +static inline void inc_keyload_count(crypto_stat_t * data)
34864 +{
34865 + assert("edward-1410", data != NULL);
34866 + data->keyload_count++;
34867 +}
34868 +
34869 +static inline void dec_keyload_count(crypto_stat_t * data)
34870 +{
34871 + assert("edward-1411", data != NULL);
34872 + assert("edward-1412", data->keyload_count > 0);
34873 + data->keyload_count--;
34874 +}
34875 +
34876 +/* cryptcompress specific part of reiser4_inode */
34877 +typedef struct cryptcompress_info {
34878 + crypto_stat_t *crypt;
34879 + /* the following 2 fields are controlled by compression mode plugin */
34880 + int compress_toggle; /* current status of compressibility */
34881 + int lattice_factor; /* factor of dynamic lattice. FIXME: Have a
34882 + compression_toggle to keep the factor */
34883 +#if REISER4_DEBUG
34884 + int pgcount; /* number of captured pages */
34885 +#endif
34886 +} cryptcompress_info_t;
34887 +
34888 +static inline void set_compression_toggle (cryptcompress_info_t * info, int val)
34889 +{
34890 + info->compress_toggle = val;
34891 +}
34892 +
34893 +static inline int get_compression_toggle (cryptcompress_info_t * info)
34894 +{
34895 + return info->compress_toggle;
34896 +}
34897 +
34898 +static inline int compression_is_on(cryptcompress_info_t * info)
34899 +{
34900 + return get_compression_toggle(info) == 1;
34901 +}
34902 +
34903 +static inline void turn_on_compression(cryptcompress_info_t * info)
34904 +{
34905 + set_compression_toggle(info, 1);
34906 +}
34907 +
34908 +static inline void turn_off_compression(cryptcompress_info_t * info)
34909 +{
34910 + set_compression_toggle(info, 0);
34911 +}
34912 +
34913 +static inline void set_lattice_factor(cryptcompress_info_t * info, int val)
34914 +{
34915 + info->lattice_factor = val;
34916 +}
34917 +
34918 +static inline int get_lattice_factor(cryptcompress_info_t * info)
34919 +{
34920 + return info->lattice_factor;
34921 +}
34922 +
34923 +cryptcompress_info_t *cryptcompress_inode_data(const struct inode *);
34924 +int equal_to_rdk(znode *, const reiser4_key *);
34925 +int goto_right_neighbor(coord_t *, lock_handle *);
34926 +int cryptcompress_inode_ok(struct inode *inode);
34927 +int coord_is_unprepped_ctail(const coord_t * coord);
34928 +extern int ctail_read_disk_cluster (reiser4_cluster_t *, struct inode *,
34929 + znode_lock_mode mode);
34930 +extern int do_readpage_ctail(struct inode *, reiser4_cluster_t *,
34931 + struct page * page, znode_lock_mode mode);
34932 +extern int ctail_insert_unprepped_cluster(reiser4_cluster_t * clust,
34933 + struct inode * inode);
34934 +extern int readpages_cryptcompress(struct file*, struct address_space*,
34935 + struct list_head*, unsigned);
34936 +int bind_cryptcompress(struct inode *child, struct inode *parent);
34937 +void destroy_inode_cryptcompress(struct inode * inode);
34938 +int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust);
34939 +int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
34940 + reiser4_cluster_t * clust, int * progress);
34941 +crypto_stat_t * inode_crypto_stat (struct inode * inode);
34942 +void inherit_crypto_stat_common(struct inode * parent, struct inode * object,
34943 + int (*can_inherit)(struct inode * child,
34944 + struct inode * parent));
34945 +void reiser4_attach_crypto_stat(struct inode * inode, crypto_stat_t * info);
34946 +void change_crypto_stat(struct inode * inode, crypto_stat_t * new);
34947 +crypto_stat_t * reiser4_alloc_crypto_stat (struct inode * inode);
34948 +
34949 +static inline struct crypto_blkcipher * info_get_cipher(crypto_stat_t * info)
34950 +{
34951 + return info->cipher;
34952 +}
34953 +
34954 +static inline void info_set_cipher(crypto_stat_t * info,
34955 + struct crypto_blkcipher * tfm)
34956 +{
34957 + info->cipher = tfm;
34958 +}
34959 +
34960 +static inline struct crypto_hash * info_get_digest(crypto_stat_t * info)
34961 +{
34962 + return info->digest;
34963 +}
34964 +
34965 +static inline void info_set_digest(crypto_stat_t * info,
34966 + struct crypto_hash * tfm)
34967 +{
34968 + info->digest = tfm;
34969 +}
34970 +
34971 +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
34972 +
34973 +/* Make Linus happy.
34974 + Local variables:
34975 + c-indentation-style: "K&R"
34976 + mode-name: "LC"
34977 + c-basic-offset: 8
34978 + tab-width: 8
34979 + fill-column: 120
34980 + scroll-step: 1
34981 + End:
34982 +*/
34983 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/file.c linux-2.6.20/fs/reiser4/plugin/file/file.c
34984 --- linux-2.6.20.orig/fs/reiser4/plugin/file/file.c 1970-01-01 03:00:00.000000000 +0300
34985 +++ linux-2.6.20/fs/reiser4/plugin/file/file.c 2007-05-06 14:50:43.779000721 +0400
34986 @@ -0,0 +1,2821 @@
34987 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
34988 + * reiser4/README */
34989 +
34990 +/*
34991 + * this file contains implementations of inode/file/address_space/file plugin
34992 + * operations specific for "unix file plugin" (plugin id is
34993 + * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
34994 + * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
34995 + * no items but stat data)
34996 + */
34997 +
34998 +#include "../../inode.h"
34999 +#include "../../super.h"
35000 +#include "../../tree_walk.h"
35001 +#include "../../carry.h"
35002 +#include "../../page_cache.h"
35003 +#include "../../ioctl.h"
35004 +#include "../object.h"
35005 +#include "../../safe_link.h"
35006 +
35007 +#include <linux/writeback.h>
35008 +#include <linux/pagevec.h>
35009 +#include <linux/syscalls.h>
35010 +
35011 +
35012 +static int unpack(struct file *file, struct inode *inode, int forever);
35013 +static void drop_access(unix_file_info_t *);
35014 +static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35015 + znode_lock_mode lock_mode);
35016 +
35017 +/* Get exclusive access and make sure that file is not partially
35018 + * converted (It may happen that another process is doing tail
35019 + * conversion. If so, wait until it completes)
35020 + */
35021 +static inline void get_exclusive_access_careful(unix_file_info_t * uf_info,
35022 + struct inode *inode)
35023 +{
35024 + do {
35025 + get_exclusive_access(uf_info);
35026 + if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
35027 + break;
35028 + drop_exclusive_access(uf_info);
35029 + schedule();
35030 + } while (1);
35031 +}
35032 +
35033 +/* get unix file plugin specific portion of inode */
35034 +unix_file_info_t *unix_file_inode_data(const struct inode *inode)
35035 +{
35036 + return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
35037 +}
35038 +
35039 +/**
35040 + * equal_to_rdk - compare key and znode's right delimiting key
35041 + * @node: node whose right delimiting key to compare with @key
35042 + * @key: key to compare with @node's right delimiting key
35043 + *
35044 + * Returns true if @key is equal to right delimiting key of @node.
35045 + */
35046 +int equal_to_rdk(znode *node, const reiser4_key *key)
35047 +{
35048 + int result;
35049 +
35050 + read_lock_dk(znode_get_tree(node));
35051 + result = keyeq(key, znode_get_rd_key(node));
35052 + read_unlock_dk(znode_get_tree(node));
35053 + return result;
35054 +}
35055 +
35056 +#if REISER4_DEBUG
35057 +
35058 +/**
35059 + * equal_to_ldk - compare key and znode's left delimiting key
35060 + * @node: node whose left delimiting key to compare with @key
35061 + * @key: key to compare with @node's left delimiting key
35062 + *
35063 + * Returns true if @key is equal to left delimiting key of @node.
35064 + */
35065 +int equal_to_ldk(znode *node, const reiser4_key *key)
35066 +{
35067 + int result;
35068 +
35069 + read_lock_dk(znode_get_tree(node));
35070 + result = keyeq(key, znode_get_ld_key(node));
35071 + read_unlock_dk(znode_get_tree(node));
35072 + return result;
35073 +}
35074 +
35075 +/**
35076 + * check_coord - check whether coord corresponds to key
35077 + * @coord: coord to check
35078 + * @key: key @coord has to correspond to
35079 + *
35080 + * Returns true if @coord is set as if it was set as result of lookup with @key
35081 + * in coord->node.
35082 + */
35083 +static int check_coord(const coord_t *coord, const reiser4_key *key)
35084 +{
35085 + coord_t twin;
35086 +
35087 + node_plugin_by_node(coord->node)->lookup(coord->node, key,
35088 + FIND_MAX_NOT_MORE_THAN, &twin);
35089 + return coords_equal(coord, &twin);
35090 +}
35091 +
35092 +#endif /* REISER4_DEBUG */
35093 +
35094 +/**
35095 + * init_uf_coord - initialize extended coord
35096 + * @uf_coord:
35097 + * @lh:
35098 + *
35099 + *
35100 + */
35101 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
35102 +{
35103 + coord_init_zero(&uf_coord->coord);
35104 + coord_clear_iplug(&uf_coord->coord);
35105 + uf_coord->lh = lh;
35106 + init_lh(lh);
35107 + memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
35108 + uf_coord->valid = 0;
35109 +}
35110 +
35111 +static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
35112 +{
35113 + assert("vs-1333", uf_coord->valid == 0);
35114 +
35115 + if (coord_is_between_items(&uf_coord->coord))
35116 + return;
35117 +
35118 + assert("vs-1348",
35119 + item_plugin_by_coord(&uf_coord->coord)->s.file.
35120 + init_coord_extension);
35121 +
35122 + item_body_by_coord(&uf_coord->coord);
35123 + item_plugin_by_coord(&uf_coord->coord)->s.file.
35124 + init_coord_extension(uf_coord, offset);
35125 +}
35126 +
35127 +/**
35128 + * goto_right_neighbor - lock right neighbor, drop current node lock
35129 + * @coord:
35130 + * @lh:
35131 + *
35132 + * Obtain lock on right neighbor and drop lock on current node.
35133 + */
35134 +int goto_right_neighbor(coord_t *coord, lock_handle *lh)
35135 +{
35136 + int result;
35137 + lock_handle lh_right;
35138 +
35139 + assert("vs-1100", znode_is_locked(coord->node));
35140 +
35141 + init_lh(&lh_right);
35142 + result = reiser4_get_right_neighbor(&lh_right, coord->node,
35143 + znode_is_wlocked(coord->node) ?
35144 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
35145 + GN_CAN_USE_UPPER_LEVELS);
35146 + if (result) {
35147 + done_lh(&lh_right);
35148 + return result;
35149 + }
35150 +
35151 + /*
35152 + * we hold two longterm locks on neighboring nodes. Unlock left of
35153 + * them
35154 + */
35155 + done_lh(lh);
35156 +
35157 + coord_init_first_unit_nocheck(coord, lh_right.node);
35158 + move_lh(lh, &lh_right);
35159 +
35160 + return 0;
35161 +
35162 +}
35163 +
35164 +/**
35165 + * set_file_state
35166 + * @uf_info:
35167 + * @cbk_result:
35168 + * @level:
35169 + *
35170 + * This is to be used by find_file_item and in find_file_state to
35171 + * determine real state of file
35172 + */
35173 +static void set_file_state(unix_file_info_t *uf_info, int cbk_result,
35174 + tree_level level)
35175 +{
35176 + if (cbk_errored(cbk_result))
35177 + /* error happened in find_file_item */
35178 + return;
35179 +
35180 + assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
35181 +
35182 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35183 + /*
35184 + * container is unknown, therefore conversion can not be in
35185 + * progress
35186 + */
35187 + assert("",
35188 + !reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
35189 + REISER4_PART_IN_CONV));
35190 + if (cbk_result == CBK_COORD_NOTFOUND)
35191 + uf_info->container = UF_CONTAINER_EMPTY;
35192 + else if (level == LEAF_LEVEL)
35193 + uf_info->container = UF_CONTAINER_TAILS;
35194 + else
35195 + uf_info->container = UF_CONTAINER_EXTENTS;
35196 + } else {
35197 + /*
35198 + * file state is known, check whether it is set correctly if
35199 + * file is not being tail converted
35200 + */
35201 + if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
35202 + REISER4_PART_IN_CONV)) {
35203 + assert("vs-1162",
35204 + ergo(level == LEAF_LEVEL &&
35205 + cbk_result == CBK_COORD_FOUND,
35206 + uf_info->container == UF_CONTAINER_TAILS));
35207 + assert("vs-1165",
35208 + ergo(level == TWIG_LEVEL &&
35209 + cbk_result == CBK_COORD_FOUND,
35210 + uf_info->container == UF_CONTAINER_EXTENTS));
35211 + }
35212 + }
35213 +}
35214 +
35215 +int find_file_item_nohint(coord_t *coord, lock_handle *lh,
35216 + const reiser4_key *key, znode_lock_mode lock_mode,
35217 + struct inode *inode)
35218 +{
35219 + return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
35220 + FIND_MAX_NOT_MORE_THAN,
35221 + TWIG_LEVEL, LEAF_LEVEL,
35222 + (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
35223 + (CBK_UNIQUE | CBK_FOR_INSERT),
35224 + NULL /* ra_info */ );
35225 +}
35226 +
35227 +/**
35228 + * find_file_item - look for file item in the tree
35229 + * @hint: provides coordinate, lock handle, seal
35230 + * @key: key for search
35231 + * @mode: mode of lock to put on returned node
35232 + * @ra_info:
35233 + * @inode:
35234 + *
35235 + * This finds position in the tree corresponding to @key. It first tries to use
35236 + * @hint's seal if it is set.
35237 + */
35238 +int find_file_item(hint_t *hint, const reiser4_key *key,
35239 + znode_lock_mode lock_mode,
35240 + struct inode *inode)
35241 +{
35242 + int result;
35243 + coord_t *coord;
35244 + lock_handle *lh;
35245 +
35246 + assert("nikita-3030", reiser4_schedulable());
35247 + assert("vs-1707", hint != NULL);
35248 + assert("vs-47", inode != NULL);
35249 +
35250 + coord = &hint->ext_coord.coord;
35251 + lh = hint->ext_coord.lh;
35252 + init_lh(lh);
35253 +
35254 + result = hint_validate(hint, key, 1 /* check key */, lock_mode);
35255 + if (!result) {
35256 + if (coord->between == AFTER_UNIT &&
35257 + equal_to_rdk(coord->node, key)) {
35258 + result = goto_right_neighbor(coord, lh);
35259 + if (result == -E_NO_NEIGHBOR)
35260 + return RETERR(-EIO);
35261 + if (result)
35262 + return result;
35263 + assert("vs-1152", equal_to_ldk(coord->node, key));
35264 + /*
35265 + * we moved to different node. Invalidate coord
35266 + * extension, zload is necessary to init it again
35267 + */
35268 + hint->ext_coord.valid = 0;
35269 + }
35270 +
35271 + set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
35272 + znode_get_level(coord->node));
35273 +
35274 + return CBK_COORD_FOUND;
35275 + }
35276 +
35277 + coord_init_zero(coord);
35278 + result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
35279 + set_file_state(unix_file_inode_data(inode), result,
35280 + znode_get_level(coord->node));
35281 +
35282 + /* FIXME: we might already have coord extension initialized */
35283 + hint->ext_coord.valid = 0;
35284 + return result;
35285 +}
35286 +
35287 +/* plugin->u.file.write_flowom = NULL
35288 + plugin->u.file.read_flow = NULL */
35289 +
35290 +void hint_init_zero(hint_t * hint)
35291 +{
35292 + memset(hint, 0, sizeof(*hint));
35293 + init_lh(&hint->lh);
35294 + hint->ext_coord.lh = &hint->lh;
35295 +}
35296 +
35297 +static int find_file_state(struct inode *inode, unix_file_info_t *uf_info)
35298 +{
35299 + int result;
35300 + reiser4_key key;
35301 + coord_t coord;
35302 + lock_handle lh;
35303 +
35304 + assert("vs-1628", ea_obtained(uf_info));
35305 +
35306 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35307 + key_by_inode_and_offset_common(inode, 0, &key);
35308 + init_lh(&lh);
35309 + result = find_file_item_nohint(&coord, &lh, &key,
35310 + ZNODE_READ_LOCK, inode);
35311 + set_file_state(uf_info, result, znode_get_level(coord.node));
35312 + done_lh(&lh);
35313 + if (!cbk_errored(result))
35314 + result = 0;
35315 + } else
35316 + result = 0;
35317 + assert("vs-1074",
35318 + ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
35319 + reiser4_txn_restart_current();
35320 + return result;
35321 +}
35322 +
35323 +/* estimate and reserve space needed to truncate page which gets partially truncated: one block for page itself, stat
35324 + data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item) which may happen
35325 + if page corresponds to hole extent and unallocated one will have to be created */
35326 +static int reserve_partial_page(reiser4_tree * tree)
35327 +{
35328 + grab_space_enable();
35329 + return reiser4_grab_reserved(reiser4_get_current_sb(),
35330 + 1 +
35331 + 2 * estimate_one_insert_into_item(tree),
35332 + BA_CAN_COMMIT);
35333 +}
35334 +
35335 +/* estimate and reserve space needed to cut one item and update one stat data */
35336 +static int reserve_cut_iteration(reiser4_tree * tree)
35337 +{
35338 + __u64 estimate = estimate_one_item_removal(tree)
35339 + + estimate_one_insert_into_item(tree);
35340 +
35341 + assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
35342 +
35343 + grab_space_enable();
35344 + /* We need to double our estimate now that we can delete more than one
35345 + node. */
35346 + return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
35347 + BA_CAN_COMMIT);
35348 +}
35349 +
35350 +int reiser4_update_file_size(struct inode *inode, reiser4_key * key,
35351 + int update_sd)
35352 +{
35353 + int result = 0;
35354 +
35355 + INODE_SET_FIELD(inode, i_size, get_key_offset(key));
35356 + if (update_sd) {
35357 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
35358 + result = reiser4_update_sd(inode);
35359 + }
35360 + return result;
35361 +}
35362 +
35363 +/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space
35364 + and update file stat data on every single cut from the tree */
35365 +int
35366 +cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
35367 + loff_t cur_size, int (*update_actor) (struct inode *,
35368 + reiser4_key *, int))
35369 +{
35370 + reiser4_key from_key, to_key;
35371 + reiser4_key smallest_removed;
35372 + file_plugin *fplug = inode_file_plugin(inode);
35373 + int result;
35374 + int progress = 0;
35375 +
35376 + assert("vs-1248",
35377 + fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
35378 + fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
35379 +
35380 + fplug->key_by_inode(inode, new_size, &from_key);
35381 + to_key = from_key;
35382 + set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
35383 + /* this loop normally runs just once */
35384 + while (1) {
35385 + result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
35386 + if (result)
35387 + break;
35388 +
35389 + result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
35390 + &smallest_removed, inode, 1,
35391 + &progress);
35392 + if (result == -E_REPEAT) {
35393 + /* -E_REPEAT is a signal to interrupt a long file truncation process */
35394 + if (progress) {
35395 + result =
35396 + update_actor(inode, &smallest_removed,
35397 + update_sd);
35398 + if (result)
35399 + break;
35400 + }
35401 +
35402 + /* the below does up(sbinfo->delete_mutex). Do not get folled */
35403 + reiser4_release_reserved(inode->i_sb);
35404 +
35405 + /* reiser4_cut_tree_object() was interrupted probably because
35406 + * current atom requires commit, we have to release
35407 + * transaction handle to allow atom commit. */
35408 + reiser4_txn_restart_current();
35409 + continue;
35410 + }
35411 + if (result
35412 + && !(result == CBK_COORD_NOTFOUND && new_size == 0
35413 + && inode->i_size == 0))
35414 + break;
35415 +
35416 + set_key_offset(&smallest_removed, new_size);
35417 + /* Final sd update after the file gets its correct size */
35418 + result = update_actor(inode, &smallest_removed, update_sd);
35419 + break;
35420 + }
35421 +
35422 + /* the below does up(sbinfo->delete_mutex). Do not get folled */
35423 + reiser4_release_reserved(inode->i_sb);
35424 +
35425 + return result;
35426 +}
35427 +
35428 +int find_or_create_extent(struct page *page);
35429 +
35430 +/* part of truncate_file_body: it is called when truncate is used to make file
35431 + shorter */
35432 +static int shorten_file(struct inode *inode, loff_t new_size)
35433 +{
35434 + int result;
35435 + struct page *page;
35436 + int padd_from;
35437 + unsigned long index;
35438 + char *kaddr;
35439 + unix_file_info_t *uf_info;
35440 +
35441 + /*
35442 + * all items of ordinary reiser4 file are grouped together. That is why
35443 + * we can use reiser4_cut_tree. Plan B files (for instance) can not be
35444 + * truncated that simply
35445 + */
35446 + result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
35447 + get_key_offset(reiser4_max_key()),
35448 + reiser4_update_file_size);
35449 + if (result)
35450 + return result;
35451 +
35452 + uf_info = unix_file_inode_data(inode);
35453 + assert("vs-1105", new_size == inode->i_size);
35454 + if (new_size == 0) {
35455 + uf_info->container = UF_CONTAINER_EMPTY;
35456 + return 0;
35457 + }
35458 +
35459 + result = find_file_state(inode, uf_info);
35460 + if (result)
35461 + return result;
35462 + if (uf_info->container == UF_CONTAINER_TAILS)
35463 + /*
35464 + * No need to worry about zeroing last page after new file
35465 + * end
35466 + */
35467 + return 0;
35468 +
35469 + padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
35470 + if (!padd_from)
35471 + /* file is truncated to page boundary */
35472 + return 0;
35473 +
35474 + result = reserve_partial_page(reiser4_tree_by_inode(inode));
35475 + if (result) {
35476 + reiser4_release_reserved(inode->i_sb);
35477 + return result;
35478 + }
35479 +
35480 + /* last page is partially truncated - zero its content */
35481 + index = (inode->i_size >> PAGE_CACHE_SHIFT);
35482 + page = read_mapping_page(inode->i_mapping, index, NULL);
35483 + if (IS_ERR(page)) {
35484 + /*
35485 + * the below does up(sbinfo->delete_mutex). Do not get
35486 + * confused
35487 + */
35488 + reiser4_release_reserved(inode->i_sb);
35489 + if (likely(PTR_ERR(page) == -EINVAL)) {
35490 + /* looks like file is built of tail items */
35491 + return 0;
35492 + }
35493 + return PTR_ERR(page);
35494 + }
35495 + wait_on_page_locked(page);
35496 + if (!PageUptodate(page)) {
35497 + page_cache_release(page);
35498 + /*
35499 + * the below does up(sbinfo->delete_mutex). Do not get
35500 + * confused
35501 + */
35502 + reiser4_release_reserved(inode->i_sb);
35503 + return RETERR(-EIO);
35504 + }
35505 +
35506 + /*
35507 + * if page correspons to hole extent unit - unallocated one will be
35508 + * created here. This is not necessary
35509 + */
35510 + result = find_or_create_extent(page);
35511 +
35512 + /*
35513 + * FIXME: cut_file_items has already updated inode. Probably it would
35514 + * be better to update it here when file is really truncated
35515 + */
35516 + if (result) {
35517 + page_cache_release(page);
35518 + /*
35519 + * the below does up(sbinfo->delete_mutex). Do not get
35520 + * confused
35521 + */
35522 + reiser4_release_reserved(inode->i_sb);
35523 + return result;
35524 + }
35525 +
35526 + lock_page(page);
35527 + assert("vs-1066", PageLocked(page));
35528 + kaddr = kmap_atomic(page, KM_USER0);
35529 + memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from);
35530 + flush_dcache_page(page);
35531 + kunmap_atomic(kaddr, KM_USER0);
35532 + unlock_page(page);
35533 + page_cache_release(page);
35534 + /* the below does up(sbinfo->delete_mutex). Do not get confused */
35535 + reiser4_release_reserved(inode->i_sb);
35536 + return 0;
35537 +}
35538 +
35539 +/**
35540 + * should_have_notail
35541 + * @uf_info:
35542 + * @new_size:
35543 + *
35544 + * Calls formatting plugin to see whether file of size @new_size has to be
35545 + * stored in unformatted nodes or in tail items. 0 is returned for later case.
35546 + */
35547 +static int should_have_notail(const unix_file_info_t *uf_info, loff_t new_size)
35548 +{
35549 + if (!uf_info->tplug)
35550 + return 1;
35551 + return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
35552 + new_size);
35553 +
35554 +}
35555 +
35556 +/**
35557 + * truncate_file_body - change length of file
35558 + * @inode: inode of file
35559 + * @new_size: new file length
35560 + *
35561 + * Adjusts items file @inode is built of to match @new_size. It may either cut
35562 + * items or add them to represent a hole at the end of file. The caller has to
35563 + * obtain exclusive access to the file.
35564 + */
35565 +static int truncate_file_body(struct inode *inode, loff_t new_size)
35566 +{
35567 + int result;
35568 +
35569 + if (inode->i_size < new_size) {
35570 + /* expanding truncate */
35571 + struct dentry dentry;
35572 + struct file file;
35573 + unix_file_info_t *uf_info;
35574 +
35575 + dentry.d_inode = inode;
35576 + file.f_dentry = &dentry;
35577 + file.private_data = NULL;
35578 + file.f_pos = new_size;
35579 + file.private_data = NULL;
35580 + uf_info = unix_file_inode_data(inode);
35581 + result = find_file_state(inode, uf_info);
35582 + if (result)
35583 + return result;
35584 +
35585 + if (should_have_notail(uf_info, new_size)) {
35586 + /*
35587 + * file of size @new_size has to be built of
35588 + * extents. If it is built of tails - convert to
35589 + * extents
35590 + */
35591 + if (uf_info->container == UF_CONTAINER_TAILS) {
35592 + /*
35593 + * if file is being convered by another process
35594 + * - wait until it completes
35595 + */
35596 + while (1) {
35597 + if (reiser4_inode_get_flag(inode,
35598 + REISER4_PART_IN_CONV)) {
35599 + drop_exclusive_access(uf_info);
35600 + schedule();
35601 + get_exclusive_access(uf_info);
35602 + continue;
35603 + }
35604 + break;
35605 + }
35606 +
35607 + if (uf_info->container == UF_CONTAINER_TAILS) {
35608 + result = tail2extent(uf_info);
35609 + if (result)
35610 + return result;
35611 + }
35612 + }
35613 + result = reiser4_write_extent(&file, NULL, 0,
35614 + &new_size);
35615 + if (result)
35616 + return result;
35617 + uf_info->container = UF_CONTAINER_EXTENTS;
35618 + } else {
35619 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
35620 + result = reiser4_write_extent(&file, NULL, 0,
35621 + &new_size);
35622 + if (result)
35623 + return result;
35624 + } else {
35625 + result = reiser4_write_tail(&file, NULL, 0,
35626 + &new_size);
35627 + if (result)
35628 + return result;
35629 + uf_info->container = UF_CONTAINER_TAILS;
35630 + }
35631 + }
35632 + BUG_ON(result > 0);
35633 + INODE_SET_FIELD(inode, i_size, new_size);
35634 + file_update_time(&file);
35635 + result = reiser4_update_sd(inode);
35636 + BUG_ON(result != 0);
35637 + reiser4_free_file_fsdata(&file);
35638 + } else
35639 + result = shorten_file(inode, new_size);
35640 + return result;
35641 +}
35642 +
35643 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
35644 +
35645 +/**
35646 + * load_file_hint - copy hint from struct file to local variable
35647 + * @file: file to get hint from
35648 + * @hint: structure to fill
35649 + *
35650 + * Reiser4 specific portion of struct file may contain information (hint)
35651 + * stored on exiting from previous read or write. That information includes
35652 + * seal of znode and coord within that znode where previous read or write
35653 + * stopped. This function copies that information to @hint if it was stored or
35654 + * initializes @hint by 0s otherwise.
35655 + */
35656 +int load_file_hint(struct file *file, hint_t *hint)
35657 +{
35658 + reiser4_file_fsdata *fsdata;
35659 +
35660 + if (file) {
35661 + fsdata = reiser4_get_file_fsdata(file);
35662 + if (IS_ERR(fsdata))
35663 + return PTR_ERR(fsdata);
35664 +
35665 + spin_lock_inode(file->f_dentry->d_inode);
35666 + if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
35667 + *hint = fsdata->reg.hint;
35668 + init_lh(&hint->lh);
35669 + hint->ext_coord.lh = &hint->lh;
35670 + spin_unlock_inode(file->f_dentry->d_inode);
35671 + /*
35672 + * force re-validation of the coord on the first
35673 + * iteration of the read/write loop.
35674 + */
35675 + hint->ext_coord.valid = 0;
35676 + assert("nikita-19892", coords_equal(&hint->seal.coord1,
35677 + &hint->ext_coord.
35678 + coord));
35679 + return 0;
35680 + }
35681 + memset(&fsdata->reg.hint, 0, sizeof(hint_t));
35682 + spin_unlock_inode(file->f_dentry->d_inode);
35683 + }
35684 + hint_init_zero(hint);
35685 + return 0;
35686 +}
35687 +
35688 +/**
35689 + * save_file_hint - copy hint to reiser4 private struct file's part
35690 + * @file: file to save hint in
35691 + * @hint: hint to save
35692 + *
35693 + * This copies @hint to reiser4 private part of struct file. It can help
35694 + * speedup future accesses to the file.
35695 + */
35696 +void save_file_hint(struct file *file, const hint_t *hint)
35697 +{
35698 + reiser4_file_fsdata *fsdata;
35699 +
35700 + assert("edward-1337", hint != NULL);
35701 +
35702 + if (!file || !reiser4_seal_is_set(&hint->seal))
35703 + return;
35704 + fsdata = reiser4_get_file_fsdata(file);
35705 + assert("vs-965", !IS_ERR(fsdata));
35706 + assert("nikita-19891",
35707 + coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
35708 + assert("vs-30", hint->lh.owner == NULL);
35709 + spin_lock_inode(file->f_dentry->d_inode);
35710 + fsdata->reg.hint = *hint;
35711 + spin_unlock_inode(file->f_dentry->d_inode);
35712 + return;
35713 +}
35714 +
35715 +void reiser4_unset_hint(hint_t * hint)
35716 +{
35717 + assert("vs-1315", hint);
35718 + hint->ext_coord.valid = 0;
35719 + reiser4_seal_done(&hint->seal);
35720 + done_lh(&hint->lh);
35721 +}
35722 +
35723 +/* coord must be set properly. So, that reiser4_set_hint
35724 + has nothing to do */
35725 +void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
35726 + znode_lock_mode mode)
35727 +{
35728 + ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
35729 + assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
35730 +
35731 + reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
35732 + hint->offset = get_key_offset(key);
35733 + hint->mode = mode;
35734 + done_lh(&hint->lh);
35735 +}
35736 +
35737 +int hint_is_set(const hint_t * hint)
35738 +{
35739 + return reiser4_seal_is_set(&hint->seal);
35740 +}
35741 +
35742 +#if REISER4_DEBUG
35743 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
35744 +{
35745 + return (get_key_locality(k1) == get_key_locality(k2) &&
35746 + get_key_type(k1) == get_key_type(k2) &&
35747 + get_key_band(k1) == get_key_band(k2) &&
35748 + get_key_ordering(k1) == get_key_ordering(k2) &&
35749 + get_key_objectid(k1) == get_key_objectid(k2));
35750 +}
35751 +#endif
35752 +
35753 +static int
35754 +hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35755 + znode_lock_mode lock_mode)
35756 +{
35757 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
35758 + /* hint either not set or set by different operation */
35759 + return RETERR(-E_REPEAT);
35760 +
35761 + assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
35762 +
35763 + if (check_key && get_key_offset(key) != hint->offset)
35764 + /* hint is set for different key */
35765 + return RETERR(-E_REPEAT);
35766 +
35767 + assert("vs-31", hint->ext_coord.lh == &hint->lh);
35768 + return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
35769 + hint->ext_coord.lh, lock_mode,
35770 + ZNODE_LOCK_LOPRI);
35771 +}
35772 +
35773 +/**
35774 + * find_or_create_extent -
35775 + * @page:
35776 + *
35777 + *
35778 + */
35779 +/* look for place at twig level for extent corresponding to page, call extent's writepage method to create
35780 + unallocated extent if it does not exist yet, initialize jnode, capture page */
35781 +int find_or_create_extent(struct page *page)
35782 +{
35783 + int result;
35784 + struct inode *inode;
35785 + int plugged_hole;
35786 +
35787 + jnode *node;
35788 +
35789 + assert("vs-1065", page->mapping && page->mapping->host);
35790 + inode = page->mapping->host;
35791 +
35792 + lock_page(page);
35793 + node = jnode_of_page(page);
35794 + if (IS_ERR(node)) {
35795 + unlock_page(page);
35796 + return PTR_ERR(node);
35797 + }
35798 + JF_SET(node, JNODE_WRITE_PREPARED);
35799 + unlock_page(page);
35800 +
35801 + if (node->blocknr == 0) {
35802 + plugged_hole = 0;
35803 + result = reiser4_update_extent(inode, node, page_offset(page),
35804 + &plugged_hole);
35805 + if (result) {
35806 + JF_CLR(node, JNODE_WRITE_PREPARED);
35807 + jput(node);
35808 + warning("", "reiser4_update_extent failed: %d", result);
35809 + return result;
35810 + }
35811 + if (plugged_hole)
35812 + reiser4_update_sd(inode);
35813 + } else {
35814 + spin_lock_jnode(node);
35815 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
35816 + BUG_ON(result != 0);
35817 + jnode_make_dirty_locked(node);
35818 + spin_unlock_jnode(node);
35819 + }
35820 +
35821 + BUG_ON(node->atom == NULL);
35822 + JF_CLR(node, JNODE_WRITE_PREPARED);
35823 + jput(node);
35824 +
35825 + if (get_current_context()->entd) {
35826 + entd_context *ent = get_entd_context(node->tree->super);
35827 +
35828 + if (ent->cur_request->page == page)
35829 + ent->cur_request->node = node;
35830 + }
35831 + return 0;
35832 +}
35833 +
35834 +/**
35835 + * has_anonymous_pages - check whether inode has pages dirtied via mmap
35836 + * @inode: inode to check
35837 + *
35838 + * Returns true if inode's mapping has dirty pages which do not belong to any
35839 + * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
35840 + * tree or were eflushed and can be found via jnodes tagged
35841 + * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
35842 + */
35843 +static int has_anonymous_pages(struct inode *inode)
35844 +{
35845 + int result;
35846 +
35847 + read_lock_irq(&inode->i_mapping->tree_lock);
35848 + result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
35849 + read_unlock_irq(&inode->i_mapping->tree_lock);
35850 + return result;
35851 +}
35852 +
35853 +/**
35854 + * capture_page_and_create_extent -
35855 + * @page: page to be captured
35856 + *
35857 + * Grabs space for extent creation and stat data update and calls function to
35858 + * do actual work.
35859 + */
35860 +static int capture_page_and_create_extent(struct page *page)
35861 +{
35862 + int result;
35863 + struct inode *inode;
35864 +
35865 + assert("vs-1084", page->mapping && page->mapping->host);
35866 + inode = page->mapping->host;
35867 + assert("vs-1139",
35868 + unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
35869 + /* page belongs to file */
35870 + assert("vs-1393",
35871 + inode->i_size > page_offset(page));
35872 +
35873 + /* page capture may require extent creation (if it does not exist yet)
35874 + and stat data's update (number of blocks changes on extent
35875 + creation) */
35876 + grab_space_enable();
35877 + result = reiser4_grab_space(2 * estimate_one_insert_into_item
35878 + (reiser4_tree_by_inode(inode)),
35879 + BA_CAN_COMMIT);
35880 + if (likely(!result))
35881 + result = find_or_create_extent(page);
35882 +
35883 + if (result != 0)
35884 + SetPageError(page);
35885 + return result;
35886 +}
35887 +
35888 +/* this is implementation of method commit_write of struct
35889 + address_space_operations for unix file plugin */
35890 +int
35891 +commit_write_unix_file(struct file *file, struct page *page,
35892 + unsigned from, unsigned to)
35893 +{
35894 + reiser4_context *ctx;
35895 + struct inode *inode;
35896 + int result;
35897 +
35898 + assert("umka-3101", file != NULL);
35899 + assert("umka-3102", page != NULL);
35900 + assert("umka-3093", PageLocked(page));
35901 +
35902 + SetPageUptodate(page);
35903 +
35904 + inode = page->mapping->host;
35905 + ctx = reiser4_init_context(page->mapping->host->i_sb);
35906 + if (IS_ERR(ctx))
35907 + return PTR_ERR(ctx);
35908 + page_cache_get(page);
35909 + unlock_page(page);
35910 + result = capture_page_and_create_extent(page);
35911 + lock_page(page);
35912 + page_cache_release(page);
35913 +
35914 + /* don't commit transaction under inode semaphore */
35915 + context_set_commit_async(ctx);
35916 + reiser4_exit_context(ctx);
35917 + return result;
35918 +}
35919 +
35920 +/*
35921 + * Support for "anonymous" pages and jnodes.
35922 + *
35923 + * When file is write-accessed through mmap pages can be dirtied from the user
35924 + * level. In this case kernel is not notified until one of following happens:
35925 + *
35926 + * (1) msync()
35927 + *
35928 + * (2) truncate() (either explicit or through unlink)
35929 + *
35930 + * (3) VM scanner starts reclaiming mapped pages, dirtying them before
35931 + * starting write-back.
35932 + *
35933 + * As a result of (3) ->writepage may be called on a dirty page without
35934 + * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
35935 + * (iozone) generate huge number of anonymous pages. Emergency flush handles
35936 + * this situation by creating jnode for anonymous page, starting IO on the
35937 + * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
35938 + * memory. Such jnode is also called anonymous.
35939 + *
35940 + * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
35941 + * tree. This is done by capture_anonymous_*() functions below.
35942 + */
35943 +
35944 +/**
35945 + * capture_anonymous_page - involve page into transaction
35946 + * @pg: page to deal with
35947 + *
35948 + * Takes care that @page has corresponding metadata in the tree, creates jnode
35949 + * for @page and captures it. On success 1 is returned.
35950 + */
35951 +static int capture_anonymous_page(struct page *page)
35952 +{
35953 + int result;
35954 +
35955 + if (PageWriteback(page))
35956 + /* FIXME: do nothing? */
35957 + return 0;
35958 +
35959 + result = capture_page_and_create_extent(page);
35960 + if (result == 0) {
35961 + result = 1;
35962 + } else
35963 + warning("nikita-3329",
35964 + "Cannot capture anon page: %i", result);
35965 +
35966 + return result;
35967 +}
35968 +
35969 +/**
35970 + * capture_anonymous_pages - find and capture pages dirtied via mmap
35971 + * @mapping: address space where to look for pages
35972 + * @index: start index
35973 + * @to_capture: maximum number of pages to capture
35974 + *
35975 + * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
35976 + * captures (involves into atom) them, returns number of captured pages,
35977 + * updates @index to next page after the last captured one.
35978 + */
35979 +static int
35980 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
35981 + unsigned int to_capture)
35982 +{
35983 + int result;
35984 + struct pagevec pvec;
35985 + unsigned int i, count;
35986 + int nr;
35987 +
35988 + pagevec_init(&pvec, 0);
35989 + count = min(pagevec_space(&pvec), to_capture);
35990 + nr = 0;
35991 +
35992 + /* find pages tagged MOVED */
35993 + write_lock_irq(&mapping->tree_lock);
35994 + pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
35995 + (void **)pvec.pages, *index, count,
35996 + PAGECACHE_TAG_REISER4_MOVED);
35997 + if (pagevec_count(&pvec) == 0) {
35998 + /*
35999 + * there are no pages tagged MOVED in mapping->page_tree
36000 + * starting from *index
36001 + */
36002 + write_unlock_irq(&mapping->tree_lock);
36003 + *index = (pgoff_t)-1;
36004 + return 0;
36005 + }
36006 +
36007 + /* clear MOVED tag for all found pages */
36008 + for (i = 0; i < pagevec_count(&pvec); i++) {
36009 + void *p;
36010 +
36011 + page_cache_get(pvec.pages[i]);
36012 + p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
36013 + PAGECACHE_TAG_REISER4_MOVED);
36014 + assert("vs-49", p == pvec.pages[i]);
36015 + }
36016 + write_unlock_irq(&mapping->tree_lock);
36017 +
36018 +
36019 + *index = pvec.pages[i - 1]->index + 1;
36020 +
36021 + for (i = 0; i < pagevec_count(&pvec); i++) {
36022 + /*
36023 + * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
36024 + * reiser4_set_page_dirty_internal which is called when jnode is
36025 + * captured
36026 + */
36027 + result = capture_anonymous_page(pvec.pages[i]);
36028 + if (result == 1)
36029 + nr++;
36030 + else {
36031 + if (result < 0) {
36032 + warning("vs-1454",
36033 + "failed to capture page: "
36034 + "result=%d, captured=%d)\n",
36035 + result, i);
36036 +
36037 + /*
36038 + * set MOVED tag to all pages which left not
36039 + * captured
36040 + */
36041 + write_lock_irq(&mapping->tree_lock);
36042 + for (; i < pagevec_count(&pvec); i ++) {
36043 + radix_tree_tag_set(&mapping->page_tree,
36044 + pvec.pages[i]->index,
36045 + PAGECACHE_TAG_REISER4_MOVED);
36046 + }
36047 + write_unlock_irq(&mapping->tree_lock);
36048 +
36049 + pagevec_release(&pvec);
36050 + return result;
36051 + } else {
36052 + /*
36053 + * result == 0. capture_anonymous_page returns
36054 + * 0 for Writeback-ed page. Set MOVED tag on
36055 + * that page
36056 + */
36057 + write_lock_irq(&mapping->tree_lock);
36058 + radix_tree_tag_set(&mapping->page_tree,
36059 + pvec.pages[i]->index,
36060 + PAGECACHE_TAG_REISER4_MOVED);
36061 + write_unlock_irq(&mapping->tree_lock);
36062 + if (i == 0)
36063 + *index = pvec.pages[0]->index;
36064 + else
36065 + *index = pvec.pages[i - 1]->index + 1;
36066 + }
36067 + }
36068 + }
36069 + pagevec_release(&pvec);
36070 + return nr;
36071 +}
36072 +
36073 +/**
36074 + * capture_anonymous_jnodes - find and capture anonymous jnodes
36075 + * @mapping: address space where to look for jnodes
36076 + * @from: start index
36077 + * @to: end index
36078 + * @to_capture: maximum number of jnodes to capture
36079 + *
36080 + * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
36081 + * the range of indexes @from-@to and captures them, returns number of captured
36082 + * jnodes, updates @from to next jnode after the last captured one.
36083 + */
36084 +static int
36085 +capture_anonymous_jnodes(struct address_space *mapping,
36086 + pgoff_t *from, pgoff_t to, int to_capture)
36087 +{
36088 + *from = to;
36089 + return 0;
36090 +}
36091 +
36092 +/*
36093 + * Commit atom of the jnode of a page.
36094 + */
36095 +static int sync_page(struct page *page)
36096 +{
36097 + int result;
36098 + do {
36099 + jnode *node;
36100 + txn_atom *atom;
36101 +
36102 + lock_page(page);
36103 + node = jprivate(page);
36104 + if (node != NULL) {
36105 + spin_lock_jnode(node);
36106 + atom = jnode_get_atom(node);
36107 + spin_unlock_jnode(node);
36108 + } else
36109 + atom = NULL;
36110 + unlock_page(page);
36111 + result = reiser4_sync_atom(atom);
36112 + } while (result == -E_REPEAT);
36113 + /*
36114 + * ZAM-FIXME-HANS: document the logic of this loop, is it just to
36115 + * handle the case where more pages get added to the atom while we are
36116 + * syncing it?
36117 + */
36118 + assert("nikita-3485", ergo(result == 0,
36119 + get_current_context()->trans->atom == NULL));
36120 + return result;
36121 +}
36122 +
36123 +/*
36124 + * Commit atoms of pages on @pages list.
36125 + * call sync_page for each page from mapping's page tree
36126 + */
36127 +static int sync_page_list(struct inode *inode)
36128 +{
36129 + int result;
36130 + struct address_space *mapping;
36131 + unsigned long from; /* start index for radix_tree_gang_lookup */
36132 + unsigned int found; /* return value for radix_tree_gang_lookup */
36133 +
36134 + mapping = inode->i_mapping;
36135 + from = 0;
36136 + result = 0;
36137 + read_lock_irq(&mapping->tree_lock);
36138 + while (result == 0) {
36139 + struct page *page;
36140 +
36141 + found =
36142 + radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
36143 + from, 1);
36144 + assert("", found < 2);
36145 + if (found == 0)
36146 + break;
36147 +
36148 + /* page may not leave radix tree because it is protected from truncating by inode->i_mutex locked by
36149 + sys_fsync */
36150 + page_cache_get(page);
36151 + read_unlock_irq(&mapping->tree_lock);
36152 +
36153 + from = page->index + 1;
36154 +
36155 + result = sync_page(page);
36156 +
36157 + page_cache_release(page);
36158 + read_lock_irq(&mapping->tree_lock);
36159 + }
36160 +
36161 + read_unlock_irq(&mapping->tree_lock);
36162 + return result;
36163 +}
36164 +
36165 +static int commit_file_atoms(struct inode *inode)
36166 +{
36167 + int result;
36168 + unix_file_info_t *uf_info;
36169 +
36170 + uf_info = unix_file_inode_data(inode);
36171 +
36172 + get_exclusive_access(uf_info);
36173 + /*
36174 + * find what items file is made from
36175 + */
36176 + result = find_file_state(inode, uf_info);
36177 + drop_exclusive_access(uf_info);
36178 + if (result != 0)
36179 + return result;
36180 +
36181 + /*
36182 + * file state cannot change because we are under ->i_mutex
36183 + */
36184 + switch (uf_info->container) {
36185 + case UF_CONTAINER_EXTENTS:
36186 + /* find_file_state might open join an atom */
36187 + reiser4_txn_restart_current();
36188 + result =
36189 + /*
36190 + * when we are called by
36191 + * filemap_fdatawrite->
36192 + * do_writepages()->
36193 + * reiser4_writepages()
36194 + *
36195 + * inode->i_mapping->dirty_pages are spices into
36196 + * ->io_pages, leaving ->dirty_pages dirty.
36197 + *
36198 + * When we are called from
36199 + * reiser4_fsync()->sync_unix_file(), we have to
36200 + * commit atoms of all pages on the ->dirty_list.
36201 + *
36202 + * So for simplicity we just commit ->io_pages and
36203 + * ->dirty_pages.
36204 + */
36205 + sync_page_list(inode);
36206 + break;
36207 + case UF_CONTAINER_TAILS:
36208 + /*
36209 + * NOTE-NIKITA probably we can be smarter for tails. For now
36210 + * just commit all existing atoms.
36211 + */
36212 + result = txnmgr_force_commit_all(inode->i_sb, 0);
36213 + break;
36214 + case UF_CONTAINER_EMPTY:
36215 + result = 0;
36216 + break;
36217 + case UF_CONTAINER_UNKNOWN:
36218 + default:
36219 + result = -EIO;
36220 + break;
36221 + }
36222 +
36223 + /*
36224 + * commit current transaction: there can be captured nodes from
36225 + * find_file_state() and finish_conversion().
36226 + */
36227 + reiser4_txn_restart_current();
36228 + return result;
36229 +}
36230 +
36231 +/**
36232 + * writepages_unix_file - writepages of struct address_space_operations
36233 + * @mapping:
36234 + * @wbc:
36235 + *
36236 + * This captures anonymous pages and anonymous jnodes. Anonymous pages are
36237 + * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
36238 + * created by reiser4_writepage.
36239 + */
36240 +int writepages_unix_file(struct address_space *mapping,
36241 + struct writeback_control *wbc)
36242 +{
36243 + int result;
36244 + unix_file_info_t *uf_info;
36245 + pgoff_t pindex, jindex, nr_pages;
36246 + long to_capture;
36247 + struct inode *inode;
36248 +
36249 + inode = mapping->host;
36250 + if (!has_anonymous_pages(inode)) {
36251 + result = 0;
36252 + goto end;
36253 + }
36254 + jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
36255 + result = 0;
36256 + nr_pages =
36257 + (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
36258 + uf_info = unix_file_inode_data(inode);
36259 +
36260 + do {
36261 + reiser4_context *ctx;
36262 +
36263 + if (wbc->sync_mode != WB_SYNC_ALL)
36264 + to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
36265 + else
36266 + to_capture = CAPTURE_APAGE_BURST;
36267 +
36268 + ctx = reiser4_init_context(inode->i_sb);
36269 + if (IS_ERR(ctx)) {
36270 + result = PTR_ERR(ctx);
36271 + break;
36272 + }
36273 + /* avoid recursive calls to ->sync_inodes */
36274 + ctx->nobalance = 1;
36275 + assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
36276 + assert("", LOCK_CNT_NIL(inode_sem_w));
36277 + assert("", LOCK_CNT_NIL(inode_sem_r));
36278 +
36279 + reiser4_txn_restart_current();
36280 +
36281 + /* we have to get nonexclusive access to the file */
36282 + if (get_current_context()->entd) {
36283 + /*
36284 + * use nonblocking version of nonexclusive_access to
36285 + * avoid deadlock which might look like the following:
36286 + * process P1 holds NEA on file F1 and called entd to
36287 + * reclaim some memory. Entd works for P1 and is going
36288 + * to capture pages of file F2. To do that entd has to
36289 + * get NEA to F2. F2 is held by process P2 which also
36290 + * called entd. But entd is serving P1 at the moment
36291 + * and P2 has to wait. Process P3 trying to get EA to
36292 + * file F2. Existence of pending EA request to file F2
36293 + * makes impossible for entd to get NEA to file
36294 + * F2. Neither of these process can continue. Using
36295 + * nonblocking version of gettign NEA is supposed to
36296 + * avoid this deadlock.
36297 + */
36298 + if (try_to_get_nonexclusive_access(uf_info) == 0) {
36299 + result = RETERR(-EBUSY);
36300 + reiser4_exit_context(ctx);
36301 + break;
36302 + }
36303 + } else
36304 + get_nonexclusive_access(uf_info);
36305 +
36306 + while (to_capture > 0) {
36307 + pgoff_t start;
36308 +
36309 + assert("vs-1727", jindex <= pindex);
36310 + if (pindex == jindex) {
36311 + start = pindex;
36312 + result =
36313 + capture_anonymous_pages(inode->i_mapping,
36314 + &pindex,
36315 + to_capture);
36316 + if (result <= 0)
36317 + break;
36318 + to_capture -= result;
36319 + wbc->nr_to_write -= result;
36320 + if (start + result == pindex) {
36321 + jindex = pindex;
36322 + continue;
36323 + }
36324 + if (to_capture <= 0)
36325 + break;
36326 + }
36327 + /* deal with anonymous jnodes between jindex and pindex */
36328 + result =
36329 + capture_anonymous_jnodes(inode->i_mapping, &jindex,
36330 + pindex, to_capture);
36331 + if (result < 0)
36332 + break;
36333 + to_capture -= result;
36334 + get_current_context()->nr_captured += result;
36335 +
36336 + if (jindex == (pgoff_t) - 1) {
36337 + assert("vs-1728", pindex == (pgoff_t) - 1);
36338 + break;
36339 + }
36340 + }
36341 + if (to_capture <= 0)
36342 + /* there may be left more pages */
36343 + __mark_inode_dirty(inode, I_DIRTY_PAGES);
36344 +
36345 + drop_nonexclusive_access(uf_info);
36346 + if (result < 0) {
36347 + /* error happened */
36348 + reiser4_exit_context(ctx);
36349 + return result;
36350 + }
36351 + if (wbc->sync_mode != WB_SYNC_ALL) {
36352 + reiser4_exit_context(ctx);
36353 + return 0;
36354 + }
36355 + result = commit_file_atoms(inode);
36356 + reiser4_exit_context(ctx);
36357 + if (pindex >= nr_pages && jindex == pindex)
36358 + break;
36359 + } while (1);
36360 +
36361 + end:
36362 + if (is_in_reiser4_context()) {
36363 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
36364 + /*
36365 + * there are already pages to flush, flush them out, do
36366 + * not delay until end of reiser4_sync_inodes
36367 + */
36368 + reiser4_writeout(inode->i_sb, wbc);
36369 + get_current_context()->nr_captured = 0;
36370 + }
36371 + }
36372 + return result;
36373 +}
36374 +
36375 +/*
36376 + * ->sync() method for unix file.
36377 + *
36378 + * We are trying to be smart here. Instead of committing all atoms (original
36379 + * solution), we scan dirty pages of this file and commit all atoms they are
36380 + * part of.
36381 + *
36382 + * Situation is complicated by anonymous pages: i.e., extent-less pages
36383 + * dirtied through mmap. Fortunately sys_fsync() first calls
36384 + * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
36385 + * all missing extents and capture anonymous pages.
36386 + */
36387 +int sync_unix_file(struct file *file, struct dentry *dentry, int datasync)
36388 +{
36389 + reiser4_context *ctx;
36390 + txn_atom *atom;
36391 + reiser4_block_nr reserve;
36392 +
36393 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
36394 + if (IS_ERR(ctx))
36395 + return PTR_ERR(ctx);
36396 +
36397 + reserve = estimate_update_common(dentry->d_inode);
36398 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
36399 + reiser4_exit_context(ctx);
36400 + return RETERR(-ENOSPC);
36401 + }
36402 + write_sd_by_inode_common(dentry->d_inode);
36403 +
36404 + atom = get_current_atom_locked();
36405 + spin_lock_txnh(ctx->trans);
36406 + force_commit_atom(ctx->trans);
36407 + reiser4_exit_context(ctx);
36408 + return 0;
36409 +}
36410 +
36411 +/**
36412 + * readpage_unix_file_nolock - readpage of struct address_space_operations
36413 + * @file:
36414 + * @page:
36415 + *
36416 + * Compose a key and search for item containing information about @page
36417 + * data. If item is found - its readpage method is called.
36418 + */
36419 +int readpage_unix_file(struct file *file, struct page *page)
36420 +{
36421 + reiser4_context *ctx;
36422 + int result;
36423 + struct inode *inode;
36424 + reiser4_key key;
36425 + item_plugin *iplug;
36426 + hint_t *hint;
36427 + lock_handle *lh;
36428 + coord_t *coord;
36429 +
36430 + assert("vs-1062", PageLocked(page));
36431 + assert("vs-976", !PageUptodate(page));
36432 + assert("vs-1061", page->mapping && page->mapping->host);
36433 +
36434 + if (page->mapping->host->i_size <= page_offset(page)) {
36435 + /* page is out of file already */
36436 + unlock_page(page);
36437 + return -EINVAL;
36438 + }
36439 +
36440 + inode = page->mapping->host;
36441 + ctx = reiser4_init_context(inode->i_sb);
36442 + if (IS_ERR(ctx)) {
36443 + unlock_page(page);
36444 + return PTR_ERR(ctx);
36445 + }
36446 +
36447 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
36448 + if (hint == NULL) {
36449 + unlock_page(page);
36450 + reiser4_exit_context(ctx);
36451 + return RETERR(-ENOMEM);
36452 + }
36453 +
36454 + result = load_file_hint(file, hint);
36455 + if (result) {
36456 + kfree(hint);
36457 + unlock_page(page);
36458 + reiser4_exit_context(ctx);
36459 + return result;
36460 + }
36461 + lh = &hint->lh;
36462 +
36463 + /* get key of first byte of the page */
36464 + key_by_inode_and_offset_common(inode, page_offset(page), &key);
36465 +
36466 + /* look for file metadata corresponding to first byte of page */
36467 + page_cache_get(page);
36468 + unlock_page(page);
36469 + result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
36470 + lock_page(page);
36471 + page_cache_release(page);
36472 +
36473 + if (page->mapping == NULL) {
36474 + /*
36475 + * readpage allows truncate to run concurrently. Page was
36476 + * truncated while it was not locked
36477 + */
36478 + done_lh(lh);
36479 + kfree(hint);
36480 + unlock_page(page);
36481 + reiser4_txn_restart(ctx);
36482 + reiser4_exit_context(ctx);
36483 + return -EINVAL;
36484 + }
36485 +
36486 + if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
36487 + if (result == CBK_COORD_FOUND &&
36488 + hint->ext_coord.coord.between != AT_UNIT)
36489 + /* file is truncated */
36490 + result = -EINVAL;
36491 + done_lh(lh);
36492 + kfree(hint);
36493 + unlock_page(page);
36494 + reiser4_txn_restart(ctx);
36495 + reiser4_exit_context(ctx);
36496 + return result;
36497 + }
36498 +
36499 + /*
36500 + * item corresponding to page is found. It can not be removed because
36501 + * znode lock is held
36502 + */
36503 + if (PageUptodate(page)) {
36504 + done_lh(lh);
36505 + kfree(hint);
36506 + unlock_page(page);
36507 + reiser4_txn_restart(ctx);
36508 + reiser4_exit_context(ctx);
36509 + return 0;
36510 + }
36511 +
36512 + coord = &hint->ext_coord.coord;
36513 + result = zload(coord->node);
36514 + if (result) {
36515 + done_lh(lh);
36516 + kfree(hint);
36517 + unlock_page(page);
36518 + reiser4_txn_restart(ctx);
36519 + reiser4_exit_context(ctx);
36520 + return result;
36521 + }
36522 +
36523 + validate_extended_coord(&hint->ext_coord, page_offset(page));
36524 +
36525 + if (!coord_is_existing_unit(coord)) {
36526 + /* this indicates corruption */
36527 + warning("vs-280",
36528 + "Looking for page %lu of file %llu (size %lli). "
36529 + "No file items found (%d). File is corrupted?\n",
36530 + page->index, (unsigned long long)get_inode_oid(inode),
36531 + inode->i_size, result);
36532 + zrelse(coord->node);
36533 + done_lh(lh);
36534 + kfree(hint);
36535 + unlock_page(page);
36536 + reiser4_txn_restart(ctx);
36537 + reiser4_exit_context(ctx);
36538 + return RETERR(-EIO);
36539 + }
36540 +
36541 + /*
36542 + * get plugin of found item or use plugin if extent if there are no
36543 + * one
36544 + */
36545 + iplug = item_plugin_by_coord(coord);
36546 + if (iplug->s.file.readpage)
36547 + result = iplug->s.file.readpage(coord, page);
36548 + else
36549 + result = RETERR(-EINVAL);
36550 +
36551 + if (!result) {
36552 + set_key_offset(&key,
36553 + (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
36554 + /* FIXME should call reiser4_set_hint() */
36555 + reiser4_unset_hint(hint);
36556 + } else {
36557 + unlock_page(page);
36558 + reiser4_unset_hint(hint);
36559 + }
36560 + assert("vs-979",
36561 + ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
36562 + assert("vs-9791", ergo(result != 0, !PageLocked(page)));
36563 +
36564 + zrelse(coord->node);
36565 + done_lh(lh);
36566 +
36567 + save_file_hint(file, hint);
36568 + kfree(hint);
36569 +
36570 + /*
36571 + * FIXME: explain why it is needed. HINT: page allocation in write can
36572 + * not be done when atom is not NULL because reiser4_writepage can not
36573 + * kick entd and have to eflush
36574 + */
36575 + reiser4_txn_restart(ctx);
36576 + reiser4_exit_context(ctx);
36577 + return result;
36578 +}
36579 +
36580 +struct uf_readpages_context {
36581 + lock_handle lh;
36582 + coord_t coord;
36583 +};
36584 +
36585 +/* A callback function for readpages_unix_file/read_cache_pages.
36586 + * If the file is build of tails, then return error (-ENOENT).
36587 + *
36588 + * @data -- a pointer to reiser4_readpages_context object,
36589 + * to save the twig lock and the coord between
36590 + * read_cache_page iterations.
36591 + * @page -- page to start read.
36592 + */
36593 +static int uf_readpages_filler(void * data, struct page * page)
36594 +{
36595 + struct uf_readpages_context *rc = data;
36596 + jnode * node;
36597 + int ret = 0;
36598 + reiser4_extent *ext;
36599 + __u64 ext_index;
36600 + int cbk_done = 0;
36601 + struct address_space * mapping = page->mapping;
36602 +
36603 + if (PageUptodate(page)) {
36604 + unlock_page(page);
36605 + return 0;
36606 + }
36607 + if (rc->lh.node == 0) {
36608 + /* no twig lock - have to do tree search. */
36609 + reiser4_key key;
36610 + repeat:
36611 + unlock_page(page);
36612 + key_by_inode_and_offset_common(
36613 + mapping->host, page_offset(page), &key);
36614 + ret = coord_by_key(
36615 + &get_super_private(mapping->host->i_sb)->tree,
36616 + &key, &rc->coord, &rc->lh,
36617 + ZNODE_READ_LOCK, FIND_EXACT,
36618 + TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
36619 + if (ret)
36620 + return ret;
36621 + lock_page(page);
36622 + cbk_done = 1;
36623 + }
36624 + ret = zload(rc->coord.node);
36625 + if (ret) {
36626 + unlock_page(page);
36627 + return ret;
36628 + }
36629 + if (!coord_is_existing_item(&rc->coord) ||
36630 + !item_is_extent(&rc->coord)) {
36631 + zrelse(rc->coord.node);
36632 + unlock_page(page);
36633 + return RETERR(-EIO);
36634 + }
36635 + ext = extent_by_coord(&rc->coord);
36636 + ext_index = extent_unit_index(&rc->coord);
36637 + if (page->index < ext_index ||
36638 + page->index >= ext_index + extent_get_width(ext)) {
36639 + /* the page index doesn't belong to the extent unit
36640 + which the coord points to - release the lock and
36641 + repeat with tree search. */
36642 + zrelse(rc->coord.node);
36643 + done_lh(&rc->lh);
36644 + /* we can be here after a CBK call only in case of
36645 + corruption of the tree or the tree lookup algorithm bug. */
36646 + if (unlikely(cbk_done)) {
36647 + unlock_page(page);
36648 + return RETERR(-EIO);
36649 + }
36650 + goto repeat;
36651 + }
36652 + node = jnode_of_page(page);
36653 + if (unlikely(IS_ERR(node))) {
36654 + zrelse(rc->coord.node);
36655 + unlock_page(page);
36656 + return PTR_ERR(node);
36657 + }
36658 + ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
36659 + jput(node);
36660 + zrelse(rc->coord.node);
36661 + if (ret)
36662 + unlock_page(page);
36663 + return ret;
36664 +}
36665 +
36666 +/**
36667 + * readpages_unix_file - called by the readahead code, starts reading for each
36668 + * page of given list of pages
36669 + */
36670 +int readpages_unix_file(
36671 + struct file *file, struct address_space *mapping,
36672 + struct list_head *pages, unsigned nr_pages)
36673 +{
36674 + reiser4_context *ctx;
36675 + struct uf_readpages_context rc;
36676 + int ret;
36677 +
36678 + ctx = reiser4_init_context(mapping->host->i_sb);
36679 + if (IS_ERR(ctx)) {
36680 + put_pages_list(pages);
36681 + return PTR_ERR(ctx);
36682 + }
36683 + init_lh(&rc.lh);
36684 + ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc);
36685 + done_lh(&rc.lh);
36686 + context_set_commit_async(ctx);
36687 + /* close the transaction to protect further page allocation from deadlocks */
36688 + reiser4_txn_restart(ctx);
36689 + reiser4_exit_context(ctx);
36690 + return ret;
36691 +}
36692 +
36693 +static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
36694 + loff_t count UNUSED_ARG)
36695 +{
36696 + /* We should reserve one block, because of updating of the stat data
36697 + item */
36698 + assert("vs-1249",
36699 + inode_file_plugin(inode)->estimate.update ==
36700 + estimate_update_common);
36701 + return estimate_update_common(inode);
36702 +}
36703 +
36704 +/* this is called with nonexclusive access obtained, file's container can not change */
36705 +static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from to */
36706 + char __user *buf, /* address of user-space buffer */
36707 + size_t count, /* number of bytes to read */
36708 + loff_t *off)
36709 +{
36710 + int result;
36711 + struct inode *inode;
36712 + flow_t flow;
36713 + int (*read_f) (struct file *, flow_t *, hint_t *);
36714 + coord_t *coord;
36715 + znode *loaded;
36716 +
36717 + inode = file->f_dentry->d_inode;
36718 +
36719 + /* build flow */
36720 + assert("vs-1250",
36721 + inode_file_plugin(inode)->flow_by_inode ==
36722 + flow_by_inode_unix_file);
36723 + result =
36724 + flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
36725 + *off, READ_OP, &flow);
36726 + if (unlikely(result))
36727 + return result;
36728 +
36729 + /* get seal and coord sealed with it from reiser4 private data
36730 + of struct file. The coord will tell us where our last read
36731 + of this file finished, and the seal will help to determine
36732 + if that location is still valid.
36733 + */
36734 + coord = &hint->ext_coord.coord;
36735 + while (flow.length && result == 0) {
36736 + result =
36737 + find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
36738 + if (cbk_errored(result))
36739 + /* error happened */
36740 + break;
36741 +
36742 + if (coord->between != AT_UNIT) {
36743 + /* there were no items corresponding to given offset */
36744 + done_lh(hint->ext_coord.lh);
36745 + break;
36746 + }
36747 +
36748 + loaded = coord->node;
36749 + result = zload(loaded);
36750 + if (unlikely(result)) {
36751 + done_lh(hint->ext_coord.lh);
36752 + break;
36753 + }
36754 +
36755 + if (hint->ext_coord.valid == 0)
36756 + validate_extended_coord(&hint->ext_coord,
36757 + get_key_offset(&flow.key));
36758 +
36759 + assert("vs-4", hint->ext_coord.valid == 1);
36760 + assert("vs-33", hint->ext_coord.lh == &hint->lh);
36761 + /* call item's read method */
36762 + read_f = item_plugin_by_coord(coord)->s.file.read;
36763 + result = read_f(file, &flow, hint);
36764 + zrelse(loaded);
36765 + done_lh(hint->ext_coord.lh);
36766 + }
36767 +
36768 + return (count - flow.length) ? (count - flow.length) : result;
36769 +}
36770 +
36771 +static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
36772 +
36773 +/**
36774 + * read_unix_file - read of struct file_operations
36775 + * @file: file to read from
36776 + * @buf: address of user-space buffer
36777 + * @read_amount: number of bytes to read
36778 + * @off: position in file to read from
36779 + *
36780 + * This is implementation of vfs's read method of struct file_operations for
36781 + * unix file plugin.
36782 + */
36783 +ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
36784 + loff_t *off)
36785 +{
36786 + reiser4_context *ctx;
36787 + ssize_t result;
36788 + struct inode *inode;
36789 + unix_file_info_t *uf_info;
36790 +
36791 + if (unlikely(read_amount == 0))
36792 + return 0;
36793 +
36794 + assert("umka-072", file != NULL);
36795 + assert("umka-074", off != NULL);
36796 + inode = file->f_dentry->d_inode;
36797 + assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
36798 +
36799 + ctx = reiser4_init_context(inode->i_sb);
36800 + if (IS_ERR(ctx))
36801 + return PTR_ERR(ctx);
36802 + uf_info = unix_file_inode_data(inode);
36803 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
36804 + get_exclusive_access(uf_info);
36805 + result = find_file_state(inode, uf_info);
36806 + if (unlikely(result != 0))
36807 + goto out;
36808 + } else
36809 + get_nonexclusive_access(uf_info);
36810 + result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
36811 + BA_CAN_COMMIT);
36812 + if (unlikely(result != 0))
36813 + goto out;
36814 + if (uf_info->container == UF_CONTAINER_EXTENTS){
36815 + result = do_sync_read(file, buf, read_amount, off);
36816 + } else if (uf_info->container == UF_CONTAINER_TAILS ||
36817 + reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
36818 + reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
36819 + result = read_unix_file_container_tails(file, buf, read_amount, off);
36820 + } else {
36821 + assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
36822 + result = 0;
36823 + }
36824 +out:
36825 + drop_access(uf_info);
36826 + context_set_commit_async(ctx);
36827 + reiser4_exit_context(ctx);
36828 + return result;
36829 +}
36830 +
36831 +static ssize_t read_unix_file_container_tails(
36832 + struct file *file, char __user *buf, size_t read_amount, loff_t *off)
36833 +{
36834 + int result;
36835 + struct inode *inode;
36836 + hint_t *hint;
36837 + unix_file_info_t *uf_info;
36838 + size_t count, read, left;
36839 + loff_t size;
36840 +
36841 + assert("umka-072", file != NULL);
36842 + assert("umka-074", off != NULL);
36843 + inode = file->f_dentry->d_inode;
36844 + assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
36845 +
36846 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
36847 + if (hint == NULL)
36848 + return RETERR(-ENOMEM);
36849 +
36850 + result = load_file_hint(file, hint);
36851 + if (result) {
36852 + kfree(hint);
36853 + return result;
36854 + }
36855 +
36856 + left = read_amount;
36857 + count = 0;
36858 + uf_info = unix_file_inode_data(inode);
36859 + while (left > 0) {
36860 + reiser4_txn_restart_current();
36861 + size = i_size_read(inode);
36862 + if (*off >= size)
36863 + /* position to read from is past the end of file */
36864 + break;
36865 + if (*off + left > size)
36866 + left = size - *off;
36867 + /* fault in user page */
36868 + result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
36869 + if (result)
36870 + return RETERR(-EFAULT);
36871 +
36872 + read = read_file(hint, file, buf,
36873 + left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
36874 + off);
36875 + if (read < 0) {
36876 + result = read;
36877 + break;
36878 + }
36879 + left -= read;
36880 + buf += read;
36881 +
36882 + /* update position in a file */
36883 + *off += read;
36884 + /* total number of read bytes */
36885 + count += read;
36886 + }
36887 + done_lh(&hint->lh);
36888 + save_file_hint(file, hint);
36889 + kfree(hint);
36890 + if (count)
36891 + file_accessed(file);
36892 + /* return number of read bytes or error code if nothing is read */
36893 + return count ? count : result;
36894 +}
36895 +
36896 +/* This function takes care about @file's pages. First of all it checks if
36897 + filesystems readonly and if so gets out. Otherwise, it throws out all
36898 + pages of file if it was mapped for read and going to be mapped for write
36899 + and consists of tails. This is done in order to not manage few copies
36900 + of the data (first in page cache and second one in tails themselves)
36901 + for the case of mapping files consisting of tails.
36902 +
36903 + Here also tail2extent conversion is performed if it is allowed and file
36904 + is going to be written or mapped for write. This functions may be called
36905 + from write_unix_file() or mmap_unix_file(). */
36906 +static int check_pages_unix_file(struct file *file, struct inode *inode)
36907 +{
36908 + reiser4_invalidate_pages(inode->i_mapping, 0,
36909 + (inode->i_size + PAGE_CACHE_SIZE -
36910 + 1) >> PAGE_CACHE_SHIFT, 0);
36911 + return unpack(file, inode, 0 /* not forever */ );
36912 +}
36913 +
36914 +/**
36915 + * mmap_unix_file - mmap of struct file_operations
36916 + * @file: file to mmap
36917 + * @vma:
36918 + *
36919 + * This is implementation of vfs's mmap method of struct file_operations for
36920 + * unix file plugin. It converts file to extent if necessary. Sets
36921 + * reiser4_inode's flag - REISER4_HAS_MMAP.
36922 + */
36923 +int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
36924 +{
36925 + reiser4_context *ctx;
36926 + int result;
36927 + struct inode *inode;
36928 + unix_file_info_t *uf_info;
36929 + reiser4_block_nr needed;
36930 +
36931 + inode = file->f_dentry->d_inode;
36932 + ctx = reiser4_init_context(inode->i_sb);
36933 + if (IS_ERR(ctx))
36934 + return PTR_ERR(ctx);
36935 +
36936 + uf_info = unix_file_inode_data(inode);
36937 +
36938 + get_exclusive_access_careful(uf_info, inode);
36939 +
36940 + if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
36941 + /*
36942 + * we need file built of extent items. If it is still built of
36943 + * tail items we have to convert it. Find what items the file
36944 + * is built of
36945 + */
36946 + result = find_file_state(inode, uf_info);
36947 + if (result != 0) {
36948 + drop_exclusive_access(uf_info);
36949 + reiser4_exit_context(ctx);
36950 + return result;
36951 + }
36952 +
36953 + assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
36954 + uf_info->container == UF_CONTAINER_EXTENTS ||
36955 + uf_info->container == UF_CONTAINER_EMPTY));
36956 + if (uf_info->container == UF_CONTAINER_TAILS) {
36957 + /*
36958 + * invalidate all pages and convert file from tails to
36959 + * extents
36960 + */
36961 + result = check_pages_unix_file(file, inode);
36962 + if (result) {
36963 + drop_exclusive_access(uf_info);
36964 + reiser4_exit_context(ctx);
36965 + return result;
36966 + }
36967 + }
36968 + }
36969 +
36970 + /*
36971 + * generic_file_mmap will do update_atime. Grab space for stat data
36972 + * update.
36973 + */
36974 + needed = inode_file_plugin(inode)->estimate.update(inode);
36975 + result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
36976 + if (result) {
36977 + drop_exclusive_access(uf_info);
36978 + reiser4_exit_context(ctx);
36979 + return result;
36980 + }
36981 +
36982 + result = generic_file_mmap(file, vma);
36983 + if (result == 0) {
36984 + /* mark file as having mapping. */
36985 + reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
36986 + }
36987 +
36988 + drop_exclusive_access(uf_info);
36989 + reiser4_exit_context(ctx);
36990 + return result;
36991 +}
36992 +
36993 +/**
36994 + * find_first_item
36995 + * @inode:
36996 + *
36997 + * Finds file item which is responsible for first byte in the file.
36998 + */
36999 +static int find_first_item(struct inode *inode)
37000 +{
37001 + coord_t coord;
37002 + lock_handle lh;
37003 + reiser4_key key;
37004 + int result;
37005 +
37006 + coord_init_zero(&coord);
37007 + init_lh(&lh);
37008 + inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
37009 + result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
37010 + inode);
37011 + if (result == CBK_COORD_FOUND) {
37012 + if (coord.between == AT_UNIT) {
37013 + result = zload(coord.node);
37014 + if (result == 0) {
37015 + result = item_id_by_coord(&coord);
37016 + zrelse(coord.node);
37017 + if (result != EXTENT_POINTER_ID &&
37018 + result != FORMATTING_ID)
37019 + result = RETERR(-EIO);
37020 + }
37021 + } else
37022 + result = RETERR(-EIO);
37023 + }
37024 + done_lh(&lh);
37025 + return result;
37026 +}
37027 +
37028 +/**
37029 + * open_unix_file
37030 + * @inode:
37031 + * @file:
37032 + *
37033 + * If filesystem is not readonly - complete uncompleted tail conversion if
37034 + * there was one
37035 + */
37036 +int open_unix_file(struct inode *inode, struct file *file)
37037 +{
37038 + int result;
37039 + reiser4_context *ctx;
37040 + unix_file_info_t *uf_info;
37041 +
37042 + if (IS_RDONLY(inode))
37043 + return 0;
37044 +
37045 + if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
37046 + return 0;
37047 +
37048 + ctx = reiser4_init_context(inode->i_sb);
37049 + if (IS_ERR(ctx))
37050 + return PTR_ERR(ctx);
37051 +
37052 + uf_info = unix_file_inode_data(inode);
37053 +
37054 + get_exclusive_access_careful(uf_info, inode);
37055 +
37056 + if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37057 + /*
37058 + * other process completed the conversion
37059 + */
37060 + drop_exclusive_access(uf_info);
37061 + reiser4_exit_context(ctx);
37062 + return 0;
37063 + }
37064 +
37065 + /*
37066 + * file left in semi converted state after unclean shutdown or another
37067 + * thread is doing conversion and dropped exclusive access while doing
37068 + * balance dirty pages. Complete the conversion
37069 + */
37070 + result = find_first_item(inode);
37071 + if (result == EXTENT_POINTER_ID)
37072 + /*
37073 + * first item is extent, therefore there was incomplete
37074 + * tail2extent conversion. Complete it
37075 + */
37076 + result = tail2extent(unix_file_inode_data(inode));
37077 + else if (result == FORMATTING_ID)
37078 + /*
37079 + * first item is formatting item, therefore there was
37080 + * incomplete extent2tail conversion. Complete it
37081 + */
37082 + result = extent2tail(unix_file_inode_data(inode));
37083 + else
37084 + result = -EIO;
37085 +
37086 + assert("vs-1712",
37087 + ergo(result == 0,
37088 + (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
37089 + !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
37090 + drop_exclusive_access(uf_info);
37091 + reiser4_exit_context(ctx);
37092 + return result;
37093 +}
37094 +
37095 +#define NEITHER_OBTAINED 0
37096 +#define EA_OBTAINED 1
37097 +#define NEA_OBTAINED 2
37098 +
37099 +static void drop_access(unix_file_info_t *uf_info)
37100 +{
37101 + if (uf_info->exclusive_use)
37102 + drop_exclusive_access(uf_info);
37103 + else
37104 + drop_nonexclusive_access(uf_info);
37105 +}
37106 +
37107 +#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
37108 + __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
37109 +
37110 +/**
37111 + * write_unix_file - write of struct file_operations
37112 + * @file: file to write to
37113 + * @buf: address of user-space buffer
37114 + * @write_amount: number of bytes to write
37115 + * @off: position in file to write to
37116 + *
37117 + * This is implementation of vfs's write method of struct file_operations for
37118 + * unix file plugin.
37119 + */
37120 +ssize_t write_unix_file(struct file *file, const char __user *buf,
37121 + size_t count, loff_t *pos)
37122 +{
37123 + int result;
37124 + reiser4_context *ctx;
37125 + struct inode *inode;
37126 + unix_file_info_t *uf_info;
37127 + ssize_t written;
37128 + int try_free_space;
37129 + int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
37130 + size_t left;
37131 + ssize_t (*write_op)(struct file *, const char __user *, size_t,
37132 + loff_t *pos);
37133 + int ea;
37134 + loff_t new_size;
37135 +
37136 + inode = file->f_dentry->d_inode;
37137 + ctx = reiser4_init_context(inode->i_sb);
37138 + if (IS_ERR(ctx))
37139 + return PTR_ERR(ctx);
37140 +
37141 + mutex_lock(&inode->i_mutex);
37142 +
37143 + assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37144 + assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
37145 +
37146 + /* check amount of bytes to write and writing position */
37147 + result = generic_write_checks(file, pos, &count, 0);
37148 + if (result) {
37149 + mutex_unlock(&inode->i_mutex);
37150 + context_set_commit_async(ctx);
37151 + reiser4_exit_context(ctx);
37152 + return result;
37153 + }
37154 +
37155 + result = remove_suid(file->f_dentry);
37156 + if (result) {
37157 + mutex_unlock(&inode->i_mutex);
37158 + context_set_commit_async(ctx);
37159 + reiser4_exit_context(ctx);
37160 + return result;
37161 + }
37162 +
37163 + uf_info = unix_file_inode_data(inode);
37164 +
37165 + current->backing_dev_info = inode->i_mapping->backing_dev_info;
37166 + written = 0;
37167 + try_free_space = 0;
37168 + left = count;
37169 + ea = NEITHER_OBTAINED;
37170 +
37171 + new_size = i_size_read(inode);
37172 + if (*pos + count > new_size)
37173 + new_size = *pos + count;
37174 +
37175 + while (left) {
37176 + if (left < to_write)
37177 + to_write = left;
37178 +
37179 + if (uf_info->container == UF_CONTAINER_EMPTY) {
37180 + get_exclusive_access(uf_info);
37181 + ea = EA_OBTAINED;
37182 + if (uf_info->container != UF_CONTAINER_EMPTY) {
37183 + /* file is made not empty by another process */
37184 + drop_exclusive_access(uf_info);
37185 + ea = NEITHER_OBTAINED;
37186 + continue;
37187 + }
37188 + } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37189 + /*
37190 + * get exclusive access directly just to not have to
37191 + * re-obtain it if file will appear empty
37192 + */
37193 + get_exclusive_access(uf_info);
37194 + ea = EA_OBTAINED;
37195 + result = find_file_state(inode, uf_info);
37196 + if (result) {
37197 + drop_exclusive_access(uf_info);
37198 + ea = NEITHER_OBTAINED;
37199 + break;
37200 + }
37201 + } else {
37202 + get_nonexclusive_access(uf_info);
37203 + ea = NEA_OBTAINED;
37204 + }
37205 +
37206 + /* either EA or NEA is obtained. Choose item write method */
37207 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
37208 + /* file is built of extent items */
37209 + write_op = reiser4_write_extent;
37210 + } else if (uf_info->container == UF_CONTAINER_EMPTY) {
37211 + /* file is empty */
37212 + if (should_have_notail(uf_info, new_size))
37213 + write_op = reiser4_write_extent;
37214 + else
37215 + write_op = reiser4_write_tail;
37216 + } else {
37217 + /* file is built of tail items */
37218 + if (should_have_notail(uf_info, new_size)) {
37219 + if (ea == NEA_OBTAINED) {
37220 + drop_nonexclusive_access(uf_info);
37221 + get_exclusive_access(uf_info);
37222 + ea = EA_OBTAINED;
37223 + }
37224 + if (uf_info->container == UF_CONTAINER_TAILS) {
37225 + /*
37226 + * if file is being converted by another
37227 + * process - wait until it completes
37228 + */
37229 + while (1) {
37230 + if (reiser4_inode_get_flag(inode,
37231 + REISER4_PART_IN_CONV)) {
37232 + drop_exclusive_access(uf_info);
37233 + schedule();
37234 + get_exclusive_access(uf_info);
37235 + continue;
37236 + }
37237 + break;
37238 + }
37239 + if (uf_info->container == UF_CONTAINER_TAILS) {
37240 + result = tail2extent(uf_info);
37241 + if (result)
37242 + break;
37243 + }
37244 + }
37245 + drop_exclusive_access(uf_info);
37246 + ea = NEITHER_OBTAINED;
37247 + continue;
37248 + }
37249 + write_op = reiser4_write_tail;
37250 + }
37251 +
37252 + written = write_op(file, buf, to_write, pos);
37253 + if (written == -ENOSPC && try_free_space) {
37254 + drop_access(uf_info);
37255 + txnmgr_force_commit_all(inode->i_sb, 0);
37256 + try_free_space = 0;
37257 + continue;
37258 + }
37259 + if (written < 0) {
37260 + drop_access(uf_info);
37261 + result = written;
37262 + break;
37263 + }
37264 + /* something is written. */
37265 + if (uf_info->container == UF_CONTAINER_EMPTY) {
37266 + assert("", ea == EA_OBTAINED);
37267 + uf_info->container =
37268 + (write_op == reiser4_write_extent) ?
37269 + UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
37270 + } else {
37271 + assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
37272 + write_op == reiser4_write_extent));
37273 + assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
37274 + write_op == reiser4_write_tail));
37275 + }
37276 + if (*pos + written > inode->i_size)
37277 + INODE_SET_FIELD(inode, i_size, *pos + written);
37278 + file_update_time(file);
37279 + result = reiser4_update_sd(inode);
37280 + if (result) {
37281 + mutex_unlock(&inode->i_mutex);
37282 + current->backing_dev_info = NULL;
37283 + drop_access(uf_info);
37284 + context_set_commit_async(ctx);
37285 + reiser4_exit_context(ctx);
37286 + return result;
37287 + }
37288 + drop_access(uf_info);
37289 + ea = NEITHER_OBTAINED;
37290 + reiser4_txn_restart(ctx);
37291 + current->journal_info = NULL;
37292 + /*
37293 + * tell VM how many pages were dirtied. Maybe number of pages
37294 + * which were dirty already should not be counted
37295 + */
37296 + balance_dirty_pages_ratelimited_nr(inode->i_mapping,
37297 + (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
37298 + current->journal_info = ctx;
37299 +
37300 + left -= written;
37301 + buf += written;
37302 + *pos += written;
37303 + }
37304 +
37305 + mutex_unlock(&inode->i_mutex);
37306 +
37307 + if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
37308 + reiser4_txn_restart_current();
37309 + grab_space_enable();
37310 + result = sync_unix_file(file, file->f_dentry,
37311 + 0 /* data and stat data */ );
37312 + if (result)
37313 + warning("reiser4-7", "failed to sync file %llu",
37314 + (unsigned long long)get_inode_oid(inode));
37315 + }
37316 +
37317 + current->backing_dev_info = NULL;
37318 +
37319 + reiser4_exit_context(ctx);
37320 +
37321 + /*
37322 + * return number of written bytes or error code if nothing is
37323 + * written. Note, that it does not work correctly in case when
37324 + * sync_unix_file returns error
37325 + */
37326 + return (count - left) ? (count - left) : result;
37327 +}
37328 +
37329 +/**
37330 + * release_unix_file - release of struct file_operations
37331 + * @inode: inode of released file
37332 + * @file: file to release
37333 + *
37334 + * Implementation of release method of struct file_operations for unix file
37335 + * plugin. If last reference to inode is released - convert all extent items
37336 + * into tail items if necessary. Frees reiser4 specific file data.
37337 + */
37338 +int release_unix_file(struct inode *inode, struct file *file)
37339 +{
37340 + reiser4_context *ctx;
37341 + unix_file_info_t *uf_info;
37342 + int result;
37343 + int in_reiser4;
37344 +
37345 + in_reiser4 = is_in_reiser4_context();
37346 +
37347 + ctx = reiser4_init_context(inode->i_sb);
37348 + if (IS_ERR(ctx))
37349 + return PTR_ERR(ctx);
37350 +
37351 + result = 0;
37352 + if (in_reiser4 == 0) {
37353 + uf_info = unix_file_inode_data(inode);
37354 +
37355 + get_exclusive_access_careful(uf_info, inode);
37356 + if (atomic_read(&file->f_dentry->d_count) == 1 &&
37357 + uf_info->container == UF_CONTAINER_EXTENTS &&
37358 + !should_have_notail(uf_info, inode->i_size) &&
37359 + !rofs_inode(inode)) {
37360 + result = extent2tail(uf_info);
37361 + if (result != 0) {
37362 + warning("nikita-3233",
37363 + "Failed (%d) to convert in %s (%llu)",
37364 + result, __FUNCTION__,
37365 + (unsigned long long)
37366 + get_inode_oid(inode));
37367 + }
37368 + }
37369 + drop_exclusive_access(uf_info);
37370 + } else {
37371 + /*
37372 + we are within reiser4 context already. How is the latter
37373 + possible? Simple:
37374 +
37375 + (gdb) bt
37376 + #0 get_exclusive_access ()
37377 + #2 0xc01e56d3 in release_unix_file ()
37378 + #3 0xc01c3643 in reiser4_release ()
37379 + #4 0xc014cae0 in __fput ()
37380 + #5 0xc013ffc3 in remove_vm_struct ()
37381 + #6 0xc0141786 in exit_mmap ()
37382 + #7 0xc0118480 in mmput ()
37383 + #8 0xc0133205 in oom_kill ()
37384 + #9 0xc01332d1 in out_of_memory ()
37385 + #10 0xc013bc1d in try_to_free_pages ()
37386 + #11 0xc013427b in __alloc_pages ()
37387 + #12 0xc013f058 in do_anonymous_page ()
37388 + #13 0xc013f19d in do_no_page ()
37389 + #14 0xc013f60e in handle_mm_fault ()
37390 + #15 0xc01131e5 in do_page_fault ()
37391 + #16 0xc0104935 in error_code ()
37392 + #17 0xc025c0c6 in __copy_to_user_ll ()
37393 + #18 0xc01d496f in reiser4_read_tail ()
37394 + #19 0xc01e4def in read_unix_file ()
37395 + #20 0xc01c3504 in reiser4_read ()
37396 + #21 0xc014bd4f in vfs_read ()
37397 + #22 0xc014bf66 in sys_read ()
37398 + */
37399 + warning("vs-44", "out of memory?");
37400 + }
37401 +
37402 + reiser4_free_file_fsdata(file);
37403 +
37404 + reiser4_exit_context(ctx);
37405 + return result;
37406 +}
37407 +
37408 +static void set_file_notail(struct inode *inode)
37409 +{
37410 + reiser4_inode *state;
37411 + formatting_plugin *tplug;
37412 +
37413 + state = reiser4_inode_data(inode);
37414 + tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
37415 + force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
37416 +}
37417 +
37418 +/* if file is built of tails - convert it to extents */
37419 +static int unpack(struct file *filp, struct inode *inode, int forever)
37420 +{
37421 + int result = 0;
37422 + unix_file_info_t *uf_info;
37423 +
37424 + uf_info = unix_file_inode_data(inode);
37425 + assert("vs-1628", ea_obtained(uf_info));
37426 +
37427 + result = find_file_state(inode, uf_info);
37428 + if (result)
37429 + return result;
37430 + assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
37431 +
37432 + if (uf_info->container == UF_CONTAINER_TAILS) {
37433 + /*
37434 + * if file is being converted by another process - wait until it
37435 + * completes
37436 + */
37437 + while (1) {
37438 + if (reiser4_inode_get_flag(inode,
37439 + REISER4_PART_IN_CONV)) {
37440 + drop_exclusive_access(uf_info);
37441 + schedule();
37442 + get_exclusive_access(uf_info);
37443 + continue;
37444 + }
37445 + break;
37446 + }
37447 + if (uf_info->container == UF_CONTAINER_TAILS) {
37448 + result = tail2extent(uf_info);
37449 + if (result)
37450 + return result;
37451 + }
37452 + }
37453 + if (forever) {
37454 + /* save new formatting plugin in stat data */
37455 + __u64 tograb;
37456 +
37457 + set_file_notail(inode);
37458 +
37459 + grab_space_enable();
37460 + tograb = inode_file_plugin(inode)->estimate.update(inode);
37461 + result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
37462 + result = reiser4_update_sd(inode);
37463 + }
37464 +
37465 + return result;
37466 +}
37467 +
37468 +/* implementation of vfs' ioctl method of struct file_operations for unix file
37469 + plugin
37470 +*/
37471 +int
37472 +ioctl_unix_file(struct inode *inode, struct file *filp,
37473 + unsigned int cmd, unsigned long arg UNUSED_ARG)
37474 +{
37475 + reiser4_context *ctx;
37476 + int result;
37477 +
37478 + ctx = reiser4_init_context(inode->i_sb);
37479 + if (IS_ERR(ctx))
37480 + return PTR_ERR(ctx);
37481 +
37482 + switch (cmd) {
37483 + case REISER4_IOC_UNPACK:
37484 + get_exclusive_access(unix_file_inode_data(inode));
37485 + result = unpack(filp, inode, 1 /* forever */ );
37486 + drop_exclusive_access(unix_file_inode_data(inode));
37487 + break;
37488 +
37489 + default:
37490 + result = RETERR(-ENOSYS);
37491 + break;
37492 + }
37493 + reiser4_exit_context(ctx);
37494 + return result;
37495 +}
37496 +
37497 +/* implementation of vfs' bmap method of struct address_space_operations for unix
37498 + file plugin
37499 +*/
37500 +sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
37501 +{
37502 + reiser4_context *ctx;
37503 + sector_t result;
37504 + reiser4_key key;
37505 + coord_t coord;
37506 + lock_handle lh;
37507 + struct inode *inode;
37508 + item_plugin *iplug;
37509 + sector_t block;
37510 +
37511 + inode = mapping->host;
37512 +
37513 + ctx = reiser4_init_context(inode->i_sb);
37514 + if (IS_ERR(ctx))
37515 + return PTR_ERR(ctx);
37516 + key_by_inode_and_offset_common(inode,
37517 + (loff_t) lblock * current_blocksize,
37518 + &key);
37519 +
37520 + init_lh(&lh);
37521 + result =
37522 + find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
37523 + if (cbk_errored(result)) {
37524 + done_lh(&lh);
37525 + reiser4_exit_context(ctx);
37526 + return result;
37527 + }
37528 +
37529 + result = zload(coord.node);
37530 + if (result) {
37531 + done_lh(&lh);
37532 + reiser4_exit_context(ctx);
37533 + return result;
37534 + }
37535 +
37536 + iplug = item_plugin_by_coord(&coord);
37537 + if (iplug->s.file.get_block) {
37538 + result = iplug->s.file.get_block(&coord, lblock, &block);
37539 + if (result == 0)
37540 + result = block;
37541 + } else
37542 + result = RETERR(-EINVAL);
37543 +
37544 + zrelse(coord.node);
37545 + done_lh(&lh);
37546 + reiser4_exit_context(ctx);
37547 + return result;
37548 +}
37549 +
37550 +/**
37551 + * flow_by_inode_unix_file - initialize structure flow
37552 + * @inode: inode of file for which read or write is about
37553 + * @buf: buffer to perform read to or write from
37554 + * @user: flag showing whether @buf is user space or kernel space
37555 + * @size: size of buffer @buf
37556 + * @off: start offset for read or write
37557 + * @op: READ or WRITE
37558 + * @flow:
37559 + *
37560 + * Initializes fields of @flow: key, size of data, i/o mode (read or write).
37561 + */
37562 +int flow_by_inode_unix_file(struct inode *inode,
37563 + const char __user *buf, int user,
37564 + loff_t size, loff_t off,
37565 + rw_op op, flow_t *flow)
37566 +{
37567 + assert("nikita-1100", inode != NULL);
37568 +
37569 + flow->length = size;
37570 + memcpy(&flow->data, &buf, sizeof(buf));
37571 + flow->user = user;
37572 + flow->op = op;
37573 + assert("nikita-1931", inode_file_plugin(inode) != NULL);
37574 + assert("nikita-1932",
37575 + inode_file_plugin(inode)->key_by_inode ==
37576 + key_by_inode_and_offset_common);
37577 + /* calculate key of write position and insert it into flow->key */
37578 + return key_by_inode_and_offset_common(inode, off, &flow->key);
37579 +}
37580 +
37581 +/* plugin->u.file.set_plug_in_sd = NULL
37582 + plugin->u.file.set_plug_in_inode = NULL
37583 + plugin->u.file.create_blank_sd = NULL */
37584 +/* plugin->u.file.delete */
37585 +/*
37586 + plugin->u.file.add_link = reiser4_add_link_common
37587 + plugin->u.file.rem_link = NULL */
37588 +
37589 +/* plugin->u.file.owns_item
37590 + this is common_file_owns_item with assertion */
37591 +/* Audited by: green(2002.06.15) */
37592 +int
37593 +owns_item_unix_file(const struct inode *inode /* object to check against */ ,
37594 + const coord_t * coord /* coord to check */ )
37595 +{
37596 + int result;
37597 +
37598 + result = owns_item_common(inode, coord);
37599 + if (!result)
37600 + return 0;
37601 + if (!plugin_of_group(item_plugin_by_coord(coord),
37602 + UNIX_FILE_METADATA_ITEM_TYPE))
37603 + return 0;
37604 + assert("vs-547",
37605 + item_id_by_coord(coord) == EXTENT_POINTER_ID ||
37606 + item_id_by_coord(coord) == FORMATTING_ID);
37607 + return 1;
37608 +}
37609 +
37610 +static int setattr_truncate(struct inode *inode, struct iattr *attr)
37611 +{
37612 + int result;
37613 + int s_result;
37614 + loff_t old_size;
37615 + reiser4_tree *tree;
37616 +
37617 + inode_check_scale(inode, inode->i_size, attr->ia_size);
37618 +
37619 + old_size = inode->i_size;
37620 + tree = reiser4_tree_by_inode(inode);
37621 +
37622 + result = safe_link_grab(tree, BA_CAN_COMMIT);
37623 + if (result == 0)
37624 + result = safe_link_add(inode, SAFE_TRUNCATE);
37625 + if (result == 0)
37626 + result = truncate_file_body(inode, attr->ia_size);
37627 + if (result)
37628 + warning("vs-1588", "truncate_file failed: oid %lli, "
37629 + "old size %lld, new size %lld, retval %d",
37630 + (unsigned long long)get_inode_oid(inode),
37631 + old_size, attr->ia_size, result);
37632 +
37633 + s_result = safe_link_grab(tree, BA_CAN_COMMIT);
37634 + if (s_result == 0)
37635 + s_result =
37636 + safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
37637 + if (s_result != 0) {
37638 + warning("nikita-3417", "Cannot kill safelink %lli: %i",
37639 + (unsigned long long)get_inode_oid(inode), s_result);
37640 + }
37641 + safe_link_release(tree);
37642 + return result;
37643 +}
37644 +
37645 +/* plugin->u.file.setattr method */
37646 +/* This calls inode_setattr and if truncate is in effect it also takes
37647 + exclusive inode access to avoid races */
37648 +int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
37649 + struct iattr *attr /* change description */ )
37650 +{
37651 + int result;
37652 +
37653 + if (attr->ia_valid & ATTR_SIZE) {
37654 + reiser4_context *ctx;
37655 + unix_file_info_t *uf_info;
37656 +
37657 + /* truncate does reservation itself and requires exclusive
37658 + access obtained */
37659 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
37660 + if (IS_ERR(ctx))
37661 + return PTR_ERR(ctx);
37662 +
37663 + uf_info = unix_file_inode_data(dentry->d_inode);
37664 + get_exclusive_access_careful(uf_info, dentry->d_inode);
37665 + result = setattr_truncate(dentry->d_inode, attr);
37666 + drop_exclusive_access(uf_info);
37667 + context_set_commit_async(ctx);
37668 + reiser4_exit_context(ctx);
37669 + } else
37670 + result = reiser4_setattr_common(dentry, attr);
37671 +
37672 + return result;
37673 +}
37674 +
37675 +/* plugin->u.file.init_inode_data */
37676 +void
37677 +init_inode_data_unix_file(struct inode *inode,
37678 + reiser4_object_create_data * crd, int create)
37679 +{
37680 + unix_file_info_t *data;
37681 +
37682 + data = unix_file_inode_data(inode);
37683 + data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
37684 + init_rwsem(&data->latch);
37685 + data->tplug = inode_formatting_plugin(inode);
37686 + data->exclusive_use = 0;
37687 +
37688 +#if REISER4_DEBUG
37689 + data->ea_owner = NULL;
37690 + atomic_set(&data->nr_neas, 0);
37691 +#endif
37692 + init_inode_ordering(inode, crd, create);
37693 +}
37694 +
37695 +/**
37696 + * delete_object_unix_file - delete_object of file_plugin
37697 + * @inode: inode to be deleted
37698 + *
37699 + * Truncates file to length 0, removes stat data and safe link.
37700 + */
37701 +int delete_object_unix_file(struct inode *inode)
37702 +{
37703 + unix_file_info_t *uf_info;
37704 + int result;
37705 +
37706 + if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
37707 + return 0;
37708 +
37709 + /* truncate file body first */
37710 + uf_info = unix_file_inode_data(inode);
37711 + get_exclusive_access(uf_info);
37712 + result = truncate_file_body(inode, 0 /* size */ );
37713 + drop_exclusive_access(uf_info);
37714 +
37715 + if (result)
37716 + warning("", "failed to truncate file (%llu) on removal: %d",
37717 + get_inode_oid(inode), result);
37718 +
37719 + /* remove stat data and safe link */
37720 + return reiser4_delete_object_common(inode);
37721 +}
37722 +
37723 +/**
37724 + * sendfile_unix_file - sendfile of struct file_operations
37725 + * @file: file to be sent
37726 + * @ppos: position to start from
37727 + * @count: number of bytes to send
37728 + * @actor: function to copy data
37729 + * @target: where to copy read data
37730 + *
37731 + * Reads @count bytes from @file and calls @actor for every page read. This is
37732 + * needed for loop back devices support.
37733 + */
37734 +ssize_t
37735 +sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
37736 + read_actor_t actor, void *target)
37737 +{
37738 + reiser4_context *ctx;
37739 + ssize_t result;
37740 + struct inode *inode;
37741 + unix_file_info_t *uf_info;
37742 +
37743 + inode = file->f_dentry->d_inode;
37744 + ctx = reiser4_init_context(inode->i_sb);
37745 + if (IS_ERR(ctx))
37746 + return PTR_ERR(ctx);
37747 +
37748 + /*
37749 + * generic_file_sendfile may want to call update_atime. Grab space for
37750 + * stat data update
37751 + */
37752 + result = reiser4_grab_space(estimate_update_common(inode),
37753 + BA_CAN_COMMIT);
37754 + if (result)
37755 + goto error;
37756 + mutex_lock(&inode->i_mutex);
37757 + reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
37758 + mutex_unlock(&inode->i_mutex);
37759 +
37760 + uf_info = unix_file_inode_data(inode);
37761 + get_nonexclusive_access(uf_info);
37762 + result = generic_file_sendfile(file, ppos, count, actor, target);
37763 + drop_nonexclusive_access(uf_info);
37764 + error:
37765 + reiser4_exit_context(ctx);
37766 + return result;
37767 +}
37768 +
37769 +int
37770 +prepare_write_unix_file(struct file *file, struct page *page,
37771 + unsigned from, unsigned to)
37772 +{
37773 + reiser4_context *ctx;
37774 + unix_file_info_t *uf_info;
37775 + int ret;
37776 +
37777 + ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
37778 + if (IS_ERR(ctx))
37779 + return PTR_ERR(ctx);
37780 +
37781 + uf_info = unix_file_inode_data(file->f_dentry->d_inode);
37782 + get_exclusive_access(uf_info);
37783 + ret = find_file_state(file->f_dentry->d_inode, uf_info);
37784 + if (ret == 0) {
37785 + if (uf_info->container == UF_CONTAINER_TAILS)
37786 + ret = -EINVAL;
37787 + else
37788 + ret = do_prepare_write(file, page, from, to);
37789 + }
37790 + drop_exclusive_access(uf_info);
37791 +
37792 + /* don't commit transaction under inode semaphore */
37793 + context_set_commit_async(ctx);
37794 + reiser4_exit_context(ctx);
37795 + return ret;
37796 +}
37797 +
37798 +/*
37799 + * Local variables:
37800 + * c-indentation-style: "K&R"
37801 + * mode-name: "LC"
37802 + * c-basic-offset: 8
37803 + * tab-width: 8
37804 + * fill-column: 79
37805 + * scroll-step: 1
37806 + * End:
37807 + */
37808 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.20/fs/reiser4/plugin/file/file_conversion.c
37809 --- linux-2.6.20.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 03:00:00.000000000 +0300
37810 +++ linux-2.6.20/fs/reiser4/plugin/file/file_conversion.c 2007-05-06 14:50:43.783001971 +0400
37811 @@ -0,0 +1,594 @@
37812 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
37813 + licensing governed by reiser4/README */
37814 +
37815 +/* This file contains hooks that converts (*) cryptcompress files to unix-files,
37816 + and a set of protected (**) methods of a cryptcompress file plugin to perform
37817 + such conversion.
37818 +
37819 +(*)
37820 + The conversion is performed for incompressible files to reduce cpu and memory
37821 + usage. If first logical cluster (64K by default) of a file is incompressible,
37822 + then we make a desicion, that the whole file is incompressible.
37823 + The conversion can be enabled via installing a special compression mode
37824 + plugin (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for
37825 + details).
37826 +
37827 +(**)
37828 + The protection means serialization of critical sections (readers and writers
37829 + of @pset->file)
37830 +*/
37831 +
37832 +#include "../../inode.h"
37833 +#include "../cluster.h"
37834 +#include "file.h"
37835 +
37836 +#define conversion_enabled(inode) \
37837 + (inode_compression_mode_plugin(inode) == \
37838 + compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
37839 +
37840 +
37841 +/* Located sections (readers and writers of @pset->file) are not
37842 + permanently critical: cryptcompress file can be converted only
37843 + if the conversion is enabled (see the macrio above). And we don't
37844 + convert unix files at all.
37845 + The following helper macro is a sanity check to decide if we
37846 + need to protect a located section.
37847 +*/
37848 +#define should_protect(inode) \
37849 + (inode_file_plugin(inode) == \
37850 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
37851 + conversion_enabled(inode))
37852 +
37853 +/* All protected methods have prefix "prot" in their names.
37854 + It is convenient to construct them by usual (unprotected) ones
37855 + using the following common macros:
37856 +*/
37857 +
37858 +/* Macro for passive protection.
37859 + method_cryptcompress contains only readers */
37860 +#define PROT_PASSIVE(type, method, args) \
37861 +({ \
37862 + type _result; \
37863 + struct rw_semaphore * guard = \
37864 + &reiser4_inode_data(inode)->conv_sem; \
37865 + \
37866 + if (should_protect(inode)) { \
37867 + down_read(guard); \
37868 + if (!should_protect(inode)) \
37869 + up_read(guard); \
37870 + } \
37871 + if (inode_file_plugin(inode) == \
37872 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
37873 + _result = method ## _unix_file args; \
37874 + else \
37875 + _result = method ## _cryptcompress args; \
37876 + if (should_protect(inode)) \
37877 + up_read(guard); \
37878 + _result; \
37879 +})
37880 +
37881 +#define PROT_PASSIVE_VOID(method, args) \
37882 +({ \
37883 + struct rw_semaphore * guard = \
37884 + &reiser4_inode_data(inode)->conv_sem; \
37885 + \
37886 + if (should_protect(inode)) { \
37887 + down_read(guard); \
37888 + if (!should_protect(inode)) \
37889 + up_read(guard); \
37890 + } \
37891 + if (inode_file_plugin(inode) == \
37892 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
37893 + method ## _unix_file args; \
37894 + else \
37895 + method ## _cryptcompress args; \
37896 + if (should_protect(inode)) \
37897 + up_read(guard); \
37898 +})
37899 +
37900 +/* Macro for active protection.
37901 + active_expr contains readers and writers; after its
37902 + evaluation conversion should be disabled */
37903 +#define PROT_ACTIVE(type, method, args, active_expr) \
37904 +({ \
37905 + type _result = 0; \
37906 + struct rw_semaphore * guard = \
37907 + &reiser4_inode_data(inode)->conv_sem; \
37908 + reiser4_context * ctx = reiser4_init_context(inode->i_sb); \
37909 + if (IS_ERR(ctx)) \
37910 + return PTR_ERR(ctx); \
37911 + \
37912 + if (should_protect(inode)) { \
37913 + down_write(guard); \
37914 + if (should_protect(inode)) \
37915 + _result = active_expr; \
37916 + up_write(guard); \
37917 + } \
37918 + if (_result == 0) { \
37919 + if (inode_file_plugin(inode) == \
37920 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
37921 + _result = method ## _unix_file args; \
37922 + else \
37923 + _result = method ## _cryptcompress args; \
37924 + } \
37925 + reiser4_exit_context(ctx); \
37926 + _result; \
37927 +})
37928 +
37929 +/* Pass management to the unix-file plugin with "notail" policy */
37930 +static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
37931 +{
37932 + int result;
37933 + reiser4_inode *info;
37934 + unix_file_info_t * uf;
37935 + info = reiser4_inode_data(inode);
37936 +
37937 + result = aset_set_unsafe(&info->pset,
37938 + PSET_FILE,
37939 + (reiser4_plugin *)
37940 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
37941 + if (result)
37942 + return result;
37943 + result = aset_set_unsafe(&info->pset,
37944 + PSET_FORMATTING,
37945 + (reiser4_plugin *)
37946 + formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
37947 + if (result)
37948 + return result;
37949 + /* get rid of non-standard plugins */
37950 + info->plugin_mask &= ~cryptcompress_mask;
37951 + /* get rid of plugin stat-data extension */
37952 + info->extmask &= ~(1 << PLUGIN_STAT);
37953 +
37954 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
37955 +
37956 + /* FIXME use init_inode_data_unix_file() instead,
37957 + but aviod init_inode_ordering() */
37958 + /* Init unix-file specific part of inode */
37959 + uf = unix_file_inode_data(inode);
37960 + uf->container = UF_CONTAINER_UNKNOWN;
37961 + init_rwsem(&uf->latch);
37962 + uf->tplug = inode_formatting_plugin(inode);
37963 + uf->exclusive_use = 0;
37964 +#if REISER4_DEBUG
37965 + uf->ea_owner = NULL;
37966 + atomic_set(&uf->nr_neas, 0);
37967 +#endif
37968 + inode->i_op =
37969 + &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->inode_ops;
37970 + inode->i_fop =
37971 + &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->file_ops;
37972 + inode->i_mapping->a_ops =
37973 + &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->as_ops;
37974 + file->f_op = inode->i_fop;
37975 + return 0;
37976 +}
37977 +
37978 +#if REISER4_DEBUG
37979 +static int disabled_conversion_inode_ok(struct inode * inode)
37980 +{
37981 + __u64 extmask = reiser4_inode_data(inode)->extmask;
37982 + __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
37983 +
37984 + return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
37985 + (extmask & (1 << UNIX_STAT)) &&
37986 + (extmask & (1 << LARGE_TIMES_STAT)) &&
37987 + (extmask & (1 << PLUGIN_STAT)) &&
37988 + (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
37989 +}
37990 +#endif
37991 +
37992 +/* Assign another mode that will control
37993 + compression at flush time only */
37994 +static int disable_conversion_no_update_sd(struct inode * inode)
37995 +{
37996 + int result;
37997 + result =
37998 + force_plugin_pset(inode,
37999 + PSET_COMPRESSION_MODE,
38000 + (reiser4_plugin *)compression_mode_plugin_by_id
38001 + (LATTD_COMPRESSION_MODE_ID));
38002 + assert("edward-1500",
38003 + ergo(!result, disabled_conversion_inode_ok(inode)));
38004 + return result;
38005 +}
38006 +
38007 +/* Disable future attempts to check/convert. This function is called by
38008 + conversion hooks. */
38009 +static int disable_conversion(struct inode * inode)
38010 +{
38011 + return disable_conversion_no_update_sd(inode);
38012 +}
38013 +
38014 +static int check_position(struct inode * inode,
38015 + loff_t pos /* initial position in the file */,
38016 + reiser4_cluster_t * clust,
38017 + int * check_compress)
38018 +{
38019 + assert("edward-1505", conversion_enabled(inode));
38020 + assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
38021 + /* if file size is more then cluster size, then compressible
38022 + status must be figured out (i.e. compression was disabled,
38023 + or file plugin was converted to unix_file) */
38024 +
38025 + if (pos > inode->i_size)
38026 + /* first logical cluster will contain a (partial) hole */
38027 + return disable_conversion(inode);
38028 + if (inode->i_size == inode_cluster_size(inode))
38029 + *check_compress = 1;
38030 + return 0;
38031 +}
38032 +
38033 +static void start_check_compressibility(struct inode * inode,
38034 + reiser4_cluster_t * clust,
38035 + hint_t * hint)
38036 +{
38037 + assert("edward-1507", clust->index == 1);
38038 + assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
38039 + assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
38040 +
38041 + hint_init_zero(hint);
38042 + clust->hint = hint;
38043 + clust->index --;
38044 + clust->nr_pages = count_to_nrpages(fsize_to_count(clust, inode));
38045 +
38046 + /* first logical cluster (of index #0) must be complete */
38047 + assert("edward-1510", fsize_to_count(clust, inode) ==
38048 + inode_cluster_size(inode));
38049 +}
38050 +
38051 +static void finish_check_compressibility(struct inode * inode,
38052 + reiser4_cluster_t * clust,
38053 + hint_t * hint)
38054 +{
38055 + reiser4_unset_hint(clust->hint);
38056 + clust->hint = hint;
38057 + clust->index ++;
38058 +}
38059 +
38060 +#if REISER4_DEBUG
38061 +static int prepped_dclust_ok(hint_t * hint)
38062 +{
38063 + reiser4_key key;
38064 + coord_t * coord = &hint->ext_coord.coord;
38065 +
38066 + item_key_by_coord(coord, &key);
38067 + return (item_id_by_coord(coord) == CTAIL_ID &&
38068 + !coord_is_unprepped_ctail(coord) &&
38069 + (get_key_offset(&key) + nr_units_ctail(coord) ==
38070 + dclust_get_extension_dsize(hint)));
38071 +}
38072 +#endif
38073 +
38074 +#define fifty_persent(size) (size >> 1)
38075 +/* evaluation of data compressibility */
38076 +#define data_is_compressible(osize, isize) \
38077 + (osize < fifty_persent(isize))
38078 +
38079 +/* This is called only once per file life.
38080 + Read first logical cluster (of index #0) and estimate its compressibility.
38081 + Save estimation result in @compressible */
38082 +static int read_check_compressibility(struct inode * inode,
38083 + reiser4_cluster_t * clust,
38084 + int * compressible)
38085 +{
38086 + int i;
38087 + int result;
38088 + __u32 dst_len;
38089 + hint_t tmp_hint;
38090 + hint_t * cur_hint = clust->hint;
38091 +
38092 + start_check_compressibility(inode, clust, &tmp_hint);
38093 +
38094 + result = grab_cluster_pages(inode, clust);
38095 + if (result)
38096 + return result;
38097 + /* Read page cluster here */
38098 + for (i = 0; i < clust->nr_pages; i++) {
38099 + struct page *page = clust->pages[i];
38100 + lock_page(page);
38101 + result = do_readpage_ctail(inode, clust, page,
38102 + ZNODE_READ_LOCK);
38103 + unlock_page(page);
38104 + if (result)
38105 + goto error;
38106 + }
38107 + tfm_cluster_clr_uptodate(&clust->tc);
38108 +
38109 + cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
38110 +
38111 + if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
38112 + /* lenght of compressed data is known, no need to compress */
38113 + assert("edward-1511",
38114 + znode_is_write_locked(tmp_hint.ext_coord.coord.node));
38115 + assert("edward-1512",
38116 + WITH_DATA(tmp_hint.ext_coord.coord.node,
38117 + prepped_dclust_ok(&tmp_hint)));
38118 + dst_len = dclust_get_extension_dsize(&tmp_hint);
38119 + }
38120 + else {
38121 + tfm_cluster_t * tc = &clust->tc;
38122 + compression_plugin * cplug = inode_compression_plugin(inode);
38123 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
38124 + if (result)
38125 + goto error;
38126 + for (i = 0; i < clust->nr_pages; i++) {
38127 + char *data;
38128 + lock_page(clust->pages[i]);
38129 + BUG_ON(!PageUptodate(clust->pages[i]));
38130 + data = kmap(clust->pages[i]);
38131 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
38132 + data, PAGE_CACHE_SIZE);
38133 + kunmap(clust->pages[i]);
38134 + unlock_page(clust->pages[i]);
38135 + }
38136 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
38137 + if (result)
38138 + goto error;
38139 + result = grab_coa(tc, cplug);
38140 + if (result)
38141 + goto error;
38142 + tc->len = tc->lsize = fsize_to_count(clust, inode);
38143 + assert("edward-1513", tc->len == inode_cluster_size(inode));
38144 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
38145 + cplug->compress(get_coa(tc, cplug->h.id, tc->act),
38146 + tfm_input_data(clust), tc->len,
38147 + tfm_output_data(clust), &dst_len);
38148 + assert("edward-1514",
38149 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
38150 + }
38151 + finish_check_compressibility(inode, clust, cur_hint);
38152 + *compressible = data_is_compressible(dst_len,
38153 + inode_cluster_size(inode));
38154 + return 0;
38155 + error:
38156 + reiser4_release_cluster_pages(clust);
38157 + return result;
38158 +}
38159 +
38160 +/* Cut disk cluster of index @idx */
38161 +static int cut_disk_cluster(struct inode * inode, cloff_t idx)
38162 +{
38163 + reiser4_key from, to;
38164 + assert("edward-1515", inode_file_plugin(inode) ==
38165 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
38166 + key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
38167 + to = from;
38168 + set_key_offset(&to,
38169 + get_key_offset(&from) + inode_cluster_size(inode) - 1);
38170 + return reiser4_cut_tree(reiser4_tree_by_inode(inode),
38171 + &from, &to, inode, 0);
38172 +}
38173 +
38174 +static int reserve_cryptcompress2unixfile(struct inode *inode)
38175 +{
38176 + reiser4_block_nr unformatted_nodes;
38177 + reiser4_tree *tree;
38178 +
38179 + tree = reiser4_tree_by_inode(inode);
38180 +
38181 + /* number of unformatted nodes which will be created */
38182 + unformatted_nodes = cluster_nrpages(inode); /* N */
38183 +
38184 + /*
38185 + * space required for one iteration of extent->tail conversion:
38186 + *
38187 + * 1. kill ctail items
38188 + *
38189 + * 2. insert N unformatted nodes
38190 + *
38191 + * 3. insert N (worst-case single-block
38192 + * extents) extent units.
38193 + *
38194 + * 4. drilling to the leaf level by coord_by_key()
38195 + *
38196 + * 5. possible update of stat-data
38197 + *
38198 + */
38199 + grab_space_enable();
38200 + return reiser4_grab_space
38201 + (2 * tree->height +
38202 + unformatted_nodes +
38203 + unformatted_nodes * estimate_one_insert_into_item(tree) +
38204 + 1 + estimate_one_insert_item(tree) +
38205 + inode_file_plugin(inode)->estimate.update(inode),
38206 + BA_CAN_COMMIT);
38207 +}
38208 +
38209 +/* clear flag that indicated conversion and update
38210 + stat-data with new (unix-file - specific) info */
38211 +static int complete_file_conversion(struct inode *inode)
38212 +{
38213 + int result;
38214 +
38215 + grab_space_enable();
38216 + result =
38217 + reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
38218 + BA_CAN_COMMIT);
38219 + if (result == 0) {
38220 + reiser4_inode_clr_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
38221 + result = reiser4_update_sd(inode);
38222 + }
38223 + if (result)
38224 + warning("edward-1452",
38225 + "Converting %llu to unix-file: update sd failed (%i)",
38226 + (unsigned long long)get_inode_oid(inode), result);
38227 + return 0;
38228 +}
38229 +
38230 +
38231 +/* do conversion */
38232 +static int cryptcompress2unixfile(struct file *file, struct inode * inode,
38233 + reiser4_cluster_t * clust)
38234 +{
38235 + int i;
38236 + int result = 0;
38237 + cryptcompress_info_t *cr_info;
38238 + unix_file_info_t *uf_info;
38239 +
38240 + assert("edward-1516", clust->pages[0]->index == 0);
38241 + assert("edward-1517", clust->hint != NULL);
38242 +
38243 + /* release all cryptcompress-specific recources */
38244 + cr_info = cryptcompress_inode_data(inode);
38245 + result = reserve_cryptcompress2unixfile(inode);
38246 + if (result)
38247 + goto out;
38248 + reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
38249 + reiser4_unset_hint(clust->hint);
38250 + result = cut_disk_cluster(inode, 0);
38251 + if (result)
38252 + goto out;
38253 + /* captured jnode of cluster and assotiated resources (pages,
38254 + reserved disk space) were released by ->kill_hook() method
38255 + of the item plugin */
38256 +
38257 + result = __cryptcompress2unixfile(file, inode);
38258 + if (result)
38259 + goto out;
38260 + /* At this point file is managed by unix file plugin */
38261 +
38262 + uf_info = unix_file_inode_data(inode);
38263 +
38264 + assert("edward-1518",
38265 + ergo(jprivate(clust->pages[0]),
38266 + !jnode_is_cluster_page(jprivate(clust->pages[0]))));
38267 + for(i = 0; i < clust->nr_pages; i++) {
38268 + assert("edward-1519", clust->pages[i]);
38269 + assert("edward-1520", PageUptodate(clust->pages[i]));
38270 +
38271 + result = find_or_create_extent(clust->pages[i]);
38272 + if (result)
38273 + break;
38274 + }
38275 + if (!result) {
38276 + uf_info->container = UF_CONTAINER_EXTENTS;
38277 + complete_file_conversion(inode);
38278 + }
38279 + out:
38280 + all_grabbed2free();
38281 + if (result)
38282 + warning("edward-1453", "Failed to convert file %llu: %i",
38283 + (unsigned long long)get_inode_oid(inode), result);
38284 + return result;
38285 +}
38286 +
38287 +/* Check, then perform or disable conversion if needed */
38288 +int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
38289 + reiser4_cluster_t * clust, int * progress)
38290 +{
38291 + int result;
38292 + int check_compress = 0;
38293 + int compressible = 0;
38294 +
38295 + if (!conversion_enabled(inode))
38296 + return 0;
38297 + result = check_position(inode, pos, clust, &check_compress);
38298 + if (result || !check_compress)
38299 + return result;
38300 + result = read_check_compressibility(inode, clust, &compressible);
38301 + if (result)
38302 + return result;
38303 +
38304 + /* At this point page cluster is grabbed and uptodate */
38305 + if (!compressible) {
38306 + result = cryptcompress2unixfile(file, inode, clust);
38307 + if (result == 0)
38308 + *progress = 1;
38309 + }
38310 + else
38311 + result = disable_conversion(inode);
38312 +
38313 + reiser4_release_cluster_pages(clust);
38314 + return result;
38315 +}
38316 +
38317 +static int setattr_conversion_hook(struct inode * inode, struct iattr *attr)
38318 +{
38319 + return (attr->ia_valid & ATTR_SIZE ? disable_conversion(inode) : 0);
38320 +}
38321 +
38322 +/* Protected methods of cryptcompress file plugin constructed
38323 + by the macros above */
38324 +
38325 +/* Wrappers with active protection for:
38326 + . write_cryptcompress;
38327 + . setattr_cryptcompress;
38328 +*/
38329 +
38330 +ssize_t prot_write_cryptcompress(struct file *file, const char __user *buf,
38331 + size_t count, loff_t *off)
38332 +{
38333 + int prot = 0;
38334 + int conv = 0;
38335 + ssize_t written_cr = 0;
38336 + ssize_t written_uf = 0;
38337 + struct inode * inode = file->f_dentry->d_inode;
38338 + struct rw_semaphore * guard = &reiser4_inode_data(inode)->conv_sem;
38339 +
38340 + if (should_protect(inode)) {
38341 + prot = 1;
38342 + down_write(guard);
38343 + }
38344 + written_cr = write_cryptcompress(file, buf, count, off, &conv);
38345 + if (prot)
38346 + up_write(guard);
38347 + if (written_cr < 0)
38348 + return written_cr;
38349 + if (conv)
38350 + written_uf = write_unix_file(file, buf + written_cr,
38351 + count - written_cr, off);
38352 + return written_cr + (written_uf < 0 ? 0 : written_uf);
38353 +}
38354 +
38355 +int prot_setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
38356 +{
38357 + struct inode * inode = dentry->d_inode;
38358 + return PROT_ACTIVE(int, setattr, (dentry, attr),
38359 + setattr_conversion_hook(inode, attr));
38360 +}
38361 +
38362 +/* Wrappers with passive protection for:
38363 + . read_cryptcomperess;
38364 + . mmap_cryptcompress;
38365 + . release_cryptcompress;
38366 + . sendfile_cryptcompress;
38367 + . delete_object_cryptcompress.
38368 +*/
38369 +ssize_t prot_read_cryptcompress(struct file * file, char __user * buf,
38370 + size_t size, loff_t * off)
38371 +{
38372 + struct inode * inode = file->f_dentry->d_inode;
38373 + return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
38374 +}
38375 +
38376 +int prot_mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
38377 +{
38378 + struct inode *inode = file->f_dentry->d_inode;
38379 + return PROT_PASSIVE(int, mmap, (file, vma));
38380 +}
38381 +
38382 +int prot_release_cryptcompress(struct inode *inode, struct file *file)
38383 +{
38384 + return PROT_PASSIVE(int, release, (inode, file));
38385 +}
38386 +
38387 +ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos,
38388 + size_t count, read_actor_t actor,
38389 + void *target)
38390 +{
38391 + struct inode * inode = file->f_dentry->d_inode;
38392 + return PROT_PASSIVE(ssize_t, sendfile,
38393 + (file, ppos, count, actor, target));
38394 +}
38395 +
38396 +/*
38397 + Local variables:
38398 + c-indentation-style: "K&R"
38399 + mode-name: "LC"
38400 + c-basic-offset: 8
38401 + tab-width: 8
38402 + fill-column: 80
38403 + scroll-step: 1
38404 + End:
38405 +*/
38406 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/file.h linux-2.6.20/fs/reiser4/plugin/file/file.h
38407 --- linux-2.6.20.orig/fs/reiser4/plugin/file/file.h 1970-01-01 03:00:00.000000000 +0300
38408 +++ linux-2.6.20/fs/reiser4/plugin/file/file.h 2007-05-06 14:50:43.783001971 +0400
38409 @@ -0,0 +1,272 @@
38410 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
38411 + * reiser4/README */
38412 +
38413 +/* this file contains declarations of methods implementing
38414 + file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
38415 + and SYMLINK_FILE_PLUGIN_ID) */
38416 +
38417 +#if !defined( __REISER4_FILE_H__ )
38418 +#define __REISER4_FILE_H__
38419 +
38420 +/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */
38421 +
38422 +/* inode operations */
38423 +int setattr_unix_file(struct dentry *, struct iattr *);
38424 +
38425 +/* file operations */
38426 +ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
38427 + loff_t *off);
38428 +ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
38429 + loff_t * off);
38430 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
38431 + unsigned long arg);
38432 +int mmap_unix_file(struct file *, struct vm_area_struct *);
38433 +int open_unix_file(struct inode *, struct file *);
38434 +int release_unix_file(struct inode *, struct file *);
38435 +int sync_unix_file(struct file *, struct dentry *, int datasync);
38436 +ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count,
38437 + read_actor_t, void *target);
38438 +
38439 +/* address space operations */
38440 +int readpage_unix_file(struct file *, struct page *);
38441 +int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
38442 +int writepages_unix_file(struct address_space *, struct writeback_control *);
38443 +int prepare_write_unix_file(struct file *, struct page *, unsigned from,
38444 + unsigned to);
38445 +int commit_write_unix_file(struct file *, struct page *, unsigned from,
38446 + unsigned to);
38447 +sector_t bmap_unix_file(struct address_space *, sector_t lblock);
38448 +
38449 +/* file plugin operations */
38450 +int flow_by_inode_unix_file(struct inode *, const char __user *buf,
38451 + int user, loff_t, loff_t, rw_op, flow_t *);
38452 +int owns_item_unix_file(const struct inode *, const coord_t *);
38453 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
38454 + int create);
38455 +int delete_object_unix_file(struct inode *);
38456 +
38457 +/*
38458 + * all the write into unix file is performed by item write method. Write method
38459 + * of unix file plugin only decides which item plugin (extent or tail) and in
38460 + * which mode (one from the enum below) to call
38461 + */
38462 +typedef enum {
38463 + FIRST_ITEM = 1,
38464 + APPEND_ITEM = 2,
38465 + OVERWRITE_ITEM = 3
38466 +} write_mode_t;
38467 +
38468 +/* unix file may be in one the following states */
38469 +typedef enum {
38470 + UF_CONTAINER_UNKNOWN = 0,
38471 + UF_CONTAINER_TAILS = 1,
38472 + UF_CONTAINER_EXTENTS = 2,
38473 + UF_CONTAINER_EMPTY = 3
38474 +} file_container_t;
38475 +
38476 +struct formatting_plugin;
38477 +struct inode;
38478 +
38479 +/* unix file plugin specific part of reiser4 inode */
38480 +typedef struct unix_file_info {
38481 + /*
38482 + * this read-write lock protects file containerization change. Accesses
38483 + * which do not change file containerization (see file_container_t)
38484 + * (read, readpage, writepage, write (until tail conversion is
38485 + * involved)) take read-lock. Accesses which modify file
38486 + * containerization (truncate, conversion from tail to extent and back)
38487 + * take write-lock.
38488 + */
38489 + struct rw_semaphore latch;
38490 + /* this enum specifies which items are used to build the file */
38491 + file_container_t container;
38492 + /*
38493 + * plugin which controls when file is to be converted to extents and
38494 + * back to tail
38495 + */
38496 + struct formatting_plugin *tplug;
38497 + /* if this is set, file is in exclusive use */
38498 + int exclusive_use;
38499 +#if REISER4_DEBUG
38500 + /* pointer to task struct of thread owning exclusive access to file */
38501 + void *ea_owner;
38502 + atomic_t nr_neas;
38503 + void *last_reader;
38504 +#endif
38505 +} unix_file_info_t;
38506 +
38507 +struct unix_file_info *unix_file_inode_data(const struct inode *inode);
38508 +void get_exclusive_access(unix_file_info_t *);
38509 +void drop_exclusive_access(unix_file_info_t *);
38510 +void get_nonexclusive_access(unix_file_info_t *);
38511 +void drop_nonexclusive_access(unix_file_info_t *);
38512 +int try_to_get_nonexclusive_access(unix_file_info_t *);
38513 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
38514 + struct inode *);
38515 +int find_file_item_nohint(coord_t *, lock_handle *,
38516 + const reiser4_key *, znode_lock_mode,
38517 + struct inode *);
38518 +
38519 +int load_file_hint(struct file *, hint_t *);
38520 +void save_file_hint(struct file *, const hint_t *);
38521 +
38522 +#include "../item/extent.h"
38523 +#include "../item/tail.h"
38524 +#include "../item/ctail.h"
38525 +
38526 +struct uf_coord {
38527 + coord_t coord;
38528 + lock_handle *lh;
38529 + int valid;
38530 + union {
38531 + extent_coord_extension_t extent;
38532 + tail_coord_extension_t tail;
38533 + ctail_coord_extension_t ctail;
38534 + } extension;
38535 +};
38536 +
38537 +#include "../../forward.h"
38538 +#include "../../seal.h"
38539 +#include "../../lock.h"
38540 +
38541 +/*
38542 + * This structure is used to speed up file operations (reads and writes). A
38543 + * hint is a suggestion about where a key resolved to last time. A seal
38544 + * indicates whether a node has been modified since a hint was last recorded.
38545 + * You check the seal, and if the seal is still valid, you can use the hint
38546 + * without traversing the tree again.
38547 + */
38548 +struct hint {
38549 + seal_t seal; /* a seal over last file item accessed */
38550 + uf_coord_t ext_coord;
38551 + loff_t offset;
38552 + znode_lock_mode mode;
38553 + lock_handle lh;
38554 +};
38555 +
38556 +static inline int hint_is_valid(hint_t * hint)
38557 +{
38558 + return hint->ext_coord.valid;
38559 +}
38560 +
38561 +static inline void hint_set_valid(hint_t * hint)
38562 +{
38563 + hint->ext_coord.valid = 1;
38564 +}
38565 +
38566 +static inline void hint_clr_valid(hint_t * hint)
38567 +{
38568 + hint->ext_coord.valid = 0;
38569 +}
38570 +
38571 +int load_file_hint(struct file *, hint_t *);
38572 +void save_file_hint(struct file *, const hint_t *);
38573 +void hint_init_zero(hint_t *);
38574 +void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
38575 +int hint_is_set(const hint_t *);
38576 +void reiser4_unset_hint(hint_t *);
38577 +
38578 +int reiser4_update_file_size(struct inode *, reiser4_key *, int update_sd);
38579 +int cut_file_items(struct inode *, loff_t new_size, int update_sd,
38580 + loff_t cur_size, int (*update_actor) (struct inode *,
38581 + reiser4_key *, int));
38582 +#if REISER4_DEBUG
38583 +
38584 +/* return 1 is exclusive access is obtained, 0 - otherwise */
38585 +static inline int ea_obtained(unix_file_info_t * uf_info)
38586 +{
38587 + int ret;
38588 +
38589 + ret = down_read_trylock(&uf_info->latch);
38590 + if (ret)
38591 + up_read(&uf_info->latch);
38592 + return !ret;
38593 +}
38594 +
38595 +#endif
38596 +
38597 +/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */
38598 +int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
38599 + reiser4_object_create_data *);
38600 +void destroy_inode_symlink(struct inode *);
38601 +
38602 +/* declarations of functions implementing CRYPTCOMPRESS_FILE_PLUGIN_ID
38603 + file plugin */
38604 +
38605 +/* inode operations */
38606 +int setattr_cryptcompress(struct dentry *, struct iattr *);
38607 +int prot_setattr_cryptcompress(struct dentry *, struct iattr *);
38608 +
38609 +/* file operations */
38610 +ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount,
38611 + loff_t * off);
38612 +ssize_t prot_read_cryptcompress(struct file *, char __user *buf,
38613 + size_t read_amount, loff_t * off);
38614 +
38615 +ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38616 + loff_t * off, int * conv);
38617 +ssize_t prot_write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38618 + loff_t * off);
38619 +int mmap_cryptcompress(struct file *, struct vm_area_struct *);
38620 +int prot_mmap_cryptcompress(struct file *, struct vm_area_struct *);
38621 +ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38622 + read_actor_t actor, void *target);
38623 +ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38624 + read_actor_t actor, void *target);
38625 +
38626 +int release_cryptcompress(struct inode *, struct file *);
38627 +int prot_release_cryptcompress(struct inode *, struct file *);
38628 +
38629 +/* address space operations */
38630 +extern int readpage_cryptcompress(struct file *, struct page *);
38631 +extern int writepages_cryptcompress(struct address_space *,
38632 + struct writeback_control *);
38633 +/* file plugin operations */
38634 +int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
38635 + int user, loff_t, loff_t, rw_op, flow_t *);
38636 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
38637 +int create_cryptcompress(struct inode *, struct inode *,
38638 + reiser4_object_create_data *);
38639 +int delete_object_cryptcompress(struct inode *);
38640 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
38641 + int create);
38642 +int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
38643 + const reiser4_key * to_key,
38644 + reiser4_key * smallest_removed,
38645 + struct inode *object, int truncate,
38646 + int *progress);
38647 +void destroy_inode_cryptcompress(struct inode *);
38648 +int open_object_cryptcompress(struct inode * inode, struct file * file);
38649 +
38650 +extern reiser4_plugin_ops cryptcompress_plugin_ops;
38651 +
38652 +#define WRITE_GRANULARITY 32
38653 +
38654 +int tail2extent(unix_file_info_t *);
38655 +int extent2tail(unix_file_info_t *);
38656 +
38657 +int goto_right_neighbor(coord_t *, lock_handle *);
38658 +int find_or_create_extent(struct page *);
38659 +int equal_to_ldk(znode *, const reiser4_key *);
38660 +
38661 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
38662 +
38663 +static inline int cbk_errored(int cbk_result)
38664 +{
38665 + return (cbk_result != CBK_COORD_NOTFOUND
38666 + && cbk_result != CBK_COORD_FOUND);
38667 +}
38668 +
38669 +/* __REISER4_FILE_H__ */
38670 +#endif
38671 +
38672 +/*
38673 + * Local variables:
38674 + * c-indentation-style: "K&R"
38675 + * mode-name: "LC"
38676 + * c-basic-offset: 8
38677 + * tab-width: 8
38678 + * fill-column: 79
38679 + * scroll-step: 1
38680 + * End:
38681 +*/
38682 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/invert.c linux-2.6.20/fs/reiser4/plugin/file/invert.c
38683 --- linux-2.6.20.orig/fs/reiser4/plugin/file/invert.c 1970-01-01 03:00:00.000000000 +0300
38684 +++ linux-2.6.20/fs/reiser4/plugin/file/invert.c 2007-05-06 14:50:43.783001971 +0400
38685 @@ -0,0 +1,493 @@
38686 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
38687 +
38688 +/* Suppose you want to conveniently read and write a large variety of small files conveniently within a single emacs
38689 + buffer, without having a separate buffer for each 8 byte or so file. Inverts are the way to do that. An invert
38690 + provides you with the contents of a set of subfiles plus its own contents. It is a file which inherits other files
38691 + when you read it, and allows you to write to it and through it to the files that it inherits from. In order for it
38692 + to know which subfiles each part of your write should go into, there must be delimiters indicating that. It tries to
38693 + make that easy for you by providing those delimiters in what you read from it.
38694 +
38695 + When you read it, an invert performs an inverted assignment. Instead of taking an assignment command and writing a
38696 + bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that if executed
38697 + would create those files. But which files? Well, that must be specified in the body of the invert using a special
38698 + syntax, and that specification is called the invert of the assignment.
38699 +
38700 + When written to, an invert performs the assignment command that is written
38701 + to it, and modifies its own body to contain the invert of that
38702 + assignment.
38703 +
38704 + In other words, writing to an invert file what you have read from it
38705 + is the identity operation.
38706 +
38707 + Malformed assignments cause write errors. Partial writes are not
38708 + supported in v4.0, but will be.
38709 +
38710 + Example:
38711 +
38712 + If an invert contains:
38713 +
38714 + /filenameA/<>+"(some text stored in the invert)+/filenameB/<>
38715 +
38716 +======================
38717 +Each element in this definition should be an invert, and all files
38718 +should be called recursively - too. This is bad. If one of the
38719 +included files is not a regular or invert file, then we can't read
38720 +main file.
38721 +
38722 +I think it is possible to make it easier:
38723 +
38724 +internal structure of invert file should be like symlink file. But
38725 +read and write methods should be explicitly indicated in i/o operations..
38726 +
38727 +By default we read and write (if possible) as a symlink, and if we
38728 +specify ..invert at read time, we can likewise specify it at write time.
38729 +
38730 +example:
38731 +/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
38732 +will create /my_invert_file as invert, and will create /filenameA and /filenameB with specified body.
38733 +
38734 +read of /my_invert_file/..invert will be
38735 +/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
38736 +
38737 +but read of /my_invert_file/ will be
38738 +The contents of filenameAsome text stored in the invertThe contents of filenameB
38739 +
38740 +we can also create this file as
38741 +/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
38742 +will create /my_invert_file , and use existing files /filenameA and /filenameB.
38743 +
38744 +and when we will read it will be as previously invert file.
38745 +
38746 +This is correct?
38747 +
38748 + vv
38749 +DEMIDOV-FIXME-HANS:
38750 +
38751 +Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert
38752 +
38753 +Do you agree? Discuss it on reiserfs-list....
38754 +
38755 +-Hans
38756 +=======================
38757 +
38758 + Then a read will return:
38759 +
38760 + /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
38761 +
38762 + and a write of the line above to the invert will set the contents of
38763 + the invert and filenameA and filenameB to their original values.
38764 +
38765 + Note that the contents of an invert have no influence on the effect
38766 + of a write unless the write is a partial write (and a write of a
38767 + shorter file without using truncate first is a partial write).
38768 +
38769 + truncate() has no effect on filenameA and filenameB, it merely
38770 + resets the value of the invert.
38771 +
38772 + Writes to subfiles via the invert are implemented by preceding them
38773 + with truncates.
38774 +
38775 + Parse failures cause write failures.
38776 +
38777 + Questions to ponder: should the invert be acted on prior to file
38778 + close when writing to an open filedescriptor?
38779 +
38780 + Example:
38781 +
38782 + If an invert contains:
38783 +
38784 + "(This text and a pair of quotes are all that is here.)
38785 +
38786 +Then a read will return:
38787 +
38788 + "(This text and a pair of quotes are all that is here.)
38789 +
38790 +*/
38791 +
38792 +/* OPEN method places a struct file in memory associated with invert body
38793 + and returns something like file descriptor to the user for the future access
38794 + to the invert file.
38795 + During opening we parse the body of invert and get a list of the 'entryes'
38796 + (that describes all its subfiles) and place pointer on the first struct in
38797 + reiserfs-specific part of invert inode (arbitrary decision).
38798 +
38799 + Each subfile is described by the struct inv_entry that has a pointer @sd on
38800 + in-core based stat-data and a pointer on struct file @f (if we find that the
38801 + subfile uses more than one unformatted node (arbitrary decision), we load
38802 + struct file in memory, otherwise we load base stat-data (and maybe 1-2 bytes
38803 + of some other information we need)
38804 +
38805 + Since READ and WRITE methods for inverts were formulated in assignment
38806 + language, they don't contain arguments 'size' and 'offset' that make sense
38807 + only in ordinary read/write methods.
38808 +
38809 + READ method is a combination of two methods:
38810 + 1) ordinary read method (with offset=0, length = @f->...->i_size) for entries
38811 + with @f != 0, this method uses pointer on struct file as an argument
38812 + 2) read method for inode-less files with @sd != 0, this method uses
38813 + in-core based stat-data instead struct file as an argument.
38814 + in the first case we don't use pagecache, just copy data that we got after
38815 + cbk() into userspace.
38816 +
38817 + WRITE method for invert files is more complex.
38818 + Besides the declared WRITE-interface in assignment language above we need
38819 + to have an opportunity to edit unwrapped body of invert file with some
38820 + text editor, it means we need GENERIC WRITE METHOD for invert file:
38821 +
38822 + my_invert_file/..invert <- "string"
38823 +
38824 + this method parses "string" and looks for correct subfile signatures, also
38825 + the parsing process splits this "string" on the set of flows in accordance
38826 + with the set of subfiles specified by this signature.
38827 + The found list of signatures #S is compared with the opened one #I of invert
38828 + file. If it doesn't have this one (#I==0, it will be so for instance if we
38829 + have just created this invert file) the write method assigns the found signature
38830 + (#I=#S;) to the invert file. Then if #I==#S, generic write method splits
38831 + itself to the some write methods for ordinary or light-weight, or call itself
38832 + recursively for invert files with corresponding flows.
38833 + I am not sure, but the list of signatures looks like what mr.Demidov means
38834 + by 'delimiters'.
38835 +
38836 + The cases when #S<#I (#I<#S) (in the sense of set-theory) are also available
38837 + and cause delete (create new) subfiles (arbitrary decision - it may look
38838 + too complex, but this interface will be the most complete). The order of entries
38839 + of list #S (#I) and inherited order on #I (#S) must coincide.
38840 + The other parsing results give malformed signature that aborts READ method
38841 + and releases all resources.
38842 +
38843 + Format of subfile (entry) signature:
38844 +
38845 + "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"
38846 +
38847 + Legend:
38848 +
38849 + START_MAGIC - keyword indicates the start of subfile signature;
38850 +
38851 + <> indicates the start of 'subfile metadata', that is the pair
38852 + (TYPE="...",LOOKUP_ARG="...") in parenthesis separated by comma.
38853 +
38854 + TYPE - the string "type" indicates the start of one of the three words:
38855 + - ORDINARY_FILE,
38856 + - LIGHT_WEIGHT_FILE,
38857 + - INVERT_FILE;
38858 +
38859 + LOOKUP_ARG - lookup argument depends on previous type:
38860 + */
38861 +
38862 + /************************************************************/
38863 + /* TYPE * LOOKUP ARGUMENT */
38864 + /************************************************************/
38865 + /* LIGH_WEIGHT_FILE * stat-data key */
38866 + /************************************************************/
38867 + /* ORDINARY_FILE * filename */
38868 + /************************************************************/
38869 + /* INVERT_FILE * filename */
38870 + /************************************************************/
38871 +
38872 + /* where:
38873 + *stat-data key - the string contains stat data key of this subfile, it will be
38874 + passed to fast-access lookup method for light-weight files;
38875 + *filename - pathname of this subfile, it will be passed to VFS lookup methods
38876 + for ordinary and invert files;
38877 +
38878 + SUBFILE_BODY - data of this subfile (it will go to the flow)
38879 + END_MAGIC - the keyword indicates the end of subfile signature.
38880 +
38881 + The other symbols inside the signature are interpreted as 'unformatted content',
38882 + which is available with VFS's read_link() (arbitrary decision).
38883 +
38884 + NOTE: Parse method for a body of invert file uses mentioned signatures _without_
38885 + subfile bodies.
38886 +
38887 + Now the only unclear thing is WRITE in regular light-weight subfile A that we
38888 + can describe only in assignment language:
38889 +
38890 + A <- "some_string"
38891 +
38892 + I guess we don't want to change stat-data and body items of file A
38893 + if this file exist, and size(A) != size("some_string") because this operation is
38894 + expensive, so we only do the partial write if size(A) > size("some_string")
38895 + and do truncate of the "some_string", and then do A <- "truncated string", if
38896 + size(A) < size("some_string"). This decision is also arbitrary..
38897 + */
38898 +
38899 +/* here is infrastructure for formated flows */
38900 +
38901 +#define SUBFILE_HEADER_MAGIC 0x19196605
38902 +#define FLOW_HEADER_MAGIC 0x01194304
38903 +
38904 +#include "../plugin.h"
38905 +#include "../../debug.h"
38906 +#include "../../forward.h"
38907 +#include "../object.h"
38908 +#include "../item/item.h"
38909 +#include "../item/static_stat.h"
38910 +#include "../../dformat.h"
38911 +#include "../znode.h"
38912 +#include "../inode.h"
38913 +
38914 +#include <linux/types.h>
38915 +#include <linux/fs.h> /* for struct file */
38916 +#include <linux/list.h> /* for struct list_head */
38917 +
38918 +typedef enum {
38919 + LIGHT_WEIGHT_FILE,
38920 + ORDINARY_FILE,
38921 + INVERT_FILE
38922 +} inv_entry_type;
38923 +
38924 +typedef struct flow_header {
38925 + d32 fl_magic;
38926 + d16 fl_nr; /* number of subfiles in the flow */
38927 +};
38928 +
38929 +typedef struct subfile_header {
38930 + d32 sh_magic; /* subfile magic */
38931 + d16 sh_type; /* type of subfile: light-weight, ordinary, invert */
38932 + d16 sh_arg_len; /* length of lookup argument (filename, key) */
38933 + d32 sh_body_len; /* length of subfile body */
38934 +};
38935 +
38936 +/* functions to get/set fields of flow header */
38937 +
38938 +static void fl_set_magic(flow_header * fh, __u32 value)
38939 +{
38940 + cputod32(value, &fh->fh_magic);
38941 +}
38942 +
38943 +static __u32 fl_get_magic(flow_header * fh)
38944 +{
38945 + return d32tocpu(&fh->fh_magic);
38946 +}
38947 +static void fl_set_number(flow_header * fh, __u16 value)
38948 +{
38949 + cputod16(value, &fh->fh_nr);
38950 +}
38951 +static unsigned fl_get_number(flow_header * fh)
38952 +{
38953 + return d16tocpu(&fh->fh_nr);
38954 +}
38955 +
38956 +/* functions to get/set fields of subfile header */
38957 +
38958 +static void sh_set_magic(subfile_header * sh, __u32 value)
38959 +{
38960 + cputod32(value, &sh->sh_magic);
38961 +}
38962 +
38963 +static __u32 sh_get_magic(subfile_header * sh)
38964 +{
38965 + return d32tocpu(&sh->sh_magic);
38966 +}
38967 +static void sh_set_type(subfile_header * sh, __u16 value)
38968 +{
38969 + cputod16(value, &sh->sh_magic);
38970 +}
38971 +static unsigned sh_get_type(subfile_header * sh)
38972 +{
38973 + return d16tocpu(&sh->sh_magic);
38974 +}
38975 +static void sh_set_arg_len(subfile_header * sh, __u16 value)
38976 +{
38977 + cputod16(value, &sh->sh_arg_len);
38978 +}
38979 +static unsigned sh_get_arg_len(subfile_header * sh)
38980 +{
38981 + return d16tocpu(&sh->sh_arg_len);
38982 +}
38983 +static void sh_set_body_len(subfile_header * sh, __u32 value)
38984 +{
38985 + cputod32(value, &sh->sh_body_len);
38986 +}
38987 +
38988 +static __u32 sh_get_body_len(subfile_header * sh)
38989 +{
38990 + return d32tocpu(&sh->sh_body_len);
38991 +}
38992 +
38993 +/* in-core minimal stat-data, light-weight analog of inode */
38994 +
38995 +struct incore_sd_base {
38996 + umode_t isd_mode;
38997 + nlink_t isd_nlink;
38998 + loff_t isd_size;
38999 + char *isd_data; /* 'subflow' to write */
39000 +};
39001 +
39002 +/* open invert create a list of invert entries,
39003 + every entry is represented by structure inv_entry */
39004 +
39005 +struct inv_entry {
39006 + struct list_head *ie_list;
39007 + struct file *ie_file; /* this is NULL if the file doesn't
39008 + have unformated nodes */
39009 + struct incore_sd_base *ie_sd; /* inode-less analog of struct file */
39010 +};
39011 +
39012 +/* allocate and init invert entry */
39013 +
39014 +static struct inv_entry *allocate_inv_entry(void)
39015 +{
39016 + struct inv_entry *inv_entry;
39017 +
39018 + inv_entry = reiser4_kmalloc(sizeof(struct inv_entry), GFP_KERNEL);
39019 + if (!inv_entry)
39020 + return ERR_PTR(RETERR(-ENOMEM));
39021 + inv_entry->ie_file = NULL;
39022 + inv_entry->ie_sd = NULL;
39023 + INIT_LIST_HEAD(&inv_entry->ie_list);
39024 + return inv_entry;
39025 +}
39026 +
39027 +static int put_inv_entry(struct inv_entry *ientry)
39028 +{
39029 + int result = 0;
39030 +
39031 + assert("edward-96", ientry != NULL);
39032 + assert("edward-97", ientry->ie_list != NULL);
39033 +
39034 + list_del(ientry->ie_list);
39035 + if (ientry->ie_sd != NULL) {
39036 + kfree(ientry->ie_sd);
39037 + kfree(ientry);
39038 + }
39039 + if (ientry->ie_file != NULL)
39040 + result = filp_close(ientry->file, NULL);
39041 + return result;
39042 +}
39043 +
39044 +static int allocate_incore_sd_base(struct inv_entry *inv_entry)
39045 +{
39046 + struct incore_sd_base *isd_base assert("edward-98", inv_entry != NULL);
39047 + assert("edward-99", inv_entry->ie_inode = NULL);
39048 + assert("edward-100", inv_entry->ie_sd = NULL);
39049 +
39050 + isd_base = reiser4_kmalloc(sizeof(struct incore_sd_base), GFP_KERNEL);
39051 + if (!isd_base)
39052 + return RETERR(-ENOMEM);
39053 + inv_entry->ie_sd = isd_base;
39054 + return 0;
39055 +}
39056 +
39057 +/* this can be installed as ->init_inv_entry () method of
39058 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
39059 + Copies data from on-disk stat-data format into light-weight analog of inode .
39060 + Doesn't handle stat-data extensions. */
39061 +
39062 +static void sd_base_load(struct inv_entry *inv_entry, char *sd)
39063 +{
39064 + reiser4_stat_data_base *sd_base;
39065 +
39066 + assert("edward-101", inv_entry != NULL);
39067 + assert("edward-101", inv_entry->ie_sd != NULL);
39068 + assert("edward-102", sd != NULL);
39069 +
39070 + sd_base = (reiser4_stat_data_base *) sd;
39071 + inv_entry->incore_sd_base->isd_mode = d16tocpu(&sd_base->mode);
39072 + inv_entry->incore_sd_base->isd_nlink = d32tocpu(&sd_base->nlink);
39073 + inv_entry->incore_sd_base->isd_size = d64tocpu(&sd_base->size);
39074 + inv_entry->incore_sd_base->isd_data = NULL;
39075 +}
39076 +
39077 +/* initialise incore stat-data */
39078 +
39079 +static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
39080 +{
39081 + reiser4_plugin *plugin = item_plugin_by_coord(coord);
39082 + void *body = item_body_by_coord(coord);
39083 +
39084 + assert("edward-103", inv_entry != NULL);
39085 + assert("edward-104", plugin != NULL);
39086 + assert("edward-105", body != NULL);
39087 +
39088 + sd_base_load(inv_entry, body);
39089 +}
39090 +
39091 +/* takes a key or filename and allocates new invert_entry,
39092 + init and adds it into the list,
39093 + we use lookup_sd_by_key() for light-weight files and VFS lookup by filename */
39094 +
39095 +int get_inv_entry(struct inode *invert_inode, /* inode of invert's body */
39096 + inv_entry_type type, /* LIGHT-WEIGHT or ORDINARY */
39097 + const reiser4_key * key, /* key of invert entry stat-data */
39098 + char *filename, /* filename of the file to be opened */
39099 + int flags, int mode)
39100 +{
39101 + int result;
39102 + struct inv_entry *ientry;
39103 +
39104 + assert("edward-107", invert_inode != NULL);
39105 +
39106 + ientry = allocate_inv_entry();
39107 + if (IS_ERR(ientry))
39108 + return (PTR_ERR(ientry));
39109 +
39110 + if (type == LIGHT_WEIGHT_FILE) {
39111 + coord_t coord;
39112 + lock_handle lh;
39113 +
39114 + assert("edward-108", key != NULL);
39115 +
39116 + init_coord(&coord);
39117 + init_lh(&lh);
39118 + result =
39119 + lookup_sd_by_key(reiser4_tree_by_inode(invert_inode),
39120 + ZNODE_READ_LOCK, &coord, &lh, key);
39121 + if (result == 0)
39122 + init_incore_sd_base(ientry, coord);
39123 +
39124 + done_lh(&lh);
39125 + done_coord(&coord);
39126 + return (result);
39127 + } else {
39128 + struct file *file = filp_open(filename, flags, mode);
39129 + /* FIXME_EDWARD here we need to check if we
39130 + didn't follow into any mount point */
39131 +
39132 + assert("edward-108", filename != NULL);
39133 +
39134 + if (IS_ERR(file))
39135 + return (PTR_ERR(file));
39136 + ientry->ie_file = file;
39137 + return 0;
39138 + }
39139 +}
39140 +
39141 +/* takes inode of invert, reads the body of this invert, parses it,
39142 + opens all invert entries and return pointer on the first inv_entry */
39143 +
39144 +struct inv_entry *open_invert(struct file *invert_file)
39145 +{
39146 +
39147 +}
39148 +
39149 +ssize_t subfile_read(struct *invert_entry, flow * f)
39150 +{
39151 +
39152 +}
39153 +
39154 +ssize_t subfile_write(struct *invert_entry, flow * f)
39155 +{
39156 +
39157 +}
39158 +
39159 +ssize_t invert_read(struct *file, flow * f)
39160 +{
39161 +
39162 +}
39163 +
39164 +ssize_t invert_write(struct *file, flow * f)
39165 +{
39166 +
39167 +}
39168 +
39169 +/* Make Linus happy.
39170 + Local variables:
39171 + c-indentation-style: "K&R"
39172 + mode-name: "LC"
39173 + c-basic-offset: 8
39174 + tab-width: 8
39175 + fill-column: 120
39176 + scroll-step: 1
39177 + End:
39178 +*/
39179 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/Makefile linux-2.6.20/fs/reiser4/plugin/file/Makefile
39180 --- linux-2.6.20.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 03:00:00.000000000 +0300
39181 +++ linux-2.6.20/fs/reiser4/plugin/file/Makefile 2007-05-06 14:50:43.783001971 +0400
39182 @@ -0,0 +1,7 @@
39183 +obj-$(CONFIG_REISER4_FS) += file_plugins.o
39184 +
39185 +file_plugins-objs := \
39186 + file.o \
39187 + tail_conversion.o \
39188 + symlink.o \
39189 + cryptcompress.o
39190 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.20/fs/reiser4/plugin/file/symfile.c
39191 --- linux-2.6.20.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 03:00:00.000000000 +0300
39192 +++ linux-2.6.20/fs/reiser4/plugin/file/symfile.c 2007-05-06 14:50:43.787003221 +0400
39193 @@ -0,0 +1,87 @@
39194 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39195 +
39196 +/* Symfiles are a generalization of Unix symlinks.
39197 +
39198 + A symfile when read behaves as though you took its contents and
39199 + substituted them into the reiser4 naming system as the right hand side
39200 + of an assignment, and then read that which you had assigned to it.
39201 +
39202 + A key issue for symfiles is how to implement writes through to
39203 + subfiles. In general, one must have some method of determining what
39204 + of that which is written to the symfile is written to what subfile.
39205 + This can be done by use of custom plugin methods written by users, or
39206 + by using a few general methods we provide for those willing to endure
39207 + the insertion of delimiters into what is read.
39208 +
39209 + Writing to symfiles without delimiters to denote what is written to
39210 + what subfile is not supported by any plugins we provide in this
39211 + release. Our most sophisticated support for writes is that embodied
39212 + by the invert plugin (see invert.c).
39213 +
39214 + A read only version of the /etc/passwd file might be
39215 + constructed as a symfile whose contents are as follows:
39216 +
39217 + /etc/passwd/userlines/*
39218 +
39219 + or
39220 +
39221 + /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
39222 +
39223 + or
39224 +
39225 + /etc/passwd/userlines/(demidov+edward+reiser+root)
39226 +
39227 + A symfile with contents
39228 +
39229 + /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
39230 +
39231 + will return when read
39232 +
39233 + The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
39234 +
39235 + and write of what has been read will not be possible to implement as
39236 + an identity operation because there are no delimiters denoting the
39237 + boundaries of what is to be written to what subfile.
39238 +
39239 + Note that one could make this a read/write symfile if one specified
39240 + delimiters, and the write method understood those delimiters delimited
39241 + what was written to subfiles.
39242 +
39243 + So, specifying the symfile in a manner that allows writes:
39244 +
39245 + /etc/passwd/userlines/demidov+"(
39246 + )+/etc/passwd/userlines/edward+"(
39247 + )+/etc/passwd/userlines/reiser+"(
39248 + )+/etc/passwd/userlines/root+"(
39249 + )
39250 +
39251 + or
39252 +
39253 + /etc/passwd/userlines/(demidov+"(
39254 + )+edward+"(
39255 + )+reiser+"(
39256 + )+root+"(
39257 + ))
39258 +
39259 + and the file demidov might be specified as:
39260 +
39261 + /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
39262 +
39263 + or
39264 +
39265 + /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
39266 +
39267 + Notice that if the file demidov has a carriage return in it, the
39268 + parsing fails, but then if you put carriage returns in the wrong place
39269 + in a normal /etc/passwd file it breaks things also.
39270 +
39271 + Note that it is forbidden to have no text between two interpolations
39272 + if one wants to be able to define what parts of a write go to what
39273 + subfiles referenced in an interpolation.
39274 +
39275 + If one wants to be able to add new lines by writing to the file, one
39276 + must either write a custom plugin for /etc/passwd that knows how to
39277 + name an added line, or one must use an invert, or one must use a more
39278 + sophisticated symfile syntax that we are not planning to write for
39279 + version 4.0.
39280 +*/
39281 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.20/fs/reiser4/plugin/file/symlink.c
39282 --- linux-2.6.20.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 03:00:00.000000000 +0300
39283 +++ linux-2.6.20/fs/reiser4/plugin/file/symlink.c 2007-05-06 14:50:43.787003221 +0400
39284 @@ -0,0 +1,95 @@
39285 +/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
39286 +
39287 +#include "../../inode.h"
39288 +
39289 +#include <linux/types.h>
39290 +#include <linux/fs.h>
39291 +
39292 +/* file plugin methods specific for symlink files
39293 + (SYMLINK_FILE_PLUGIN_ID) */
39294 +
39295 +/* this is implementation of create_object method of file plugin for
39296 + SYMLINK_FILE_PLUGIN_ID
39297 + */
39298 +
39299 +/**
39300 + * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
39301 + * @symlink: inode of symlink object
39302 + * @dir: inode of parent directory
39303 + * @info: parameters of new object
39304 + *
39305 + * Inserts stat data with symlink extension where into the tree.
39306 + */
39307 +int reiser4_create_symlink(struct inode *symlink,
39308 + struct inode *dir UNUSED_ARG,
39309 + reiser4_object_create_data *data /* info passed to us
39310 + * this is filled by
39311 + * reiser4() syscall
39312 + * in particular */)
39313 +{
39314 + int result;
39315 +
39316 + assert("nikita-680", symlink != NULL);
39317 + assert("nikita-681", S_ISLNK(symlink->i_mode));
39318 + assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
39319 + assert("nikita-682", dir != NULL);
39320 + assert("nikita-684", data != NULL);
39321 + assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
39322 +
39323 + /*
39324 + * stat data of symlink has symlink extension in which we store
39325 + * symlink content, that is, path symlink is pointing to.
39326 + */
39327 + reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
39328 +
39329 + assert("vs-838", symlink->i_private == NULL);
39330 + symlink->i_private = (void *)data->name;
39331 +
39332 + assert("vs-843", symlink->i_size == 0);
39333 + INODE_SET_FIELD(symlink, i_size, strlen(data->name));
39334 +
39335 + /* insert stat data appended with data->name */
39336 + result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
39337 + if (result) {
39338 + /* FIXME-VS: Make sure that symlink->i_private is not attached
39339 + to kmalloced data */
39340 + INODE_SET_FIELD(symlink, i_size, 0);
39341 + } else {
39342 + assert("vs-849", symlink->i_private
39343 + && reiser4_inode_get_flag(symlink,
39344 + REISER4_GENERIC_PTR_USED));
39345 + assert("vs-850",
39346 + !memcmp((char *)symlink->i_private, data->name,
39347 + (size_t) symlink->i_size + 1));
39348 + }
39349 + return result;
39350 +}
39351 +
39352 +/* this is implementation of destroy_inode method of file plugin for
39353 + SYMLINK_FILE_PLUGIN_ID
39354 + */
39355 +void destroy_inode_symlink(struct inode *inode)
39356 +{
39357 + assert("edward-799",
39358 + inode_file_plugin(inode) ==
39359 + file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
39360 + assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
39361 + assert("edward-801", reiser4_inode_get_flag(inode,
39362 + REISER4_GENERIC_PTR_USED));
39363 + assert("vs-839", S_ISLNK(inode->i_mode));
39364 +
39365 + kfree(inode->i_private);
39366 + inode->i_private = NULL;
39367 + reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
39368 +}
39369 +
39370 +/*
39371 + Local variables:
39372 + c-indentation-style: "K&R"
39373 + mode-name: "LC"
39374 + c-basic-offset: 8
39375 + tab-width: 8
39376 + fill-column: 80
39377 + scroll-step: 1
39378 + End:
39379 +*/
39380 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.20/fs/reiser4/plugin/file/tail_conversion.c
39381 --- linux-2.6.20.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 03:00:00.000000000 +0300
39382 +++ linux-2.6.20/fs/reiser4/plugin/file/tail_conversion.c 2007-05-06 14:50:43.787003221 +0400
39383 @@ -0,0 +1,729 @@
39384 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39385 +
39386 +#include "../../inode.h"
39387 +#include "../../super.h"
39388 +#include "../../page_cache.h"
39389 +#include "../../carry.h"
39390 +#include "../../safe_link.h"
39391 +#include "../../vfs_ops.h"
39392 +
39393 +#include <linux/writeback.h>
39394 +
39395 +/* this file contains:
39396 + tail2extent and extent2tail */
39397 +
39398 +/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
39399 +void get_exclusive_access(unix_file_info_t * uf_info)
39400 +{
39401 + assert("nikita-3028", reiser4_schedulable());
39402 + assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
39403 + assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
39404 + /*
39405 + * "deadlock avoidance": sometimes we commit a transaction under
39406 + * rw-semaphore on a file. Such commit can deadlock with another
39407 + * thread that captured some block (hence preventing atom from being
39408 + * committed) and waits on rw-semaphore.
39409 + */
39410 + reiser4_txn_restart_current();
39411 + LOCK_CNT_INC(inode_sem_w);
39412 + down_write(&uf_info->latch);
39413 + uf_info->exclusive_use = 1;
39414 + assert("vs-1713", uf_info->ea_owner == NULL);
39415 + assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
39416 + ON_DEBUG(uf_info->ea_owner = current);
39417 +}
39418 +
39419 +void drop_exclusive_access(unix_file_info_t * uf_info)
39420 +{
39421 + assert("vs-1714", uf_info->ea_owner == current);
39422 + assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
39423 + ON_DEBUG(uf_info->ea_owner = NULL);
39424 + uf_info->exclusive_use = 0;
39425 + up_write(&uf_info->latch);
39426 + assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
39427 + assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
39428 + LOCK_CNT_DEC(inode_sem_w);
39429 + reiser4_txn_restart_current();
39430 +}
39431 +
39432 +/**
39433 + * nea_grabbed - do something when file semaphore is down_read-ed
39434 + * @uf_info:
39435 + *
39436 + * This is called when nonexclusive access is obtained on a file. All it does is
39437 + * for debugging purposes.
39438 + */
39439 +static void nea_grabbed(unix_file_info_t *uf_info)
39440 +{
39441 +#if REISER4_DEBUG
39442 + LOCK_CNT_INC(inode_sem_r);
39443 + assert("vs-1716", uf_info->ea_owner == NULL);
39444 + atomic_inc(&uf_info->nr_neas);
39445 + uf_info->last_reader = current;
39446 +#endif
39447 +}
39448 +
39449 +/**
39450 + * get_nonexclusive_access - get nonexclusive access to a file
39451 + * @uf_info: unix file specific part of inode to obtain access to
39452 + *
39453 + * Nonexclusive access is obtained on a file before read, write, readpage.
39454 + */
39455 +void get_nonexclusive_access(unix_file_info_t *uf_info)
39456 +{
39457 + assert("nikita-3029", reiser4_schedulable());
39458 + assert("nikita-3361", get_current_context()->trans->atom == NULL);
39459 +
39460 + down_read(&uf_info->latch);
39461 + nea_grabbed(uf_info);
39462 +}
39463 +
39464 +/**
39465 + * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
39466 + * @uf_info: unix file specific part of inode to obtain access to
39467 + *
39468 + * Non-blocking version of nonexclusive access obtaining.
39469 + */
39470 +int try_to_get_nonexclusive_access(unix_file_info_t *uf_info)
39471 +{
39472 + int result;
39473 +
39474 + result = down_read_trylock(&uf_info->latch);
39475 + if (result)
39476 + nea_grabbed(uf_info);
39477 + return result;
39478 +}
39479 +
39480 +void drop_nonexclusive_access(unix_file_info_t * uf_info)
39481 +{
39482 + assert("vs-1718", uf_info->ea_owner == NULL);
39483 + assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
39484 + ON_DEBUG(atomic_dec(&uf_info->nr_neas));
39485 +
39486 + up_read(&uf_info->latch);
39487 +
39488 + LOCK_CNT_DEC(inode_sem_r);
39489 + reiser4_txn_restart_current();
39490 +}
39491 +
39492 +/* part of tail2extent. Cut all items covering @count bytes starting from
39493 + @offset */
39494 +/* Audited by: green(2002.06.15) */
39495 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
39496 +{
39497 + reiser4_key from, to;
39498 +
39499 + /* AUDIT: How about putting an assertion here, what would check
39500 + all provided range is covered by tail items only? */
39501 + /* key of first byte in the range to be cut */
39502 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39503 +
39504 + /* key of last byte in that range */
39505 + to = from;
39506 + set_key_offset(&to, (__u64) (offset + count - 1));
39507 +
39508 + /* cut everything between those keys */
39509 + return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
39510 + inode, 0);
39511 +}
39512 +
39513 +static void release_all_pages(struct page **pages, unsigned nr_pages)
39514 +{
39515 + unsigned i;
39516 +
39517 + for (i = 0; i < nr_pages; i++) {
39518 + if (pages[i] == NULL) {
39519 + unsigned j;
39520 + for (j = i + 1; j < nr_pages; j++)
39521 + assert("vs-1620", pages[j] == NULL);
39522 + break;
39523 + }
39524 + page_cache_release(pages[i]);
39525 + pages[i] = NULL;
39526 + }
39527 +}
39528 +
39529 +/* part of tail2extent. replace tail items with extent one. Content of tail
39530 + items (@count bytes) being cut are copied already into
39531 + pages. extent_writepage method is called to create extents corresponding to
39532 + those pages */
39533 +static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
39534 +{
39535 + int result;
39536 + unsigned i;
39537 + STORE_COUNTERS;
39538 +
39539 + if (nr_pages == 0)
39540 + return 0;
39541 +
39542 + assert("vs-596", pages[0]);
39543 +
39544 + /* cut copied items */
39545 + result = cut_formatting_items(inode, page_offset(pages[0]), count);
39546 + if (result)
39547 + return result;
39548 +
39549 + CHECK_COUNTERS;
39550 +
39551 + /* put into tree replacement for just removed items: extent item, namely */
39552 + for (i = 0; i < nr_pages; i++) {
39553 + result = add_to_page_cache_lru(pages[i], inode->i_mapping,
39554 + pages[i]->index,
39555 + mapping_gfp_mask(inode->
39556 + i_mapping));
39557 + if (result)
39558 + break;
39559 + unlock_page(pages[i]);
39560 + result = find_or_create_extent(pages[i]);
39561 + if (result)
39562 + break;
39563 + SetPageUptodate(pages[i]);
39564 + }
39565 + return result;
39566 +}
39567 +
39568 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
39569 + * items */
39570 +
39571 +static int reserve_tail2extent_iteration(struct inode *inode)
39572 +{
39573 + reiser4_block_nr unformatted_nodes;
39574 + reiser4_tree *tree;
39575 +
39576 + tree = reiser4_tree_by_inode(inode);
39577 +
39578 + /* number of unformatted nodes which will be created */
39579 + unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
39580 +
39581 + /*
39582 + * space required for one iteration of extent->tail conversion:
39583 + *
39584 + * 1. kill N tail items
39585 + *
39586 + * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
39587 + *
39588 + * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
39589 + * extents) extent units.
39590 + *
39591 + * 4. drilling to the leaf level by coord_by_key()
39592 + *
39593 + * 5. possible update of stat-data
39594 + *
39595 + */
39596 + grab_space_enable();
39597 + return reiser4_grab_space
39598 + (2 * tree->height +
39599 + TAIL2EXTENT_PAGE_NUM +
39600 + TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
39601 + 1 + estimate_one_insert_item(tree) +
39602 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39603 +}
39604 +
39605 +/* clear stat data's flag indicating that file is being converted */
39606 +static int complete_conversion(struct inode *inode)
39607 +{
39608 + int result;
39609 +
39610 + grab_space_enable();
39611 + result =
39612 + reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
39613 + BA_CAN_COMMIT);
39614 + if (result == 0) {
39615 + reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
39616 + result = reiser4_update_sd(inode);
39617 + }
39618 + if (result)
39619 + warning("vs-1696", "Failed to clear converting bit of %llu: %i",
39620 + (unsigned long long)get_inode_oid(inode), result);
39621 + return 0;
39622 +}
39623 +
39624 +/**
39625 + * find_start
39626 + * @inode:
39627 + * @id:
39628 + * @offset:
39629 + *
39630 + * this is used by tail2extent and extent2tail to detect where previous
39631 + * uncompleted conversion stopped
39632 + */
39633 +static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
39634 +{
39635 + int result;
39636 + lock_handle lh;
39637 + coord_t coord;
39638 + unix_file_info_t *ufo;
39639 + int found;
39640 + reiser4_key key;
39641 +
39642 + ufo = unix_file_inode_data(inode);
39643 + init_lh(&lh);
39644 + result = 0;
39645 + found = 0;
39646 + inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
39647 + do {
39648 + init_lh(&lh);
39649 + result = find_file_item_nohint(&coord, &lh, &key,
39650 + ZNODE_READ_LOCK, inode);
39651 +
39652 + if (result == CBK_COORD_FOUND) {
39653 + if (coord.between == AT_UNIT) {
39654 + /*coord_clear_iplug(&coord); */
39655 + result = zload(coord.node);
39656 + if (result == 0) {
39657 + if (item_id_by_coord(&coord) == id)
39658 + found = 1;
39659 + else
39660 + item_plugin_by_coord(&coord)->s.
39661 + file.append_key(&coord,
39662 + &key);
39663 + zrelse(coord.node);
39664 + }
39665 + } else
39666 + result = RETERR(-ENOENT);
39667 + }
39668 + done_lh(&lh);
39669 + } while (result == 0 && !found);
39670 + *offset = get_key_offset(&key);
39671 + return result;
39672 +}
39673 +
39674 +/**
39675 + * tail2extent
39676 + * @uf_info:
39677 + *
39678 + *
39679 + */
39680 +int tail2extent(unix_file_info_t *uf_info)
39681 +{
39682 + int result;
39683 + reiser4_key key; /* key of next byte to be moved to page */
39684 + char *p_data; /* data of page */
39685 + unsigned page_off = 0, /* offset within the page where to copy data */
39686 + count; /* number of bytes of item which can be
39687 + * copied to page */
39688 + struct page *pages[TAIL2EXTENT_PAGE_NUM];
39689 + struct page *page;
39690 + int done; /* set to 1 when all file is read */
39691 + char *item;
39692 + int i;
39693 + struct inode *inode;
39694 + int first_iteration;
39695 + int bytes;
39696 + __u64 offset;
39697 +
39698 + assert("nikita-3362", ea_obtained(uf_info));
39699 + inode = unix_file_info_to_inode(uf_info);
39700 + assert("nikita-3412", !IS_RDONLY(inode));
39701 + assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
39702 + assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
39703 +
39704 + offset = 0;
39705 + first_iteration = 1;
39706 + result = 0;
39707 + if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
39708 + /*
39709 + * file is marked on disk as there was a conversion which did
39710 + * not complete due to either crash or some error. Find which
39711 + * offset tail conversion stopped at
39712 + */
39713 + result = find_start(inode, FORMATTING_ID, &offset);
39714 + if (result == -ENOENT) {
39715 + /* no tail items found, everything is converted */
39716 + uf_info->container = UF_CONTAINER_EXTENTS;
39717 + complete_conversion(inode);
39718 + return 0;
39719 + } else if (result != 0)
39720 + /* some other error */
39721 + return result;
39722 + first_iteration = 0;
39723 + }
39724 +
39725 + reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
39726 +
39727 + /* get key of first byte of a file */
39728 + inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
39729 +
39730 + done = 0;
39731 + while (done == 0) {
39732 + memset(pages, 0, sizeof(pages));
39733 + result = reserve_tail2extent_iteration(inode);
39734 + if (result != 0)
39735 + goto out;
39736 + if (first_iteration) {
39737 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
39738 + reiser4_update_sd(inode);
39739 + first_iteration = 0;
39740 + }
39741 + bytes = 0;
39742 + for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
39743 + assert("vs-598",
39744 + (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
39745 + page = alloc_page(reiser4_ctx_gfp_mask_get());
39746 + if (!page) {
39747 + result = RETERR(-ENOMEM);
39748 + goto error;
39749 + }
39750 +
39751 + page->index =
39752 + (unsigned long)(get_key_offset(&key) >>
39753 + PAGE_CACHE_SHIFT);
39754 + /*
39755 + * usually when one is going to longterm lock znode (as
39756 + * find_file_item does, for instance) he must not hold
39757 + * locked pages. However, there is an exception for
39758 + * case tail2extent. Pages appearing here are not
39759 + * reachable to everyone else, they are clean, they do
39760 + * not have jnodes attached so keeping them locked does
39761 + * not risk deadlock appearance
39762 + */
39763 + assert("vs-983", !PagePrivate(page));
39764 + reiser4_invalidate_pages(inode->i_mapping, page->index,
39765 + 1, 0);
39766 +
39767 + for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
39768 + coord_t coord;
39769 + lock_handle lh;
39770 +
39771 + /* get next item */
39772 + /* FIXME: we might want to readahead here */
39773 + init_lh(&lh);
39774 + result =
39775 + find_file_item_nohint(&coord, &lh, &key,
39776 + ZNODE_READ_LOCK,
39777 + inode);
39778 + if (result != CBK_COORD_FOUND) {
39779 + /*
39780 + * error happened or no items of file
39781 + * were found
39782 + */
39783 + done_lh(&lh);
39784 + page_cache_release(page);
39785 + goto error;
39786 + }
39787 +
39788 + if (coord.between == AFTER_UNIT) {
39789 + /*
39790 + * end of file is reached. Pad page
39791 + * with zeros
39792 + */
39793 + done_lh(&lh);
39794 + done = 1;
39795 + p_data = kmap_atomic(page, KM_USER0);
39796 + memset(p_data + page_off, 0,
39797 + PAGE_CACHE_SIZE - page_off);
39798 + kunmap_atomic(p_data, KM_USER0);
39799 + break;
39800 + }
39801 +
39802 + result = zload(coord.node);
39803 + if (result) {
39804 + page_cache_release(page);
39805 + done_lh(&lh);
39806 + goto error;
39807 + }
39808 + assert("vs-856", coord.between == AT_UNIT);
39809 + item = ((char *)item_body_by_coord(&coord)) +
39810 + coord.unit_pos;
39811 +
39812 + /* how many bytes to copy */
39813 + count =
39814 + item_length_by_coord(&coord) -
39815 + coord.unit_pos;
39816 + /* limit length of copy to end of page */
39817 + if (count > PAGE_CACHE_SIZE - page_off)
39818 + count = PAGE_CACHE_SIZE - page_off;
39819 +
39820 + /*
39821 + * copy item (as much as will fit starting from
39822 + * the beginning of the item) into the page
39823 + */
39824 + p_data = kmap_atomic(page, KM_USER0);
39825 + memcpy(p_data + page_off, item, count);
39826 + kunmap_atomic(p_data, KM_USER0);
39827 +
39828 + page_off += count;
39829 + bytes += count;
39830 + set_key_offset(&key,
39831 + get_key_offset(&key) + count);
39832 +
39833 + zrelse(coord.node);
39834 + done_lh(&lh);
39835 + } /* end of loop which fills one page by content of
39836 + * formatting items */
39837 +
39838 + if (page_off) {
39839 + /* something was copied into page */
39840 + pages[i] = page;
39841 + } else {
39842 + page_cache_release(page);
39843 + assert("vs-1648", done == 1);
39844 + break;
39845 + }
39846 + } /* end of loop through pages of one conversion iteration */
39847 +
39848 + if (i > 0) {
39849 + result = replace(inode, pages, i, bytes);
39850 + release_all_pages(pages, sizeof_array(pages));
39851 + if (result)
39852 + goto error;
39853 + /*
39854 + * We have to drop exclusive access to avoid deadlock
39855 + * which may happen because called by reiser4_writepages
39856 + * capture_unix_file requires to get non-exclusive
39857 + * access to a file. It is safe to drop EA in the middle
39858 + * of tail2extent conversion because write_unix_file,
39859 + * setattr_unix_file(truncate), mmap_unix_file,
39860 + * release_unix_file(extent2tail) checks if conversion
39861 + * is not in progress (see comments before
39862 + * get_exclusive_access_careful().
39863 + * Other processes that acquire non-exclusive access
39864 + * (read_unix_file, reiser4_writepages, etc) should work
39865 + * on partially converted files.
39866 + */
39867 + drop_exclusive_access(uf_info);
39868 + /* throttle the conversion */
39869 + reiser4_throttle_write(inode);
39870 + get_exclusive_access(uf_info);
39871 +
39872 + /*
39873 + * nobody is allowed to complete conversion but a
39874 + * process which started it
39875 + */
39876 + assert("", reiser4_inode_get_flag(inode,
39877 + REISER4_PART_MIXED));
39878 + }
39879 + }
39880 +
39881 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
39882 +
39883 + if (result == 0) {
39884 + /* file is converted to extent items */
39885 + assert("vs-1697", reiser4_inode_get_flag(inode,
39886 + REISER4_PART_MIXED));
39887 +
39888 + uf_info->container = UF_CONTAINER_EXTENTS;
39889 + complete_conversion(inode);
39890 + } else {
39891 + /*
39892 + * conversion is not complete. Inode was already marked as
39893 + * REISER4_PART_MIXED and stat-data were updated at the first
39894 + * iteration of the loop above.
39895 + */
39896 + error:
39897 + release_all_pages(pages, sizeof_array(pages));
39898 + warning("nikita-2282", "Partial conversion of %llu: %i",
39899 + (unsigned long long)get_inode_oid(inode), result);
39900 + }
39901 +
39902 + out:
39903 + return result;
39904 +}
39905 +
39906 +static int reserve_extent2tail_iteration(struct inode *inode)
39907 +{
39908 + reiser4_tree *tree;
39909 +
39910 + tree = reiser4_tree_by_inode(inode);
39911 + /*
39912 + * reserve blocks for (in this order):
39913 + *
39914 + * 1. removal of extent item
39915 + *
39916 + * 2. insertion of tail by insert_flow()
39917 + *
39918 + * 3. drilling to the leaf level by coord_by_key()
39919 + *
39920 + * 4. possible update of stat-data
39921 + */
39922 + grab_space_enable();
39923 + return reiser4_grab_space
39924 + (estimate_one_item_removal(tree) +
39925 + estimate_insert_flow(tree->height) +
39926 + 1 + estimate_one_insert_item(tree) +
39927 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39928 +}
39929 +
39930 +/* for every page of file: read page, cut part of extent pointing to this page,
39931 + put data of page tree by tail item */
39932 +int extent2tail(unix_file_info_t *uf_info)
39933 +{
39934 + int result;
39935 + struct inode *inode;
39936 + struct page *page;
39937 + unsigned long num_pages, i;
39938 + unsigned long start_page;
39939 + reiser4_key from;
39940 + reiser4_key to;
39941 + unsigned count;
39942 + __u64 offset;
39943 +
39944 + assert("nikita-3362", ea_obtained(uf_info));
39945 + inode = unix_file_info_to_inode(uf_info);
39946 + assert("nikita-3412", !IS_RDONLY(inode));
39947 + assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
39948 + assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
39949 +
39950 + offset = 0;
39951 + if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
39952 + /*
39953 + * file is marked on disk as there was a conversion which did
39954 + * not complete due to either crash or some error. Find which
39955 + * offset tail conversion stopped at
39956 + */
39957 + result = find_start(inode, EXTENT_POINTER_ID, &offset);
39958 + if (result == -ENOENT) {
39959 + /* no extent found, everything is converted */
39960 + uf_info->container = UF_CONTAINER_TAILS;
39961 + complete_conversion(inode);
39962 + return 0;
39963 + } else if (result != 0)
39964 + /* some other error */
39965 + return result;
39966 + }
39967 +
39968 + reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
39969 +
39970 + /* number of pages in the file */
39971 + num_pages =
39972 + (inode->i_size + - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
39973 + start_page = offset >> PAGE_CACHE_SHIFT;
39974 +
39975 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39976 + to = from;
39977 +
39978 + result = 0;
39979 + for (i = 0; i < num_pages; i++) {
39980 + __u64 start_byte;
39981 +
39982 + result = reserve_extent2tail_iteration(inode);
39983 + if (result != 0)
39984 + break;
39985 + if (i == 0 && offset == 0) {
39986 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
39987 + reiser4_update_sd(inode);
39988 + }
39989 +
39990 + page = read_mapping_page(inode->i_mapping,
39991 + (unsigned)(i + start_page), NULL);
39992 + if (IS_ERR(page)) {
39993 + result = PTR_ERR(page);
39994 + break;
39995 + }
39996 +
39997 + wait_on_page_locked(page);
39998 +
39999 + if (!PageUptodate(page)) {
40000 + page_cache_release(page);
40001 + result = RETERR(-EIO);
40002 + break;
40003 + }
40004 +
40005 + /* cut part of file we have read */
40006 + start_byte = (__u64) (i << PAGE_CACHE_SHIFT);
40007 + set_key_offset(&from, start_byte);
40008 + set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
40009 + /*
40010 + * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
40011 + * commits during over-long truncates. But
40012 + * extent->tail conversion should be performed in one
40013 + * transaction.
40014 + */
40015 + result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
40016 + &to, inode, 0);
40017 +
40018 + if (result) {
40019 + page_cache_release(page);
40020 + break;
40021 + }
40022 +
40023 + /* put page data into tree via tail_write */
40024 + count = PAGE_CACHE_SIZE;
40025 + if ((i == (num_pages - 1)) &&
40026 + (inode->i_size & ~PAGE_CACHE_MASK))
40027 + /* last page can be incomplete */
40028 + count = (inode->i_size & ~PAGE_CACHE_MASK);
40029 + while (count) {
40030 + struct dentry dentry;
40031 + struct file file;
40032 + loff_t pos;
40033 +
40034 + dentry.d_inode = inode;
40035 + file.f_dentry = &dentry;
40036 + file.private_data = NULL;
40037 + file.f_pos = start_byte;
40038 + file.private_data = NULL;
40039 + pos = start_byte;
40040 + result = reiser4_write_tail(&file,
40041 + (char __user *)kmap(page),
40042 + count, &pos);
40043 + reiser4_free_file_fsdata(&file);
40044 + if (result <= 0) {
40045 + warning("", "reiser4_write_tail failed");
40046 + page_cache_release(page);
40047 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
40048 + return result;
40049 + }
40050 + count -= result;
40051 + }
40052 +
40053 + /* release page */
40054 + lock_page(page);
40055 + /* page is already detached from jnode and mapping. */
40056 + assert("vs-1086", page->mapping == NULL);
40057 + assert("nikita-2690",
40058 + (!PagePrivate(page) && jprivate(page) == 0));
40059 + /* waiting for writeback completion with page lock held is
40060 + * perfectly valid. */
40061 + wait_on_page_writeback(page);
40062 + reiser4_drop_page(page);
40063 + /* release reference taken by read_cache_page() above */
40064 + page_cache_release(page);
40065 +
40066 + drop_exclusive_access(uf_info);
40067 + /* throttle the conversion */
40068 + reiser4_throttle_write(inode);
40069 + get_exclusive_access(uf_info);
40070 + /*
40071 + * nobody is allowed to complete conversion but a process which
40072 + * started it
40073 + */
40074 + assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
40075 + }
40076 +
40077 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
40078 +
40079 + if (i == num_pages) {
40080 + /* file is converted to formatted items */
40081 + assert("vs-1698", reiser4_inode_get_flag(inode,
40082 + REISER4_PART_MIXED));
40083 + assert("vs-1260",
40084 + inode_has_no_jnodes(reiser4_inode_data(inode)));
40085 +
40086 + uf_info->container = UF_CONTAINER_TAILS;
40087 + complete_conversion(inode);
40088 + return 0;
40089 + }
40090 + /*
40091 + * conversion is not complete. Inode was already marked as
40092 + * REISER4_PART_MIXED and stat-data were updated at the first
40093 + * iteration of the loop above.
40094 + */
40095 + warning("nikita-2282",
40096 + "Partial conversion of %llu: %lu of %lu: %i",
40097 + (unsigned long long)get_inode_oid(inode), i,
40098 + num_pages, result);
40099 +
40100 + return result;
40101 +}
40102 +
40103 +/*
40104 + * Local variables:
40105 + * c-indentation-style: "K&R"
40106 + * mode-name: "LC"
40107 + * c-basic-offset: 8
40108 + * tab-width: 8
40109 + * fill-column: 79
40110 + * scroll-step: 1
40111 + * End:
40112 + */
40113 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file_ops.c linux-2.6.20/fs/reiser4/plugin/file_ops.c
40114 --- linux-2.6.20.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 03:00:00.000000000 +0300
40115 +++ linux-2.6.20/fs/reiser4/plugin/file_ops.c 2007-05-06 14:50:43.787003221 +0400
40116 @@ -0,0 +1,168 @@
40117 +/* Copyright 2005 by Hans Reiser, licensing governed by
40118 + reiser4/README */
40119 +
40120 +/* this file contains typical implementations for some of methods of
40121 + struct file_operations and of struct address_space_operations
40122 +*/
40123 +
40124 +#include "../inode.h"
40125 +#include "object.h"
40126 +
40127 +/* file operations */
40128 +
40129 +/* implementation of vfs's llseek method of struct file_operations for
40130 + typical directory can be found in readdir_common.c
40131 +*/
40132 +loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
40133 +
40134 +/* implementation of vfs's readdir method of struct file_operations for
40135 + typical directory can be found in readdir_common.c
40136 +*/
40137 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
40138 +
40139 +/**
40140 + * reiser4_release_dir_common - release of struct file_operations
40141 + * @inode: inode of released file
40142 + * @file: file to release
40143 + *
40144 + * Implementation of release method of struct file_operations for typical
40145 + * directory. All it does is freeing of reiser4 specific file data.
40146 +*/
40147 +int reiser4_release_dir_common(struct inode *inode, struct file *file)
40148 +{
40149 + reiser4_context *ctx;
40150 +
40151 + ctx = reiser4_init_context(inode->i_sb);
40152 + if (IS_ERR(ctx))
40153 + return PTR_ERR(ctx);
40154 + reiser4_free_file_fsdata(file);
40155 + reiser4_exit_context(ctx);
40156 + return 0;
40157 +}
40158 +
40159 +/* this is common implementation of vfs's fsync method of struct
40160 + file_operations
40161 +*/
40162 +int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
40163 +{
40164 + reiser4_context *ctx;
40165 + int result;
40166 +
40167 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
40168 + if (IS_ERR(ctx))
40169 + return PTR_ERR(ctx);
40170 + result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
40171 +
40172 + context_set_commit_async(ctx);
40173 + reiser4_exit_context(ctx);
40174 + return result;
40175 +}
40176 +
40177 +/* this is common implementation of vfs's sendfile method of struct
40178 + file_operations
40179 +
40180 + Reads @count bytes from @file and calls @actor for every page read. This is
40181 + needed for loop back devices support.
40182 +*/
40183 +#if 0
40184 +ssize_t
40185 +sendfile_common(struct file *file, loff_t *ppos, size_t count,
40186 + read_actor_t actor, void *target)
40187 +{
40188 + reiser4_context *ctx;
40189 + ssize_t result;
40190 +
40191 + ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
40192 + if (IS_ERR(ctx))
40193 + return PTR_ERR(ctx);
40194 + result = generic_file_sendfile(file, ppos, count, actor, target);
40195 + reiser4_exit_context(ctx);
40196 + return result;
40197 +}
40198 +#endif /* 0 */
40199 +
40200 +/* address space operations */
40201 +
40202 +/* this is common implementation of vfs's prepare_write method of struct
40203 + address_space_operations
40204 +*/
40205 +int
40206 +prepare_write_common(struct file *file, struct page *page, unsigned from,
40207 + unsigned to)
40208 +{
40209 + reiser4_context *ctx;
40210 + int result;
40211 +
40212 + ctx = reiser4_init_context(page->mapping->host->i_sb);
40213 + result = do_prepare_write(file, page, from, to);
40214 +
40215 + /* don't commit transaction under inode semaphore */
40216 + context_set_commit_async(ctx);
40217 + reiser4_exit_context(ctx);
40218 +
40219 + return result;
40220 +}
40221 +
40222 +/* this is helper for prepare_write_common and prepare_write_unix_file
40223 + */
40224 +int
40225 +do_prepare_write(struct file *file, struct page *page, unsigned from,
40226 + unsigned to)
40227 +{
40228 + int result;
40229 + file_plugin *fplug;
40230 + struct inode *inode;
40231 +
40232 + assert("umka-3099", file != NULL);
40233 + assert("umka-3100", page != NULL);
40234 + assert("umka-3095", PageLocked(page));
40235 +
40236 + if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
40237 + return 0;
40238 +
40239 + inode = page->mapping->host;
40240 + fplug = inode_file_plugin(inode);
40241 +
40242 + if (page->mapping->a_ops->readpage == NULL)
40243 + return RETERR(-EINVAL);
40244 +
40245 + result = page->mapping->a_ops->readpage(file, page);
40246 + if (result != 0) {
40247 + SetPageError(page);
40248 + ClearPageUptodate(page);
40249 + /* All reiser4 readpage() implementations should return the
40250 + * page locked in case of error. */
40251 + assert("nikita-3472", PageLocked(page));
40252 + } else {
40253 + /*
40254 + * ->readpage() either:
40255 + *
40256 + * 1. starts IO against @page. @page is locked for IO in
40257 + * this case.
40258 + *
40259 + * 2. doesn't start IO. @page is unlocked.
40260 + *
40261 + * In either case, page should be locked.
40262 + */
40263 + lock_page(page);
40264 + /*
40265 + * IO (if any) is completed at this point. Check for IO
40266 + * errors.
40267 + */
40268 + if (!PageUptodate(page))
40269 + result = RETERR(-EIO);
40270 + }
40271 + assert("umka-3098", PageLocked(page));
40272 + return result;
40273 +}
40274 +
40275 +/*
40276 + * Local variables:
40277 + * c-indentation-style: "K&R"
40278 + * mode-name: "LC"
40279 + * c-basic-offset: 8
40280 + * tab-width: 8
40281 + * fill-column: 79
40282 + * scroll-step: 1
40283 + * End:
40284 + */
40285 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.20/fs/reiser4/plugin/file_ops_readdir.c
40286 --- linux-2.6.20.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 03:00:00.000000000 +0300
40287 +++ linux-2.6.20/fs/reiser4/plugin/file_ops_readdir.c 2007-05-06 14:50:43.791004471 +0400
40288 @@ -0,0 +1,657 @@
40289 +/* Copyright 2005 by Hans Reiser, licensing governed by
40290 + * reiser4/README */
40291 +
40292 +#include "../inode.h"
40293 +
40294 +/* return true, iff @coord points to the valid directory item that is part of
40295 + * @inode directory. */
40296 +static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
40297 +{
40298 + return plugin_of_group(item_plugin_by_coord(coord),
40299 + DIR_ENTRY_ITEM_TYPE) &&
40300 + inode_file_plugin(inode)->owns_item(inode, coord);
40301 +}
40302 +
40303 +/* compare two logical positions within the same directory */
40304 +static cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2)
40305 +{
40306 + cmp_t result;
40307 +
40308 + assert("nikita-2534", p1 != NULL);
40309 + assert("nikita-2535", p2 != NULL);
40310 +
40311 + result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
40312 + if (result == EQUAL_TO) {
40313 + int diff;
40314 +
40315 + diff = p1->pos - p2->pos;
40316 + result =
40317 + (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
40318 + }
40319 + return result;
40320 +}
40321 +
40322 +/* see comment before reiser4_readdir_common() for overview of why "adjustment" is
40323 + * necessary. */
40324 +static void
40325 +adjust_dir_pos(struct file *dir,
40326 + readdir_pos * readdir_spot, const dir_pos * mod_point, int adj)
40327 +{
40328 + dir_pos *pos;
40329 +
40330 + /*
40331 + * new directory entry was added (adj == +1) or removed (adj == -1) at
40332 + * the @mod_point. Directory file descriptor @dir is doing readdir and
40333 + * is currently positioned at @readdir_spot. Latter has to be updated
40334 + * to maintain stable readdir.
40335 + */
40336 + /* directory is positioned to the beginning. */
40337 + if (readdir_spot->entry_no == 0)
40338 + return;
40339 +
40340 + pos = &readdir_spot->position;
40341 + switch (dir_pos_cmp(mod_point, pos)) {
40342 + case LESS_THAN:
40343 + /* @mod_pos is _before_ @readdir_spot, that is, entry was
40344 + * added/removed on the left (in key order) of current
40345 + * position. */
40346 + /* logical number of directory entry readdir is "looking" at
40347 + * changes */
40348 + readdir_spot->entry_no += adj;
40349 + assert("nikita-2577",
40350 + ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
40351 + if (de_id_cmp(&pos->dir_entry_key,
40352 + &mod_point->dir_entry_key) == EQUAL_TO) {
40353 + assert("nikita-2575", mod_point->pos < pos->pos);
40354 + /*
40355 + * if entry added/removed has the same key as current
40356 + * for readdir, update counter of duplicate keys in
40357 + * @readdir_spot.
40358 + */
40359 + pos->pos += adj;
40360 + }
40361 + break;
40362 + case GREATER_THAN:
40363 + /* directory is modified after @pos: nothing to do. */
40364 + break;
40365 + case EQUAL_TO:
40366 + /* cannot insert an entry readdir is looking at, because it
40367 + already exists. */
40368 + assert("nikita-2576", adj < 0);
40369 + /* directory entry to which @pos points to is being
40370 + removed.
40371 +
40372 + NOTE-NIKITA: Right thing to do is to update @pos to point
40373 + to the next entry. This is complex (we are under spin-lock
40374 + for one thing). Just rewind it to the beginning. Next
40375 + readdir will have to scan the beginning of
40376 + directory. Proper solution is to use semaphore in
40377 + spin lock's stead and use rewind_right() here.
40378 +
40379 + NOTE-NIKITA: now, semaphore is used, so...
40380 + */
40381 + memset(readdir_spot, 0, sizeof *readdir_spot);
40382 + }
40383 +}
40384 +
40385 +/* scan all file-descriptors for this directory and adjust their
40386 + positions respectively. Should be used by implementations of
40387 + add_entry and rem_entry of dir plugin */
40388 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
40389 + int offset, int adj)
40390 +{
40391 + reiser4_file_fsdata *scan;
40392 + dir_pos mod_point;
40393 +
40394 + assert("nikita-2536", dir != NULL);
40395 + assert("nikita-2538", de != NULL);
40396 + assert("nikita-2539", adj != 0);
40397 +
40398 + build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
40399 + mod_point.pos = offset;
40400 +
40401 + spin_lock_inode(dir);
40402 +
40403 + /*
40404 + * new entry was added/removed in directory @dir. Scan all file
40405 + * descriptors for @dir that are currently involved into @readdir and
40406 + * update them.
40407 + */
40408 +
40409 + list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
40410 + adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
40411 +
40412 + spin_unlock_inode(dir);
40413 +}
40414 +
40415 +/*
40416 + * traverse tree to start/continue readdir from the readdir position @pos.
40417 + */
40418 +static int dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap)
40419 +{
40420 + reiser4_key key;
40421 + int result;
40422 + struct inode *inode;
40423 +
40424 + assert("nikita-2554", pos != NULL);
40425 +
40426 + inode = dir->f_dentry->d_inode;
40427 + result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
40428 + if (result != 0)
40429 + return result;
40430 + result = reiser4_object_lookup(inode,
40431 + &key,
40432 + tap->coord,
40433 + tap->lh,
40434 + tap->mode,
40435 + FIND_EXACT,
40436 + LEAF_LEVEL, LEAF_LEVEL,
40437 + 0, &tap->ra_info);
40438 + if (result == CBK_COORD_FOUND)
40439 + result = rewind_right(tap, (int)pos->position.pos);
40440 + else {
40441 + tap->coord->node = NULL;
40442 + done_lh(tap->lh);
40443 + result = RETERR(-EIO);
40444 + }
40445 + return result;
40446 +}
40447 +
40448 +/*
40449 + * handling of non-unique keys: calculate at what ordinal position within
40450 + * sequence of directory items with identical keys @pos is.
40451 + */
40452 +static int set_pos(struct inode *inode, readdir_pos * pos, tap_t * tap)
40453 +{
40454 + int result;
40455 + coord_t coord;
40456 + lock_handle lh;
40457 + tap_t scan;
40458 + de_id *did;
40459 + reiser4_key de_key;
40460 +
40461 + coord_init_zero(&coord);
40462 + init_lh(&lh);
40463 + reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
40464 + reiser4_tap_copy(&scan, tap);
40465 + reiser4_tap_load(&scan);
40466 + pos->position.pos = 0;
40467 +
40468 + did = &pos->position.dir_entry_key;
40469 +
40470 + if (is_valid_dir_coord(inode, scan.coord)) {
40471 +
40472 + build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
40473 +
40474 + while (1) {
40475 +
40476 + result = go_prev_unit(&scan);
40477 + if (result != 0)
40478 + break;
40479 +
40480 + if (!is_valid_dir_coord(inode, scan.coord)) {
40481 + result = -EINVAL;
40482 + break;
40483 + }
40484 +
40485 + /* get key of directory entry */
40486 + unit_key_by_coord(scan.coord, &de_key);
40487 + if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
40488 + /* duplicate-sequence is over */
40489 + break;
40490 + }
40491 + pos->position.pos++;
40492 + }
40493 + } else
40494 + result = RETERR(-ENOENT);
40495 + reiser4_tap_relse(&scan);
40496 + reiser4_tap_done(&scan);
40497 + return result;
40498 +}
40499 +
40500 +/*
40501 + * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
40502 + */
40503 +static int dir_rewind(struct file *dir, readdir_pos * pos, tap_t * tap)
40504 +{
40505 + __u64 destination;
40506 + __s64 shift;
40507 + int result;
40508 + struct inode *inode;
40509 + loff_t dirpos;
40510 +
40511 + assert("nikita-2553", dir != NULL);
40512 + assert("nikita-2548", pos != NULL);
40513 + assert("nikita-2551", tap->coord != NULL);
40514 + assert("nikita-2552", tap->lh != NULL);
40515 +
40516 + dirpos = reiser4_get_dir_fpos(dir);
40517 + shift = dirpos - pos->fpos;
40518 + /* this is logical directory entry within @dir which we are rewinding
40519 + * to */
40520 + destination = pos->entry_no + shift;
40521 +
40522 + inode = dir->f_dentry->d_inode;
40523 + if (dirpos < 0)
40524 + return RETERR(-EINVAL);
40525 + else if (destination == 0ll || dirpos == 0) {
40526 + /* rewind to the beginning of directory */
40527 + memset(pos, 0, sizeof *pos);
40528 + return dir_go_to(dir, pos, tap);
40529 + } else if (destination >= inode->i_size)
40530 + return RETERR(-ENOENT);
40531 +
40532 + if (shift < 0) {
40533 + /* I am afraid of negative numbers */
40534 + shift = -shift;
40535 + /* rewinding to the left */
40536 + if (shift <= (int)pos->position.pos) {
40537 + /* destination is within sequence of entries with
40538 + duplicate keys. */
40539 + result = dir_go_to(dir, pos, tap);
40540 + } else {
40541 + shift -= pos->position.pos;
40542 + while (1) {
40543 + /* repetitions: deadlock is possible when
40544 + going to the left. */
40545 + result = dir_go_to(dir, pos, tap);
40546 + if (result == 0) {
40547 + result = rewind_left(tap, shift);
40548 + if (result == -E_DEADLOCK) {
40549 + reiser4_tap_done(tap);
40550 + continue;
40551 + }
40552 + }
40553 + break;
40554 + }
40555 + }
40556 + } else {
40557 + /* rewinding to the right */
40558 + result = dir_go_to(dir, pos, tap);
40559 + if (result == 0)
40560 + result = rewind_right(tap, shift);
40561 + }
40562 + if (result == 0) {
40563 + result = set_pos(inode, pos, tap);
40564 + if (result == 0) {
40565 + /* update pos->position.pos */
40566 + pos->entry_no = destination;
40567 + pos->fpos = dirpos;
40568 + }
40569 + }
40570 + return result;
40571 +}
40572 +
40573 +/*
40574 + * Function that is called by common_readdir() on each directory entry while
40575 + * doing readdir. ->filldir callback may block, so we had to release long term
40576 + * lock while calling it. To avoid repeating tree traversal, seal is used. If
40577 + * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
40578 + *
40579 + * Whether node is unlocked in case of any other error is undefined. It is
40580 + * guaranteed to be still locked if success (0) is returned.
40581 + *
40582 + * When ->filldir() wants no more, feed_entry() returns 1, and node is
40583 + * unlocked.
40584 + */
40585 +static int
40586 +feed_entry(struct file *f,
40587 + readdir_pos * pos, tap_t * tap, filldir_t filldir, void *dirent)
40588 +{
40589 + item_plugin *iplug;
40590 + char *name;
40591 + reiser4_key sd_key;
40592 + int result;
40593 + char buf[DE_NAME_BUF_LEN];
40594 + char name_buf[32];
40595 + char *local_name;
40596 + unsigned file_type;
40597 + seal_t seal;
40598 + coord_t *coord;
40599 + reiser4_key entry_key;
40600 +
40601 + coord = tap->coord;
40602 + iplug = item_plugin_by_coord(coord);
40603 +
40604 + /* pointer to name within the node */
40605 + name = iplug->s.dir.extract_name(coord, buf);
40606 + assert("nikita-1371", name != NULL);
40607 +
40608 + /* key of object the entry points to */
40609 + if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
40610 + return RETERR(-EIO);
40611 +
40612 + /* we must release longterm znode lock before calling filldir to avoid
40613 + deadlock which may happen if filldir causes page fault. So, copy
40614 + name to intermediate buffer */
40615 + if (strlen(name) + 1 > sizeof(name_buf)) {
40616 + local_name = kmalloc(strlen(name) + 1,
40617 + reiser4_ctx_gfp_mask_get());
40618 + if (local_name == NULL)
40619 + return RETERR(-ENOMEM);
40620 + } else
40621 + local_name = name_buf;
40622 +
40623 + strcpy(local_name, name);
40624 + file_type = iplug->s.dir.extract_file_type(coord);
40625 +
40626 + unit_key_by_coord(coord, &entry_key);
40627 + reiser4_seal_init(&seal, coord, &entry_key);
40628 +
40629 + longterm_unlock_znode(tap->lh);
40630 +
40631 + /*
40632 + * send information about directory entry to the ->filldir() filler
40633 + * supplied to us by caller (VFS).
40634 + *
40635 + * ->filldir is entitled to do weird things. For example, ->filldir
40636 + * supplied by knfsd re-enters file system. Make sure no locks are
40637 + * held.
40638 + */
40639 + assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
40640 +
40641 + reiser4_txn_restart_current();
40642 + result = filldir(dirent, name, (int)strlen(name),
40643 + /* offset of this entry */
40644 + f->f_pos,
40645 + /* inode number of object bounden by this entry */
40646 + oid_to_uino(get_key_objectid(&sd_key)), file_type);
40647 + if (local_name != name_buf)
40648 + kfree(local_name);
40649 + if (result < 0)
40650 + /* ->filldir() is satisfied. (no space in buffer, IOW) */
40651 + result = 1;
40652 + else
40653 + result = reiser4_seal_validate(&seal, coord, &entry_key,
40654 + tap->lh, tap->mode,
40655 + ZNODE_LOCK_HIPRI);
40656 + return result;
40657 +}
40658 +
40659 +static void move_entry(readdir_pos * pos, coord_t * coord)
40660 +{
40661 + reiser4_key de_key;
40662 + de_id *did;
40663 +
40664 + /* update @pos */
40665 + ++pos->entry_no;
40666 + did = &pos->position.dir_entry_key;
40667 +
40668 + /* get key of directory entry */
40669 + unit_key_by_coord(coord, &de_key);
40670 +
40671 + if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
40672 + /* we are within sequence of directory entries
40673 + with duplicate keys. */
40674 + ++pos->position.pos;
40675 + else {
40676 + pos->position.pos = 0;
40677 + build_de_id_by_key(&de_key, did);
40678 + }
40679 + ++pos->fpos;
40680 +}
40681 +
40682 +/*
40683 + * STATELESS READDIR
40684 + *
40685 + * readdir support in reiser4 relies on ability to update readdir_pos embedded
40686 + * into reiser4_file_fsdata on each directory modification (name insertion and
40687 + * removal), see reiser4_readdir_common() function below. This obviously doesn't
40688 + * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
40689 + * across client READDIR requests for the same directory.
40690 + *
40691 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
40692 + * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
40693 + * find detached reiser4_file_fsdata corresponding to previous readdir
40694 + * request. In other words, additional state is maintained on the
40695 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
40696 + *
40697 + * To efficiently detect when our ->readdir() method is called by NFS server,
40698 + * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
40699 + * file_is_stateless() function).
40700 + *
40701 + * To find out d_cursor in the pool, we encode client id (cid) in the highest
40702 + * bits of NFS readdir cookie: when first readdir request comes to the given
40703 + * directory from the given client, cookie is set to 0. This situation is
40704 + * detected, global cid_counter is incremented, and stored in highest bits of
40705 + * all direntry offsets returned to the client, including last one. As the
40706 + * only valid readdir cookie is one obtained as direntry->offset, we are
40707 + * guaranteed that next readdir request (continuing current one) will have
40708 + * current cid in the highest bits of starting readdir cookie. All d_cursors
40709 + * are hashed into per-super-block hash table by (oid, cid) key.
40710 + *
40711 + * In addition d_cursors are placed into per-super-block radix tree where they
40712 + * are keyed by oid alone. This is necessary to efficiently remove them during
40713 + * rmdir.
40714 + *
40715 + * At last, currently unused d_cursors are linked into special list. This list
40716 + is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
40717 + *
40718 + */
40719 +
40720 +/*
40721 + * prepare for readdir.
40722 + */
40723 +static int dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos)
40724 +{
40725 + struct inode *inode;
40726 + reiser4_file_fsdata *fsdata;
40727 + int result;
40728 +
40729 + assert("nikita-1359", f != NULL);
40730 + inode = f->f_dentry->d_inode;
40731 + assert("nikita-1360", inode != NULL);
40732 +
40733 + if (!S_ISDIR(inode->i_mode))
40734 + return RETERR(-ENOTDIR);
40735 +
40736 + /* try to find detached readdir state */
40737 + result = reiser4_attach_fsdata(f, inode);
40738 + if (result != 0)
40739 + return result;
40740 +
40741 + fsdata = reiser4_get_file_fsdata(f);
40742 + assert("nikita-2571", fsdata != NULL);
40743 + if (IS_ERR(fsdata))
40744 + return PTR_ERR(fsdata);
40745 +
40746 + /* add file descriptor to the readdir list hanging of directory
40747 + * inode. This list is used to scan "readdirs-in-progress" while
40748 + * inserting or removing names in the directory. */
40749 + spin_lock_inode(inode);
40750 + if (list_empty_careful(&fsdata->dir.linkage))
40751 + list_add(&fsdata->dir.linkage, get_readdir_list(inode));
40752 + *pos = &fsdata->dir.readdir;
40753 + spin_unlock_inode(inode);
40754 +
40755 + /* move @tap to the current position */
40756 + return dir_rewind(f, *pos, tap);
40757 +}
40758 +
40759 +/* this is implementation of vfs's llseek method of struct file_operations for
40760 + typical directory
40761 + See comment before reiser4_readdir_common() for explanation.
40762 +*/
40763 +loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
40764 +{
40765 + reiser4_context *ctx;
40766 + loff_t result;
40767 + struct inode *inode;
40768 +
40769 + inode = file->f_dentry->d_inode;
40770 +
40771 + ctx = reiser4_init_context(inode->i_sb);
40772 + if (IS_ERR(ctx))
40773 + return PTR_ERR(ctx);
40774 +
40775 + mutex_lock(&inode->i_mutex);
40776 +
40777 + /* update ->f_pos */
40778 + result = default_llseek(file, off, origin);
40779 + if (result >= 0) {
40780 + int ff;
40781 + coord_t coord;
40782 + lock_handle lh;
40783 + tap_t tap;
40784 + readdir_pos *pos;
40785 +
40786 + coord_init_zero(&coord);
40787 + init_lh(&lh);
40788 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
40789 +
40790 + ff = dir_readdir_init(file, &tap, &pos);
40791 + reiser4_detach_fsdata(file);
40792 + if (ff != 0)
40793 + result = (loff_t) ff;
40794 + reiser4_tap_done(&tap);
40795 + }
40796 + reiser4_detach_fsdata(file);
40797 + mutex_unlock(&inode->i_mutex);
40798 +
40799 + reiser4_exit_context(ctx);
40800 + return result;
40801 +}
40802 +
40803 +/* this is common implementation of vfs's readdir method of struct
40804 + file_operations
40805 +
40806 + readdir problems:
40807 +
40808 + readdir(2)/getdents(2) interface is based on implicit assumption that
40809 + readdir can be restarted from any particular point by supplying file system
40810 + with off_t-full of data. That is, file system fills ->d_off field in struct
40811 + dirent and later user passes ->d_off to the seekdir(3), which is, actually,
40812 + implemented by glibc as lseek(2) on directory.
40813 +
40814 + Reiser4 cannot restart readdir from 64 bits of data, because two last
40815 + components of the key of directory entry are unknown, which given 128 bits:
40816 + locality and type fields in the key of directory entry are always known, to
40817 + start readdir() from given point objectid and offset fields have to be
40818 + filled.
40819 +
40820 + Traditional UNIX API for scanning through directory
40821 + (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based on the
40822 + assumption that directory is structured very much like regular file, in
40823 + particular, it is implied that each name within given directory (directory
40824 + entry) can be uniquely identified by scalar offset and that such offset is
40825 + stable across the life-time of the name it identifies.
40826 +
40827 + This is manifestly not so for reiser4. In reiser4 the only stable unique
40828 + identifier for the directory entry is its key that doesn't fit into
40829 + seekdir/telldir API.
40830 +
40831 + solution:
40832 +
40833 + Within each file descriptor participating in readdir-ing of directory
40834 + plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
40835 + the "current" directory entry that file descriptor looks at. It contains a
40836 + key of directory entry (plus some additional info to deal with non-unique
40837 + keys that we wouldn't dwell onto here) and a logical position of this
40838 + directory entry starting from the beginning of the directory, that is
40839 + ordinal number of this entry in the readdir order.
40840 +
40841 + Obviously this logical position is not stable in the face of directory
40842 + modifications. To work around this, on each addition or removal of directory
40843 + entry all file descriptors for directory inode are scanned and their
40844 + readdir_pos are updated accordingly (adjust_dir_pos()).
40845 +*/
40846 +int reiser4_readdir_common(struct file *f /* directory file being read */,
40847 + void *dirent /* opaque data passed to us by VFS */,
40848 + filldir_t filld /* filler function passed to us
40849 + * by VFS */)
40850 +{
40851 + reiser4_context *ctx;
40852 + int result;
40853 + struct inode *inode;
40854 + coord_t coord;
40855 + lock_handle lh;
40856 + tap_t tap;
40857 + readdir_pos *pos;
40858 +
40859 + assert("nikita-1359", f != NULL);
40860 + inode = f->f_dentry->d_inode;
40861 + assert("nikita-1360", inode != NULL);
40862 +
40863 + if (!S_ISDIR(inode->i_mode))
40864 + return RETERR(-ENOTDIR);
40865 +
40866 + ctx = reiser4_init_context(inode->i_sb);
40867 + if (IS_ERR(ctx))
40868 + return PTR_ERR(ctx);
40869 +
40870 + coord_init_zero(&coord);
40871 + init_lh(&lh);
40872 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
40873 +
40874 + reiser4_readdir_readahead_init(inode, &tap);
40875 +
40876 + repeat:
40877 + result = dir_readdir_init(f, &tap, &pos);
40878 + if (result == 0) {
40879 + result = reiser4_tap_load(&tap);
40880 + /* scan entries one by one feeding them to @filld */
40881 + while (result == 0) {
40882 + coord_t *coord;
40883 +
40884 + coord = tap.coord;
40885 + assert("nikita-2572", coord_is_existing_unit(coord));
40886 + assert("nikita-3227", is_valid_dir_coord(inode, coord));
40887 +
40888 + result = feed_entry(f, pos, &tap, filld, dirent);
40889 + if (result > 0) {
40890 + break;
40891 + } else if (result == 0) {
40892 + ++f->f_pos;
40893 + result = go_next_unit(&tap);
40894 + if (result == -E_NO_NEIGHBOR ||
40895 + result == -ENOENT) {
40896 + result = 0;
40897 + break;
40898 + } else if (result == 0) {
40899 + if (is_valid_dir_coord(inode, coord))
40900 + move_entry(pos, coord);
40901 + else
40902 + break;
40903 + }
40904 + } else if (result == -E_REPEAT) {
40905 + /* feed_entry() had to restart. */
40906 + ++f->f_pos;
40907 + reiser4_tap_relse(&tap);
40908 + goto repeat;
40909 + } else
40910 + warning("vs-1617",
40911 + "reiser4_readdir_common: unexpected error %d",
40912 + result);
40913 + }
40914 + reiser4_tap_relse(&tap);
40915 +
40916 + if (result >= 0)
40917 + f->f_version = inode->i_version;
40918 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
40919 + result = 0;
40920 + reiser4_tap_done(&tap);
40921 + reiser4_detach_fsdata(f);
40922 +
40923 + /* try to update directory's atime */
40924 + if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
40925 + BA_CAN_COMMIT) != 0)
40926 + warning("", "failed to update atime on readdir: %llu",
40927 + get_inode_oid(inode));
40928 + else
40929 + file_accessed(f);
40930 +
40931 + context_set_commit_async(ctx);
40932 + reiser4_exit_context(ctx);
40933 +
40934 + return (result <= 0) ? result : 0;
40935 +}
40936 +
40937 +/*
40938 + * Local variables:
40939 + * c-indentation-style: "K&R"
40940 + * mode-name: "LC"
40941 + * c-basic-offset: 8
40942 + * tab-width: 8
40943 + * fill-column: 79
40944 + * End:
40945 + */
40946 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.20/fs/reiser4/plugin/file_plugin_common.c
40947 --- linux-2.6.20.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
40948 +++ linux-2.6.20/fs/reiser4/plugin/file_plugin_common.c 2007-05-06 14:50:43.791004471 +0400
40949 @@ -0,0 +1,1007 @@
40950 +/* Copyright 2005 by Hans Reiser, licensing governed by
40951 + reiser4/README */
40952 +
40953 +/* this file contains typical implementations for most of methods of
40954 + file plugin
40955 +*/
40956 +
40957 +#include "../inode.h"
40958 +#include "object.h"
40959 +#include "../safe_link.h"
40960 +
40961 +#include <linux/quotaops.h>
40962 +
40963 +static int insert_new_sd(struct inode *inode);
40964 +static int update_sd(struct inode *inode);
40965 +
40966 +/* this is common implementation of write_sd_by_inode method of file plugin
40967 + either insert stat data or update it
40968 + */
40969 +int write_sd_by_inode_common(struct inode *inode /* object to save */ )
40970 +{
40971 + int result;
40972 +
40973 + assert("nikita-730", inode != NULL);
40974 +
40975 + if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
40976 + /* object doesn't have stat-data yet */
40977 + result = insert_new_sd(inode);
40978 + else
40979 + result = update_sd(inode);
40980 + if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
40981 + /* Don't issue warnings about "name is too long" */
40982 + warning("nikita-2221", "Failed to save sd for %llu: %i",
40983 + (unsigned long long)get_inode_oid(inode), result);
40984 + return result;
40985 +}
40986 +
40987 +/* this is common implementation of key_by_inode method of file plugin
40988 + */
40989 +int
40990 +key_by_inode_and_offset_common(struct inode *inode, loff_t off,
40991 + reiser4_key * key)
40992 +{
40993 + reiser4_key_init(key);
40994 + set_key_locality(key, reiser4_inode_data(inode)->locality_id);
40995 + set_key_ordering(key, get_inode_ordering(inode));
40996 + set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
40997 + set_key_type(key, KEY_BODY_MINOR);
40998 + set_key_offset(key, (__u64) off);
40999 + return 0;
41000 +}
41001 +
41002 +/* this is common implementation of set_plug_in_inode method of file plugin
41003 + */
41004 +int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
41005 + struct inode *parent /* parent object */ ,
41006 + reiser4_object_create_data * data /* creational
41007 + * data */ )
41008 +{
41009 + __u64 mask;
41010 +
41011 + object->i_mode = data->mode;
41012 + /* this should be plugin decision */
41013 + object->i_uid = current->fsuid;
41014 + object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
41015 +
41016 + /* support for BSD style group-id assignment. See mount's manual page
41017 + description of bsdgroups ext2 mount options for more details */
41018 + if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
41019 + object->i_gid = parent->i_gid;
41020 + else if (parent->i_mode & S_ISGID) {
41021 + /* parent directory has sguid bit */
41022 + object->i_gid = parent->i_gid;
41023 + if (S_ISDIR(object->i_mode))
41024 + /* sguid is inherited by sub-directories */
41025 + object->i_mode |= S_ISGID;
41026 + } else
41027 + object->i_gid = current->fsgid;
41028 +
41029 + /* this object doesn't have stat-data yet */
41030 + reiser4_inode_set_flag(object, REISER4_NO_SD);
41031 +#if 0
41032 + /* this is now called after all inode plugins are initialized:
41033 + do_create_vfs_child after adjust_to_parent */
41034 + /* setup inode and file-operations for this inode */
41035 + setup_inode_ops(object, data);
41036 +#endif
41037 + object->i_nlink = 0;
41038 + reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
41039 + mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
41040 + if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
41041 + mask |= (1 << LARGE_TIMES_STAT);
41042 +
41043 + reiser4_inode_data(object)->extmask = mask;
41044 + return 0;
41045 +}
41046 +
41047 +/* this is common implementation of adjust_to_parent method of file plugin for
41048 + regular files
41049 + */
41050 +int adjust_to_parent_common(struct inode *object /* new object */ ,
41051 + struct inode *parent /* parent directory */ ,
41052 + struct inode *root /* root directory */ )
41053 +{
41054 + assert("nikita-2165", object != NULL);
41055 + if (parent == NULL)
41056 + parent = root;
41057 + assert("nikita-2069", parent != NULL);
41058 +
41059 + /*
41060 + * inherit missing plugins from parent
41061 + */
41062 +
41063 + grab_plugin_pset(object, parent, PSET_FILE);
41064 + grab_plugin_pset(object, parent, PSET_SD);
41065 + grab_plugin_pset(object, parent, PSET_FORMATTING);
41066 + grab_plugin_pset(object, parent, PSET_PERM);
41067 + return 0;
41068 +}
41069 +
41070 +/* this is common implementation of adjust_to_parent method of file plugin for
41071 + typical directories
41072 + */
41073 +int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
41074 + struct inode *parent /* parent directory */ ,
41075 + struct inode *root /* root directory */ )
41076 +{
41077 + int result = 0;
41078 + pset_member memb;
41079 +
41080 + assert("nikita-2166", object != NULL);
41081 + if (parent == NULL)
41082 + parent = root;
41083 + assert("nikita-2167", parent != NULL);
41084 +
41085 + /*
41086 + * inherit missing plugins from parent
41087 + */
41088 + for (memb = 0; memb < PSET_LAST; ++memb) {
41089 + result = grab_plugin_pset(object, parent, memb);
41090 + if (result != 0)
41091 + break;
41092 + }
41093 + return result;
41094 +}
41095 +
41096 +int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
41097 + struct inode *parent /* parent directory */,
41098 + struct inode *root /* root directory */)
41099 +{
41100 + int result;
41101 + result = adjust_to_parent_common(object, parent, root);
41102 + if (result)
41103 + return result;
41104 + assert("edward-1416", parent != NULL);
41105 +
41106 + grab_plugin_pset(object, parent, PSET_CLUSTER);
41107 + grab_plugin_pset(object, parent, PSET_CIPHER);
41108 + grab_plugin_pset(object, parent, PSET_DIGEST);
41109 + grab_plugin_pset(object, parent, PSET_COMPRESSION);
41110 + grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
41111 +
41112 + return 0;
41113 +}
41114 +
41115 +/* this is common implementation of create_object method of file plugin
41116 + */
41117 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
41118 + reiser4_object_create_data * data)
41119 +{
41120 + reiser4_block_nr reserve;
41121 + assert("nikita-744", object != NULL);
41122 + assert("nikita-745", parent != NULL);
41123 + assert("nikita-747", data != NULL);
41124 + assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
41125 +
41126 + reserve = estimate_create_common(object);
41127 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
41128 + return RETERR(-ENOSPC);
41129 + return write_sd_by_inode_common(object);
41130 +}
41131 +
41132 +static int common_object_delete_no_reserve(struct inode *inode);
41133 +
41134 +/**
41135 + * reiser4_delete_object_common - delete_object of file_plugin
41136 + * @inode: inode to be deleted
41137 + *
41138 + * This is common implementation of delete_object method of file_plugin. It
41139 + * applies to object its deletion consists of removing two items - stat data
41140 + * and safe-link.
41141 + */
41142 +int reiser4_delete_object_common(struct inode *inode)
41143 +{
41144 + int result;
41145 +
41146 + assert("nikita-1477", inode != NULL);
41147 + /* FIXME: if file body deletion failed (i/o error, for instance),
41148 + inode->i_size can be != 0 here */
41149 + assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
41150 + assert("nikita-3421", inode->i_nlink == 0);
41151 +
41152 + if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
41153 + reiser4_block_nr reserve;
41154 +
41155 + /* grab space which is needed to remove 2 items from the tree:
41156 + stat data and safe-link */
41157 + reserve = 2 *
41158 + estimate_one_item_removal(reiser4_tree_by_inode(inode));
41159 + if (reiser4_grab_space_force(reserve,
41160 + BA_RESERVED | BA_CAN_COMMIT))
41161 + return RETERR(-ENOSPC);
41162 + result = common_object_delete_no_reserve(inode);
41163 + } else
41164 + result = 0;
41165 + return result;
41166 +}
41167 +
41168 +/**
41169 + * reiser4_delete_dir_common - delete_object of file_plugin
41170 + * @inode: inode to be deleted
41171 + *
41172 + * This is common implementation of delete_object method of file_plugin for
41173 + * typical directory. It calls done method of dir_plugin to remove "." and
41174 + * removes stat data and safe-link.
41175 + */
41176 +int reiser4_delete_dir_common(struct inode *inode)
41177 +{
41178 + int result;
41179 + dir_plugin *dplug;
41180 +
41181 + assert("", (get_current_context() &&
41182 + get_current_context()->trans->atom == NULL));
41183 +
41184 + dplug = inode_dir_plugin(inode);
41185 + assert("vs-1101", dplug && dplug->done);
41186 +
41187 + /* kill cursors which might be attached to inode */
41188 + reiser4_kill_cursors(inode);
41189 +
41190 + /* grab space enough for removing two items */
41191 + if (reiser4_grab_space
41192 + (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
41193 + BA_RESERVED | BA_CAN_COMMIT))
41194 + return RETERR(-ENOSPC);
41195 +
41196 + result = dplug->done(inode);
41197 + if (!result)
41198 + result = common_object_delete_no_reserve(inode);
41199 + return result;
41200 +}
41201 +
41202 +/* this is common implementation of add_link method of file plugin
41203 + */
41204 +int reiser4_add_link_common(struct inode *object, struct inode *parent)
41205 +{
41206 + /*
41207 + * increment ->i_nlink and update ->i_ctime
41208 + */
41209 +
41210 + INODE_INC_FIELD(object, i_nlink);
41211 + object->i_ctime = CURRENT_TIME;
41212 + return 0;
41213 +}
41214 +
41215 +/* this is common implementation of rem_link method of file plugin
41216 + */
41217 +int reiser4_rem_link_common(struct inode *object, struct inode *parent)
41218 +{
41219 + assert("nikita-2021", object != NULL);
41220 + assert("nikita-2163", object->i_nlink > 0);
41221 +
41222 + /*
41223 + * decrement ->i_nlink and update ->i_ctime
41224 + */
41225 +
41226 + INODE_DEC_FIELD(object, i_nlink);
41227 + object->i_ctime = CURRENT_TIME;
41228 + return 0;
41229 +}
41230 +
41231 +/* this is common implementation of rem_link method of file plugin for typical
41232 + directory
41233 +*/
41234 +int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
41235 +{
41236 + assert("nikita-20211", object != NULL);
41237 + assert("nikita-21631", object->i_nlink > 0);
41238 +
41239 + /*
41240 + * decrement ->i_nlink and update ->i_ctime
41241 + */
41242 + INODE_DEC_FIELD(object, i_nlink);
41243 + if (object->i_nlink == 1)
41244 + INODE_DEC_FIELD(object, i_nlink);
41245 + object->i_ctime = CURRENT_TIME;
41246 + return 0;
41247 +}
41248 +
41249 +/* this is common implementation of owns_item method of file plugin
41250 + compare objectids of keys in inode and coord */
41251 +int owns_item_common(const struct inode *inode, /* object to check
41252 + * against */
41253 + const coord_t * coord /* coord to check */ )
41254 +{
41255 + reiser4_key item_key;
41256 + reiser4_key file_key;
41257 +
41258 + assert("nikita-760", inode != NULL);
41259 + assert("nikita-761", coord != NULL);
41260 +
41261 + return coord_is_existing_item(coord) &&
41262 + (get_key_objectid(build_sd_key(inode, &file_key)) ==
41263 + get_key_objectid(item_key_by_coord(coord, &item_key)));
41264 +}
41265 +
41266 +/* this is common implementation of owns_item method of file plugin
41267 + for typical directory
41268 +*/
41269 +int owns_item_common_dir(const struct inode *inode, /* object to check against */
41270 + const coord_t * coord /* coord of item to check */ )
41271 +{
41272 + reiser4_key item_key;
41273 +
41274 + assert("nikita-1335", inode != NULL);
41275 + assert("nikita-1334", coord != NULL);
41276 +
41277 + if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
41278 + return get_key_locality(item_key_by_coord(coord, &item_key)) ==
41279 + get_inode_oid(inode);
41280 + else
41281 + return owns_item_common(inode, coord);
41282 +}
41283 +
41284 +/* this is common implementation of can_add_link method of file plugin
41285 + checks whether yet another hard links to this object can be added
41286 +*/
41287 +int can_add_link_common(const struct inode *object /* object to check */ )
41288 +{
41289 + assert("nikita-732", object != NULL);
41290 +
41291 + /* inode->i_nlink is unsigned int, so just check for integer
41292 + overflow */
41293 + return object->i_nlink + 1 != 0;
41294 +}
41295 +
41296 +/* this is common implementation of can_rem_link method of file plugin for
41297 + typical directory
41298 +*/
41299 +int can_rem_link_common_dir(const struct inode *inode)
41300 +{
41301 + /* is_dir_empty() returns 0 if dir is empty */
41302 + return !is_dir_empty(inode);
41303 +}
41304 +
41305 +/* this is common implementation of detach method of file plugin for typical
41306 + directory
41307 +*/
41308 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
41309 +{
41310 + dir_plugin *dplug;
41311 +
41312 + dplug = inode_dir_plugin(child);
41313 + assert("nikita-2883", dplug != NULL);
41314 + assert("nikita-2884", dplug->detach != NULL);
41315 + return dplug->detach(child, parent);
41316 +}
41317 +
41318 +/* this is common implementation of bind method of file plugin for typical
41319 + directory
41320 +*/
41321 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
41322 +{
41323 + dir_plugin *dplug;
41324 +
41325 + dplug = inode_dir_plugin(child);
41326 + assert("nikita-2646", dplug != NULL);
41327 + return dplug->attach(child, parent);
41328 +}
41329 +
41330 +static int process_truncate(struct inode *, __u64 size);
41331 +
41332 +/* this is common implementation of safelink method of file plugin
41333 + */
41334 +int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
41335 +{
41336 + int result;
41337 +
41338 + assert("vs-1705", get_current_context()->trans->atom == NULL);
41339 + if (link == SAFE_UNLINK)
41340 + /* nothing to do. iput() in the caller (process_safelink) will
41341 + * finish with file */
41342 + result = 0;
41343 + else if (link == SAFE_TRUNCATE)
41344 + result = process_truncate(object, value);
41345 + else {
41346 + warning("nikita-3438", "Unrecognized safe-link type: %i", link);
41347 + result = RETERR(-EIO);
41348 + }
41349 + return result;
41350 +}
41351 +
41352 +/* this is common implementation of estimate.create method of file plugin
41353 + can be used when object creation involves insertion of one item (usually stat
41354 + data) into tree
41355 +*/
41356 +reiser4_block_nr estimate_create_common(const struct inode * object)
41357 +{
41358 + return estimate_one_insert_item(reiser4_tree_by_inode(object));
41359 +}
41360 +
41361 +/* this is common implementation of estimate.create method of file plugin for
41362 + typical directory
41363 + can be used when directory creation involves insertion of two items (usually
41364 + stat data and item containing "." and "..") into tree
41365 +*/
41366 +reiser4_block_nr estimate_create_common_dir(const struct inode * object)
41367 +{
41368 + return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
41369 +}
41370 +
41371 +/* this is common implementation of estimate.update method of file plugin
41372 + can be used when stat data update does not do more than inserting a unit
41373 + into a stat data item which is probably true for most cases
41374 +*/
41375 +reiser4_block_nr estimate_update_common(const struct inode * inode)
41376 +{
41377 + return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
41378 +}
41379 +
41380 +/* this is common implementation of estimate.unlink method of file plugin
41381 + */
41382 +reiser4_block_nr
41383 +estimate_unlink_common(const struct inode * object UNUSED_ARG,
41384 + const struct inode * parent UNUSED_ARG)
41385 +{
41386 + return 0;
41387 +}
41388 +
41389 +/* this is common implementation of estimate.unlink method of file plugin for
41390 + typical directory
41391 +*/
41392 +reiser4_block_nr
41393 +estimate_unlink_common_dir(const struct inode * object,
41394 + const struct inode * parent)
41395 +{
41396 + dir_plugin *dplug;
41397 +
41398 + dplug = inode_dir_plugin(object);
41399 + assert("nikita-2888", dplug != NULL);
41400 + assert("nikita-2887", dplug->estimate.unlink != NULL);
41401 + return dplug->estimate.unlink(object, parent);
41402 +}
41403 +
41404 +char *wire_write_common(struct inode *inode, char *start)
41405 +{
41406 + return build_inode_onwire(inode, start);
41407 +}
41408 +
41409 +char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
41410 +{
41411 + return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
41412 +}
41413 +
41414 +struct dentry *wire_get_common(struct super_block *sb,
41415 + reiser4_object_on_wire * obj)
41416 +{
41417 + struct inode *inode;
41418 + struct dentry *dentry;
41419 + reiser4_key key;
41420 +
41421 + extract_key_from_id(&obj->u.std.key_id, &key);
41422 + inode = reiser4_iget(sb, &key, 1);
41423 + if (!IS_ERR(inode)) {
41424 + reiser4_iget_complete(inode);
41425 + dentry = d_alloc_anon(inode);
41426 + if (dentry == NULL) {
41427 + iput(inode);
41428 + dentry = ERR_PTR(-ENOMEM);
41429 + } else
41430 + dentry->d_op = &get_super_private(sb)->ops.dentry;
41431 + } else if (PTR_ERR(inode) == -ENOENT)
41432 + /*
41433 + * inode wasn't found at the key encoded in the file
41434 + * handle. Hence, file handle is stale.
41435 + */
41436 + dentry = ERR_PTR(RETERR(-ESTALE));
41437 + else
41438 + dentry = (void *)inode;
41439 + return dentry;
41440 +}
41441 +
41442 +int wire_size_common(struct inode *inode)
41443 +{
41444 + return inode_onwire_size(inode);
41445 +}
41446 +
41447 +void wire_done_common(reiser4_object_on_wire * obj)
41448 +{
41449 + /* nothing to do */
41450 +}
41451 +
41452 +/* helper function to print errors */
41453 +static void key_warning(const reiser4_key * key /* key to print */ ,
41454 + const struct inode *inode,
41455 + int code /* error code to print */ )
41456 +{
41457 + assert("nikita-716", key != NULL);
41458 +
41459 + if (code != -ENOMEM) {
41460 + warning("nikita-717", "Error for inode %llu (%i)",
41461 + (unsigned long long)get_key_objectid(key), code);
41462 + reiser4_print_key("for key", key);
41463 + }
41464 +}
41465 +
41466 +/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
41467 +#if REISER4_DEBUG
41468 +static void
41469 +check_inode_seal(const struct inode *inode,
41470 + const coord_t * coord, const reiser4_key * key)
41471 +{
41472 + reiser4_key unit_key;
41473 +
41474 + unit_key_by_coord(coord, &unit_key);
41475 + assert("nikita-2752",
41476 + WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
41477 + assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
41478 +}
41479 +
41480 +static void check_sd_coord(coord_t * coord, const reiser4_key * key)
41481 +{
41482 + reiser4_key ukey;
41483 +
41484 + coord_clear_iplug(coord);
41485 + if (zload(coord->node))
41486 + return;
41487 +
41488 + if (!coord_is_existing_unit(coord) ||
41489 + !item_plugin_by_coord(coord) ||
41490 + !keyeq(unit_key_by_coord(coord, &ukey), key) ||
41491 + (znode_get_level(coord->node) != LEAF_LEVEL) ||
41492 + !item_is_statdata(coord)) {
41493 + warning("nikita-1901", "Conspicuous seal");
41494 + reiser4_print_key("key", key);
41495 + print_coord("coord", coord, 1);
41496 + impossible("nikita-2877", "no way");
41497 + }
41498 + zrelse(coord->node);
41499 +}
41500 +
41501 +#else
41502 +#define check_inode_seal(inode, coord, key) noop
41503 +#define check_sd_coord(coord, key) noop
41504 +#endif
41505 +
41506 +/* insert new stat-data into tree. Called with inode state
41507 + locked. Return inode state locked. */
41508 +static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
41509 +{
41510 + int result;
41511 + reiser4_key key;
41512 + coord_t coord;
41513 + reiser4_item_data data;
41514 + char *area;
41515 + reiser4_inode *ref;
41516 + lock_handle lh;
41517 + oid_t oid;
41518 +
41519 + assert("nikita-723", inode != NULL);
41520 + assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
41521 +
41522 + ref = reiser4_inode_data(inode);
41523 + spin_lock_inode(inode);
41524 +
41525 + if (ref->plugin_mask != 0)
41526 + /* inode has non-standard plugins */
41527 + inode_set_extension(inode, PLUGIN_STAT);
41528 + /*
41529 + * prepare specification of new item to be inserted
41530 + */
41531 +
41532 + data.iplug = inode_sd_plugin(inode);
41533 + data.length = data.iplug->s.sd.save_len(inode);
41534 + spin_unlock_inode(inode);
41535 +
41536 + data.data = NULL;
41537 + data.user = 0;
41538 +/* could be optimized for case where there is only one node format in
41539 + * use in the filesystem, probably there are lots of such
41540 + * places we could optimize for only one node layout.... -Hans */
41541 + if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){
41542 + /* This is silly check, but we don't know actual node where
41543 + insertion will go into. */
41544 + return RETERR(-ENAMETOOLONG);
41545 + }
41546 + oid = oid_allocate(inode->i_sb);
41547 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
41548 + if (oid == ABSOLUTE_MAX_OID)
41549 + return RETERR(-EOVERFLOW);
41550 +
41551 + set_inode_oid(inode, oid);
41552 +
41553 + coord_init_zero(&coord);
41554 + init_lh(&lh);
41555 +
41556 + result = insert_by_key(reiser4_tree_by_inode(inode),
41557 + build_sd_key(inode, &key), &data, &coord, &lh,
41558 + /* stat data lives on a leaf level */
41559 + LEAF_LEVEL, CBK_UNIQUE);
41560 +
41561 + /* we don't want to re-check that somebody didn't insert
41562 + stat-data while we were doing io, because if it did,
41563 + insert_by_key() returned error. */
41564 + /* but what _is_ possible is that plugin for inode's stat-data,
41565 + list of non-standard plugins or their state would change
41566 + during io, so that stat-data wouldn't fit into sd. To avoid
41567 + this race we keep inode_state lock. This lock has to be
41568 + taken each time you access inode in a way that would cause
41569 + changes in sd size: changing plugins etc.
41570 + */
41571 +
41572 + if (result == IBK_INSERT_OK) {
41573 + coord_clear_iplug(&coord);
41574 + result = zload(coord.node);
41575 + if (result == 0) {
41576 + /* have we really inserted stat data? */
41577 + assert("nikita-725", item_is_statdata(&coord));
41578 +
41579 + /* inode was just created. It is inserted into hash
41580 + table, but no directory entry was yet inserted into
41581 + parent. So, inode is inaccessible through
41582 + ->lookup(). All places that directly grab inode
41583 + from hash-table (like old knfsd), should check
41584 + IMMUTABLE flag that is set by common_create_child.
41585 + */
41586 + assert("nikita-3240", data.iplug != NULL);
41587 + assert("nikita-3241", data.iplug->s.sd.save != NULL);
41588 + area = item_body_by_coord(&coord);
41589 + result = data.iplug->s.sd.save(inode, &area);
41590 + znode_make_dirty(coord.node);
41591 + if (result == 0) {
41592 + /* object has stat-data now */
41593 + reiser4_inode_clr_flag(inode, REISER4_NO_SD);
41594 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41595 + /* initialise stat-data seal */
41596 + reiser4_seal_init(&ref->sd_seal, &coord, &key);
41597 + ref->sd_coord = coord;
41598 + check_inode_seal(inode, &coord, &key);
41599 + } else if (result != -ENOMEM)
41600 + /*
41601 + * convert any other error code to -EIO to
41602 + * avoid confusing user level with unexpected
41603 + * errors.
41604 + */
41605 + result = RETERR(-EIO);
41606 + zrelse(coord.node);
41607 + }
41608 + }
41609 + done_lh(&lh);
41610 +
41611 + if (result != 0)
41612 + key_warning(&key, inode, result);
41613 + else
41614 + oid_count_allocated();
41615 +
41616 + return result;
41617 +}
41618 +
41619 +/* find sd of inode in a tree, deal with errors */
41620 +int lookup_sd(struct inode *inode /* inode to look sd for */ ,
41621 + znode_lock_mode lock_mode /* lock mode */ ,
41622 + coord_t * coord /* resulting coord */ ,
41623 + lock_handle * lh /* resulting lock handle */ ,
41624 + const reiser4_key * key /* resulting key */ ,
41625 + int silent)
41626 +{
41627 + int result;
41628 + __u32 flags;
41629 +
41630 + assert("nikita-1692", inode != NULL);
41631 + assert("nikita-1693", coord != NULL);
41632 + assert("nikita-1694", key != NULL);
41633 +
41634 + /* look for the object's stat data in a tree.
41635 + This returns in "node" pointer to a locked znode and in "pos"
41636 + position of an item found in node. Both are only valid if
41637 + coord_found is returned. */
41638 + flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
41639 + flags |= CBK_UNIQUE;
41640 + /*
41641 + * traverse tree to find stat data. We cannot use vroot here, because
41642 + * it only covers _body_ of the file, and stat data don't belong
41643 + * there.
41644 + */
41645 + result = coord_by_key(reiser4_tree_by_inode(inode),
41646 + key,
41647 + coord,
41648 + lh,
41649 + lock_mode,
41650 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
41651 + if (REISER4_DEBUG && result == 0)
41652 + check_sd_coord(coord, key);
41653 +
41654 + if (result != 0 && !silent)
41655 + key_warning(key, inode, result);
41656 + return result;
41657 +}
41658 +
41659 +static int
41660 +locate_inode_sd(struct inode *inode,
41661 + reiser4_key * key, coord_t * coord, lock_handle * lh)
41662 +{
41663 + reiser4_inode *state;
41664 + seal_t seal;
41665 + int result;
41666 +
41667 + assert("nikita-3483", inode != NULL);
41668 +
41669 + state = reiser4_inode_data(inode);
41670 + spin_lock_inode(inode);
41671 + *coord = state->sd_coord;
41672 + coord_clear_iplug(coord);
41673 + seal = state->sd_seal;
41674 + spin_unlock_inode(inode);
41675 +
41676 + build_sd_key(inode, key);
41677 + if (reiser4_seal_is_set(&seal)) {
41678 + /* first, try to use seal */
41679 + result = reiser4_seal_validate(&seal,
41680 + coord,
41681 + key,
41682 + lh, ZNODE_WRITE_LOCK,
41683 + ZNODE_LOCK_LOPRI);
41684 + if (result == 0)
41685 + check_sd_coord(coord, key);
41686 + } else
41687 + result = -E_REPEAT;
41688 +
41689 + if (result != 0) {
41690 + coord_init_zero(coord);
41691 + result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
41692 + }
41693 + return result;
41694 +}
41695 +
41696 +#if REISER4_DEBUG
41697 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
41698 +{
41699 + return (get_key_locality(k1) == get_key_locality(k2) &&
41700 + get_key_type(k1) == get_key_type(k2) &&
41701 + get_key_band(k1) == get_key_band(k2) &&
41702 + get_key_ordering(k1) == get_key_ordering(k2) &&
41703 + get_key_objectid(k1) == get_key_objectid(k2));
41704 +}
41705 +
41706 +#include "../tree_walk.h"
41707 +
41708 +/* make some checks before and after stat-data resize operation */
41709 +static int check_sd_resize(struct inode * inode, coord_t * coord,
41710 + int length, int progress /* 1 means after resize */)
41711 +{
41712 + int ret = 0;
41713 + lock_handle left_lock;
41714 + coord_t left_coord;
41715 + reiser4_key left_key;
41716 + reiser4_key key;
41717 +
41718 + if (inode_file_plugin(inode) !=
41719 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
41720 + return 0;
41721 + if (!length)
41722 + return 0;
41723 + if (coord->item_pos != 0)
41724 + return 0;
41725 +
41726 + init_lh(&left_lock);
41727 + ret = reiser4_get_left_neighbor(&left_lock,
41728 + coord->node,
41729 + ZNODE_WRITE_LOCK,
41730 + GN_CAN_USE_UPPER_LEVELS);
41731 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
41732 + ret == -ENOENT || ret == -EINVAL
41733 + || ret == -E_DEADLOCK) {
41734 + ret = 0;
41735 + goto exit;
41736 + }
41737 + ret = zload(left_lock.node);
41738 + if (ret)
41739 + goto exit;
41740 + coord_init_last_unit(&left_coord, left_lock.node);
41741 + item_key_by_coord(&left_coord, &left_key);
41742 + item_key_by_coord(coord, &key);
41743 +
41744 + if (all_but_offset_key_eq(&key, &left_key))
41745 + /* corruption occured */
41746 + ret = 1;
41747 + zrelse(left_lock.node);
41748 + exit:
41749 + done_lh(&left_lock);
41750 + return ret;
41751 +}
41752 +#endif
41753 +
41754 +/* update stat-data at @coord */
41755 +static int
41756 +update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
41757 + lock_handle * lh)
41758 +{
41759 + int result;
41760 + reiser4_item_data data;
41761 + char *area;
41762 + reiser4_inode *state;
41763 + znode *loaded;
41764 +
41765 + state = reiser4_inode_data(inode);
41766 +
41767 + coord_clear_iplug(coord);
41768 + result = zload(coord->node);
41769 + if (result != 0)
41770 + return result;
41771 + loaded = coord->node;
41772 +
41773 + spin_lock_inode(inode);
41774 + assert("nikita-728", inode_sd_plugin(inode) != NULL);
41775 + data.iplug = inode_sd_plugin(inode);
41776 +
41777 + /* if inode has non-standard plugins, add appropriate stat data
41778 + * extension */
41779 + if (state->extmask & (1 << PLUGIN_STAT)) {
41780 + if (state->plugin_mask == 0)
41781 + inode_clr_extension(inode, PLUGIN_STAT);
41782 + } else if (state->plugin_mask != 0)
41783 + inode_set_extension(inode, PLUGIN_STAT);
41784 +
41785 + if (state->extmask & (1 << HEIR_STAT)) {
41786 + if (state->heir_mask == 0)
41787 + inode_clr_extension(inode, HEIR_STAT);
41788 + } else if (state->heir_mask != 0)
41789 + inode_set_extension(inode, HEIR_STAT);
41790 +
41791 + /* data.length is how much space to add to (or remove
41792 + from if negative) sd */
41793 + if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
41794 + /* recalculate stat-data length */
41795 + data.length =
41796 + data.iplug->s.sd.save_len(inode) -
41797 + item_length_by_coord(coord);
41798 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41799 + } else
41800 + data.length = 0;
41801 + spin_unlock_inode(inode);
41802 +
41803 + /* if on-disk stat data is of different length than required
41804 + for this inode, resize it */
41805 +
41806 + if (data.length != 0) {
41807 + data.data = NULL;
41808 + data.user = 0;
41809 +
41810 + assert("edward-1441",
41811 + !check_sd_resize(inode, coord,
41812 + data.length, 0/* before resize */));
41813 +
41814 + /* insertion code requires that insertion point (coord) was
41815 + * between units. */
41816 + coord->between = AFTER_UNIT;
41817 + result = reiser4_resize_item(coord, &data, key, lh,
41818 + COPI_DONT_SHIFT_LEFT);
41819 + if (result != 0) {
41820 + key_warning(key, inode, result);
41821 + zrelse(loaded);
41822 + return result;
41823 + }
41824 + if (loaded != coord->node) {
41825 + /* reiser4_resize_item moved coord to another node.
41826 + Zload it */
41827 + zrelse(loaded);
41828 + coord_clear_iplug(coord);
41829 + result = zload(coord->node);
41830 + if (result != 0)
41831 + return result;
41832 + loaded = coord->node;
41833 + }
41834 + assert("edward-1442",
41835 + !check_sd_resize(inode, coord,
41836 + data.length, 1/* after resize */));
41837 + }
41838 + area = item_body_by_coord(coord);
41839 + spin_lock_inode(inode);
41840 + result = data.iplug->s.sd.save(inode, &area);
41841 + znode_make_dirty(coord->node);
41842 +
41843 + /* re-initialise stat-data seal */
41844 +
41845 + /*
41846 + * coord.between was possibly skewed from AT_UNIT when stat-data size
41847 + * was changed and new extensions were pasted into item.
41848 + */
41849 + coord->between = AT_UNIT;
41850 + reiser4_seal_init(&state->sd_seal, coord, key);
41851 + state->sd_coord = *coord;
41852 + spin_unlock_inode(inode);
41853 + check_inode_seal(inode, coord, key);
41854 + zrelse(loaded);
41855 + return result;
41856 +}
41857 +
41858 +/* Update existing stat-data in a tree. Called with inode state locked. Return
41859 + inode state locked. */
41860 +static int update_sd(struct inode *inode /* inode to update sd for */ )
41861 +{
41862 + int result;
41863 + reiser4_key key;
41864 + coord_t coord;
41865 + lock_handle lh;
41866 +
41867 + assert("nikita-726", inode != NULL);
41868 +
41869 + /* no stat-data, nothing to update?! */
41870 + assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
41871 +
41872 + init_lh(&lh);
41873 +
41874 + result = locate_inode_sd(inode, &key, &coord, &lh);
41875 + if (result == 0)
41876 + result = update_sd_at(inode, &coord, &key, &lh);
41877 + done_lh(&lh);
41878 +
41879 + return result;
41880 +}
41881 +
41882 +/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
41883 + Remove object stat data. Space for that must be reserved by caller before
41884 +*/
41885 +static int
41886 +common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
41887 +{
41888 + int result;
41889 +
41890 + assert("nikita-1477", inode != NULL);
41891 +
41892 + if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
41893 + reiser4_key sd_key;
41894 +
41895 + DQUOT_FREE_INODE(inode);
41896 + DQUOT_DROP(inode);
41897 +
41898 + build_sd_key(inode, &sd_key);
41899 + result =
41900 + reiser4_cut_tree(reiser4_tree_by_inode(inode),
41901 + &sd_key, &sd_key, NULL, 0);
41902 + if (result == 0) {
41903 + reiser4_inode_set_flag(inode, REISER4_NO_SD);
41904 + result = oid_release(inode->i_sb, get_inode_oid(inode));
41905 + if (result == 0) {
41906 + oid_count_released();
41907 +
41908 + result = safe_link_del(reiser4_tree_by_inode(inode),
41909 + get_inode_oid(inode),
41910 + SAFE_UNLINK);
41911 + }
41912 + }
41913 + } else
41914 + result = 0;
41915 + return result;
41916 +}
41917 +
41918 +/* helper for safelink_common */
41919 +static int process_truncate(struct inode *inode, __u64 size)
41920 +{
41921 + int result;
41922 + struct iattr attr;
41923 + file_plugin *fplug;
41924 + reiser4_context *ctx;
41925 + struct dentry dentry;
41926 +
41927 + assert("vs-21", is_in_reiser4_context());
41928 + ctx = reiser4_init_context(inode->i_sb);
41929 + assert("vs-22", !IS_ERR(ctx));
41930 +
41931 + attr.ia_size = size;
41932 + attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
41933 + fplug = inode_file_plugin(inode);
41934 +
41935 + mutex_lock(&inode->i_mutex);
41936 + assert("vs-1704", get_current_context()->trans->atom == NULL);
41937 + dentry.d_inode = inode;
41938 + result = inode->i_op->setattr(&dentry, &attr);
41939 + mutex_unlock(&inode->i_mutex);
41940 +
41941 + context_set_commit_async(ctx);
41942 + reiser4_exit_context(ctx);
41943 +
41944 + return result;
41945 +}
41946 +
41947 +/*
41948 + Local variables:
41949 + c-indentation-style: "K&R"
41950 + mode-name: "LC"
41951 + c-basic-offset: 8
41952 + tab-width: 8
41953 + fill-column: 80
41954 + scroll-step: 1
41955 + End:
41956 +*/
41957 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/hash.c linux-2.6.20/fs/reiser4/plugin/hash.c
41958 --- linux-2.6.20.orig/fs/reiser4/plugin/hash.c 1970-01-01 03:00:00.000000000 +0300
41959 +++ linux-2.6.20/fs/reiser4/plugin/hash.c 2007-05-06 14:50:43.791004471 +0400
41960 @@ -0,0 +1,353 @@
41961 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
41962 + * reiser4/README */
41963 +
41964 +/* Hash functions */
41965 +
41966 +#include "../debug.h"
41967 +#include "plugin_header.h"
41968 +#include "plugin.h"
41969 +#include "../super.h"
41970 +#include "../inode.h"
41971 +
41972 +#include <linux/types.h>
41973 +
41974 +/* old rupasov (yura) hash */
41975 +static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
41976 + int len /* @name's length */ )
41977 +{
41978 + int i;
41979 + int j;
41980 + int pow;
41981 + __u64 a;
41982 + __u64 c;
41983 +
41984 + assert("nikita-672", name != NULL);
41985 + assert("nikita-673", len >= 0);
41986 +
41987 + for (pow = 1, i = 1; i < len; ++i)
41988 + pow = pow * 10;
41989 +
41990 + if (len == 1)
41991 + a = name[0] - 48;
41992 + else
41993 + a = (name[0] - 48) * pow;
41994 +
41995 + for (i = 1; i < len; ++i) {
41996 + c = name[i] - 48;
41997 + for (pow = 1, j = i; j < len - 1; ++j)
41998 + pow = pow * 10;
41999 + a = a + c * pow;
42000 + }
42001 + for (; i < 40; ++i) {
42002 + c = '0' - 48;
42003 + for (pow = 1, j = i; j < len - 1; ++j)
42004 + pow = pow * 10;
42005 + a = a + c * pow;
42006 + }
42007 +
42008 + for (; i < 256; ++i) {
42009 + c = i;
42010 + for (pow = 1, j = i; j < len - 1; ++j)
42011 + pow = pow * 10;
42012 + a = a + c * pow;
42013 + }
42014 +
42015 + a = a << 7;
42016 + return a;
42017 +}
42018 +
42019 +/* r5 hash */
42020 +static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
42021 + int len UNUSED_ARG /* @name's length */ )
42022 +{
42023 + __u64 a = 0;
42024 +
42025 + assert("nikita-674", name != NULL);
42026 + assert("nikita-675", len >= 0);
42027 +
42028 + while (*name) {
42029 + a += *name << 4;
42030 + a += *name >> 4;
42031 + a *= 11;
42032 + name++;
42033 + }
42034 + return a;
42035 +}
42036 +
42037 +/* Keyed 32-bit hash function using TEA in a Davis-Meyer function
42038 + H0 = Key
42039 + Hi = E Mi(Hi-1) + Hi-1
42040 +
42041 + (see Applied Cryptography, 2nd edition, p448).
42042 +
42043 + Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
42044 +
42045 + Jeremy has agreed to the contents of reiserfs/README. -Hans
42046 +
42047 + This code was blindly upgraded to __u64 by s/__u32/__u64/g.
42048 +*/
42049 +static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
42050 + int len /* @name's length */ )
42051 +{
42052 + __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
42053 +
42054 + __u64 h0 = k[0], h1 = k[1];
42055 + __u64 a, b, c, d;
42056 + __u64 pad;
42057 + int i;
42058 +
42059 + assert("nikita-676", name != NULL);
42060 + assert("nikita-677", len >= 0);
42061 +
42062 +#define DELTA 0x9E3779B9u
42063 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
42064 +#define PARTROUNDS 6 /* 6 gets complete mixing */
42065 +
42066 +/* a, b, c, d - data; h0, h1 - accumulated hash */
42067 +#define TEACORE(rounds) \
42068 + do { \
42069 + __u64 sum = 0; \
42070 + int n = rounds; \
42071 + __u64 b0, b1; \
42072 + \
42073 + b0 = h0; \
42074 + b1 = h1; \
42075 + \
42076 + do \
42077 + { \
42078 + sum += DELTA; \
42079 + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
42080 + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
42081 + } while(--n); \
42082 + \
42083 + h0 += b0; \
42084 + h1 += b1; \
42085 + } while(0)
42086 +
42087 + pad = (__u64) len | ((__u64) len << 8);
42088 + pad |= pad << 16;
42089 +
42090 + while (len >= 16) {
42091 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42092 + 16 | (__u64) name[3] << 24;
42093 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42094 + 16 | (__u64) name[7] << 24;
42095 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42096 + 16 | (__u64) name[11] << 24;
42097 + d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
42098 + << 16 | (__u64) name[15] << 24;
42099 +
42100 + TEACORE(PARTROUNDS);
42101 +
42102 + len -= 16;
42103 + name += 16;
42104 + }
42105 +
42106 + if (len >= 12) {
42107 + //assert(len < 16);
42108 + if (len >= 16)
42109 + *(int *)0 = 0;
42110 +
42111 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42112 + 16 | (__u64) name[3] << 24;
42113 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42114 + 16 | (__u64) name[7] << 24;
42115 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42116 + 16 | (__u64) name[11] << 24;
42117 +
42118 + d = pad;
42119 + for (i = 12; i < len; i++) {
42120 + d <<= 8;
42121 + d |= name[i];
42122 + }
42123 + } else if (len >= 8) {
42124 + //assert(len < 12);
42125 + if (len >= 12)
42126 + *(int *)0 = 0;
42127 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42128 + 16 | (__u64) name[3] << 24;
42129 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42130 + 16 | (__u64) name[7] << 24;
42131 +
42132 + c = d = pad;
42133 + for (i = 8; i < len; i++) {
42134 + c <<= 8;
42135 + c |= name[i];
42136 + }
42137 + } else if (len >= 4) {
42138 + //assert(len < 8);
42139 + if (len >= 8)
42140 + *(int *)0 = 0;
42141 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42142 + 16 | (__u64) name[3] << 24;
42143 +
42144 + b = c = d = pad;
42145 + for (i = 4; i < len; i++) {
42146 + b <<= 8;
42147 + b |= name[i];
42148 + }
42149 + } else {
42150 + //assert(len < 4);
42151 + if (len >= 4)
42152 + *(int *)0 = 0;
42153 + a = b = c = d = pad;
42154 + for (i = 0; i < len; i++) {
42155 + a <<= 8;
42156 + a |= name[i];
42157 + }
42158 + }
42159 +
42160 + TEACORE(FULLROUNDS);
42161 +
42162 +/* return 0;*/
42163 + return h0 ^ h1;
42164 +
42165 +}
42166 +
42167 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
42168 +
42169 + See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
42170 +
42171 + Excerpts:
42172 +
42173 + FNV hashes are designed to be fast while maintaining a low collision
42174 + rate.
42175 +
42176 + [This version also seems to preserve lexicographical order locally.]
42177 +
42178 + FNV hash algorithms and source code have been released into the public
42179 + domain.
42180 +
42181 +*/
42182 +static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
42183 + int len UNUSED_ARG /* @name's length */ )
42184 +{
42185 + unsigned long long a = 0xcbf29ce484222325ull;
42186 + const unsigned long long fnv_64_prime = 0x100000001b3ull;
42187 +
42188 + assert("nikita-678", name != NULL);
42189 + assert("nikita-679", len >= 0);
42190 +
42191 + /* FNV-1 hash each octet in the buffer */
42192 + for (; *name; ++name) {
42193 + /* multiply by the 32 bit FNV magic prime mod 2^64 */
42194 + a *= fnv_64_prime;
42195 + /* xor the bottom with the current octet */
42196 + a ^= (unsigned long long)(*name);
42197 + }
42198 + /* return our new hash value */
42199 + return a;
42200 +}
42201 +
42202 +/* degenerate hash function used to simplify testing of non-unique key
42203 + handling */
42204 +static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
42205 + int len UNUSED_ARG /* @name's length */ )
42206 +{
42207 + return 0xc0c0c0c010101010ull;
42208 +}
42209 +
42210 +static int change_hash(struct inode *inode,
42211 + reiser4_plugin * plugin,
42212 + pset_member memb)
42213 +{
42214 + int result;
42215 +
42216 + assert("nikita-3503", inode != NULL);
42217 + assert("nikita-3504", plugin != NULL);
42218 +
42219 + assert("nikita-3505", is_reiser4_inode(inode));
42220 + assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
42221 +
42222 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
42223 + return RETERR(-EINVAL);
42224 +
42225 + result = 0;
42226 + if (inode_hash_plugin(inode) == NULL ||
42227 + inode_hash_plugin(inode)->h.id != plugin->h.id) {
42228 + if (is_dir_empty(inode) == 0)
42229 + result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
42230 + PSET_HASH, plugin);
42231 + else
42232 + result = RETERR(-ENOTEMPTY);
42233 +
42234 + }
42235 + return result;
42236 +}
42237 +
42238 +static reiser4_plugin_ops hash_plugin_ops = {
42239 + .init = NULL,
42240 + .load = NULL,
42241 + .save_len = NULL,
42242 + .save = NULL,
42243 + .change = change_hash
42244 +};
42245 +
42246 +/* hash plugins */
42247 +hash_plugin hash_plugins[LAST_HASH_ID] = {
42248 + [RUPASOV_HASH_ID] = {
42249 + .h = {
42250 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42251 + .id = RUPASOV_HASH_ID,
42252 + .pops = &hash_plugin_ops,
42253 + .label = "rupasov",
42254 + .desc = "Original Yura's hash",
42255 + .linkage = {NULL, NULL}
42256 + },
42257 + .hash = hash_rupasov
42258 + },
42259 + [R5_HASH_ID] = {
42260 + .h = {
42261 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42262 + .id = R5_HASH_ID,
42263 + .pops = &hash_plugin_ops,
42264 + .label = "r5",
42265 + .desc = "r5 hash",
42266 + .linkage = {NULL, NULL}
42267 + },
42268 + .hash = hash_r5
42269 + },
42270 + [TEA_HASH_ID] = {
42271 + .h = {
42272 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42273 + .id = TEA_HASH_ID,
42274 + .pops = &hash_plugin_ops,
42275 + .label = "tea",
42276 + .desc = "tea hash",
42277 + .linkage = {NULL, NULL}
42278 + },
42279 + .hash = hash_tea
42280 + },
42281 + [FNV1_HASH_ID] = {
42282 + .h = {
42283 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42284 + .id = FNV1_HASH_ID,
42285 + .pops = &hash_plugin_ops,
42286 + .label = "fnv1",
42287 + .desc = "fnv1 hash",
42288 + .linkage = {NULL, NULL}
42289 + },
42290 + .hash = hash_fnv1
42291 + },
42292 + [DEGENERATE_HASH_ID] = {
42293 + .h = {
42294 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42295 + .id = DEGENERATE_HASH_ID,
42296 + .pops = &hash_plugin_ops,
42297 + .label = "degenerate hash",
42298 + .desc = "Degenerate hash: only for testing",
42299 + .linkage = {NULL, NULL}
42300 + },
42301 + .hash = hash_deg
42302 + }
42303 +};
42304 +
42305 +/* Make Linus happy.
42306 + Local variables:
42307 + c-indentation-style: "K&R"
42308 + mode-name: "LC"
42309 + c-basic-offset: 8
42310 + tab-width: 8
42311 + fill-column: 120
42312 + End:
42313 +*/
42314 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.20/fs/reiser4/plugin/inode_ops.c
42315 --- linux-2.6.20.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 03:00:00.000000000 +0300
42316 +++ linux-2.6.20/fs/reiser4/plugin/inode_ops.c 2007-05-06 14:50:43.795005721 +0400
42317 @@ -0,0 +1,897 @@
42318 +/*
42319 + * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
42320 + */
42321 +
42322 +/*
42323 + * this file contains typical implementations for most of methods of struct
42324 + * inode_operations
42325 + */
42326 +
42327 +#include "../inode.h"
42328 +#include "../safe_link.h"
42329 +
42330 +#include <linux/quotaops.h>
42331 +#include <linux/namei.h>
42332 +
42333 +static int create_vfs_object(struct inode *parent, struct dentry *dentry,
42334 + reiser4_object_create_data *data);
42335 +
42336 +/**
42337 + * reiser4_create_common - create of inode operations
42338 + * @parent: inode of parent directory
42339 + * @dentry: dentry of new object to create
42340 + * @mode: the permissions to use
42341 + * @nameidata:
42342 + *
42343 + * This is common implementation of vfs's create method of struct
42344 + * inode_operations.
42345 + * Creates regular file using file plugin from parent directory plugin set.
42346 + */
42347 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
42348 + int mode, struct nameidata *nameidata)
42349 +{
42350 + reiser4_object_create_data data;
42351 + file_plugin *fplug;
42352 +
42353 + memset(&data, 0, sizeof data);
42354 + data.mode = S_IFREG | mode;
42355 + fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
42356 + if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
42357 + warning("vpf-1900", "'%s' is not a regular file plugin.",
42358 + fplug->h.label);
42359 + return RETERR(-EIO);
42360 + }
42361 + data.id = fplug->h.id;
42362 + return create_vfs_object(parent, dentry, &data);
42363 +}
42364 +
42365 +int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
42366 +void check_light_weight(struct inode *inode, struct inode *parent);
42367 +
42368 +/**
42369 + * reiser4_lookup_common - lookup of inode operations
42370 + * @parent: inode of directory to lookup into
42371 + * @dentry: name to look for
42372 + * @nameidata:
42373 + *
42374 + * This is common implementation of vfs's lookup method of struct
42375 + * inode_operations.
42376 + */
42377 +struct dentry *reiser4_lookup_common(struct inode *parent,
42378 + struct dentry *dentry,
42379 + struct nameidata *nameidata)
42380 +{
42381 + reiser4_context *ctx;
42382 + int result;
42383 + struct dentry *new;
42384 + struct inode *inode;
42385 + reiser4_dir_entry_desc entry;
42386 +
42387 + ctx = reiser4_init_context(parent->i_sb);
42388 + if (IS_ERR(ctx))
42389 + return (struct dentry *)ctx;
42390 +
42391 + /* set up operations on dentry. */
42392 + dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
42393 +
42394 + result = reiser4_lookup_name(parent, dentry, &entry.key);
42395 + if (result) {
42396 + context_set_commit_async(ctx);
42397 + reiser4_exit_context(ctx);
42398 + if (result == -ENOENT) {
42399 + /* object not found */
42400 + if (!IS_DEADDIR(parent))
42401 + d_add(dentry, NULL);
42402 + return NULL;
42403 + }
42404 + return ERR_PTR(result);
42405 + }
42406 +
42407 + inode = reiser4_iget(parent->i_sb, &entry.key, 0);
42408 + if (IS_ERR(inode)) {
42409 + context_set_commit_async(ctx);
42410 + reiser4_exit_context(ctx);
42411 + return ERR_PTR(PTR_ERR(inode));
42412 + }
42413 +
42414 + /* success */
42415 + check_light_weight(inode, parent);
42416 + new = d_splice_alias(inode, dentry);
42417 + reiser4_iget_complete(inode);
42418 +
42419 + /* prevent balance_dirty_pages() from being called: we don't want to
42420 + * do this under directory i_mutex. */
42421 + context_set_commit_async(ctx);
42422 + reiser4_exit_context(ctx);
42423 + return new;
42424 +}
42425 +
42426 +static reiser4_block_nr common_estimate_link(struct inode *parent,
42427 + struct inode *object);
42428 +int reiser4_update_dir(struct inode *);
42429 +
42430 +/**
42431 + * reiser4_link_common - link of inode operations
42432 + * @existing: dentry of object which is to get new name
42433 + * @parent: directory where new name is to be created
42434 + * @newname: new name
42435 + *
42436 + * This is common implementation of vfs's link method of struct
42437 + * inode_operations.
42438 + */
42439 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
42440 + struct dentry *newname)
42441 +{
42442 + reiser4_context *ctx;
42443 + int result;
42444 + struct inode *object;
42445 + dir_plugin *parent_dplug;
42446 + reiser4_dir_entry_desc entry;
42447 + reiser4_object_create_data data;
42448 + reiser4_block_nr reserve;
42449 +
42450 + ctx = reiser4_init_context(parent->i_sb);
42451 + if (IS_ERR(ctx))
42452 + return PTR_ERR(ctx);
42453 +
42454 + assert("nikita-1431", existing != NULL);
42455 + assert("nikita-1432", parent != NULL);
42456 + assert("nikita-1433", newname != NULL);
42457 +
42458 + object = existing->d_inode;
42459 + assert("nikita-1434", object != NULL);
42460 +
42461 + /* check for race with create_object() */
42462 + if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
42463 + context_set_commit_async(ctx);
42464 + reiser4_exit_context(ctx);
42465 + return RETERR(-E_REPEAT);
42466 + }
42467 +
42468 + parent_dplug = inode_dir_plugin(parent);
42469 +
42470 + memset(&entry, 0, sizeof entry);
42471 + entry.obj = object;
42472 +
42473 + data.mode = object->i_mode;
42474 + data.id = inode_file_plugin(object)->h.id;
42475 +
42476 + reserve = common_estimate_link(parent, existing->d_inode);
42477 + if ((__s64) reserve < 0) {
42478 + context_set_commit_async(ctx);
42479 + reiser4_exit_context(ctx);
42480 + return reserve;
42481 + }
42482 +
42483 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42484 + context_set_commit_async(ctx);
42485 + reiser4_exit_context(ctx);
42486 + return RETERR(-ENOSPC);
42487 + }
42488 +
42489 + /*
42490 + * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
42491 + * means that link(2) can race against unlink(2) or rename(2), and
42492 + * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
42493 + *
42494 + * For such inode we have to undo special processing done in
42495 + * reiser4_unlink() viz. creation of safe-link.
42496 + */
42497 + if (unlikely(object->i_nlink == 0)) {
42498 + result = safe_link_del(reiser4_tree_by_inode(object),
42499 + get_inode_oid(object), SAFE_UNLINK);
42500 + if (result != 0) {
42501 + context_set_commit_async(ctx);
42502 + reiser4_exit_context(ctx);
42503 + return result;
42504 + }
42505 + }
42506 +
42507 + /* increment nlink of @existing and update its stat data */
42508 + result = reiser4_add_nlink(object, parent, 1);
42509 + if (result == 0) {
42510 + /* add entry to the parent */
42511 + result =
42512 + parent_dplug->add_entry(parent, newname, &data, &entry);
42513 + if (result != 0) {
42514 + /* failed to add entry to the parent, decrement nlink
42515 + of @existing */
42516 + reiser4_del_nlink(object, parent, 1);
42517 + /*
42518 + * now, if that failed, we have a file with too big
42519 + * nlink---space leak, much better than directory
42520 + * entry pointing to nowhere
42521 + */
42522 + }
42523 + }
42524 + if (result == 0) {
42525 + atomic_inc(&object->i_count);
42526 + /*
42527 + * Upon successful completion, link() shall mark for update
42528 + * the st_ctime field of the file. Also, the st_ctime and
42529 + * st_mtime fields of the directory that contains the new
42530 + * entry shall be marked for update. --SUS
42531 + */
42532 + result = reiser4_update_dir(parent);
42533 + }
42534 + if (result == 0)
42535 + d_instantiate(newname, existing->d_inode);
42536 +
42537 + context_set_commit_async(ctx);
42538 + reiser4_exit_context(ctx);
42539 + return result;
42540 +}
42541 +
42542 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
42543 +
42544 +/**
42545 + * reiser4_unlink_common - unlink of inode operations
42546 + * @parent: inode of directory to remove name from
42547 + * @victim: name to be removed
42548 + *
42549 + * This is common implementation of vfs's unlink method of struct
42550 + * inode_operations.
42551 + */
42552 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
42553 +{
42554 + reiser4_context *ctx;
42555 + int result;
42556 + struct inode *object;
42557 + file_plugin *fplug;
42558 +
42559 + ctx = reiser4_init_context(parent->i_sb);
42560 + if (IS_ERR(ctx))
42561 + return PTR_ERR(ctx);
42562 +
42563 + object = victim->d_inode;
42564 + fplug = inode_file_plugin(object);
42565 + assert("nikita-2882", fplug->detach != NULL);
42566 +
42567 + result = unlink_check_and_grab(parent, victim);
42568 + if (result != 0) {
42569 + context_set_commit_async(ctx);
42570 + reiser4_exit_context(ctx);
42571 + return result;
42572 + }
42573 +
42574 + result = fplug->detach(object, parent);
42575 + if (result == 0) {
42576 + dir_plugin *parent_dplug;
42577 + reiser4_dir_entry_desc entry;
42578 +
42579 + parent_dplug = inode_dir_plugin(parent);
42580 + memset(&entry, 0, sizeof entry);
42581 +
42582 + /* first, delete directory entry */
42583 + result = parent_dplug->rem_entry(parent, victim, &entry);
42584 + if (result == 0) {
42585 + /*
42586 + * if name was removed successfully, we _have_ to
42587 + * return 0 from this function, because upper level
42588 + * caller (vfs_{rmdir,unlink}) expect this.
42589 + *
42590 + * now that directory entry is removed, update
42591 + * stat-data
42592 + */
42593 + reiser4_del_nlink(object, parent, 1);
42594 + /*
42595 + * Upon successful completion, unlink() shall mark for
42596 + * update the st_ctime and st_mtime fields of the
42597 + * parent directory. Also, if the file's link count is
42598 + * not 0, the st_ctime field of the file shall be
42599 + * marked for update. --SUS
42600 + */
42601 + reiser4_update_dir(parent);
42602 + /* add safe-link for this file */
42603 + if (object->i_nlink == 0)
42604 + safe_link_add(object, SAFE_UNLINK);
42605 + }
42606 + }
42607 +
42608 + if (unlikely(result != 0)) {
42609 + if (result != -ENOMEM)
42610 + warning("nikita-3398", "Cannot unlink %llu (%i)",
42611 + (unsigned long long)get_inode_oid(object),
42612 + result);
42613 + /* if operation failed commit pending inode modifications to
42614 + * the stat-data */
42615 + reiser4_update_sd(object);
42616 + reiser4_update_sd(parent);
42617 + }
42618 +
42619 + reiser4_release_reserved(object->i_sb);
42620 +
42621 + /* @object's i_ctime was updated by ->rem_link() method(). */
42622 +
42623 + /* @victim can be already removed from the disk by this time. Inode is
42624 + then marked so that iput() wouldn't try to remove stat data. But
42625 + inode itself is still there.
42626 + */
42627 +
42628 + /*
42629 + * we cannot release directory semaphore here, because name has
42630 + * already been deleted, but dentry (@victim) still exists. Prevent
42631 + * balance_dirty_pages() from being called on exiting this context: we
42632 + * don't want to do this under directory i_mutex.
42633 + */
42634 + context_set_commit_async(ctx);
42635 + reiser4_exit_context(ctx);
42636 + return result;
42637 +}
42638 +
42639 +/**
42640 + * reiser4_symlink_common - symlink of inode operations
42641 + * @parent: inode of parent directory
42642 + * @dentry: dentry of object to be created
42643 + * @linkname: string symlink is to contain
42644 + *
42645 + * This is common implementation of vfs's symlink method of struct
42646 + * inode_operations.
42647 + * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
42648 + */
42649 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
42650 + const char *linkname)
42651 +{
42652 + reiser4_object_create_data data;
42653 +
42654 + memset(&data, 0, sizeof data);
42655 + data.name = linkname;
42656 + data.id = SYMLINK_FILE_PLUGIN_ID;
42657 + data.mode = S_IFLNK | S_IRWXUGO;
42658 + return create_vfs_object(parent, dentry, &data);
42659 +}
42660 +
42661 +/**
42662 + * reiser4_mkdir_common - mkdir of inode operations
42663 + * @parent: inode of parent directory
42664 + * @dentry: dentry of object to be created
42665 + * @mode: the permissions to use
42666 + *
42667 + * This is common implementation of vfs's mkdir method of struct
42668 + * inode_operations.
42669 + * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
42670 + */
42671 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
42672 +{
42673 + reiser4_object_create_data data;
42674 +
42675 + memset(&data, 0, sizeof data);
42676 + data.mode = S_IFDIR | mode;
42677 + data.id = DIRECTORY_FILE_PLUGIN_ID;
42678 + return create_vfs_object(parent, dentry, &data);
42679 +}
42680 +
42681 +/**
42682 + * reiser4_mknod_common - mknod of inode operations
42683 + * @parent: inode of parent directory
42684 + * @dentry: dentry of object to be created
42685 + * @mode: the permissions to use and file type
42686 + * @rdev: minor and major of new device file
42687 + *
42688 + * This is common implementation of vfs's mknod method of struct
42689 + * inode_operations.
42690 + * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
42691 + */
42692 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
42693 + int mode, dev_t rdev)
42694 +{
42695 + reiser4_object_create_data data;
42696 +
42697 + memset(&data, 0, sizeof data);
42698 + data.mode = mode;
42699 + data.rdev = rdev;
42700 + data.id = SPECIAL_FILE_PLUGIN_ID;
42701 + return create_vfs_object(parent, dentry, &data);
42702 +}
42703 +
42704 +/*
42705 + * implementation of vfs's rename method of struct inode_operations for typical
42706 + * directory is in inode_ops_rename.c
42707 + */
42708 +
42709 +/**
42710 + * reiser4_follow_link_common - follow_link of inode operations
42711 + * @dentry: dentry of symlink
42712 + * @data:
42713 + *
42714 + * This is common implementation of vfs's followlink method of struct
42715 + * inode_operations.
42716 + * Assumes that inode's i_private points to the content of symbolic link.
42717 + */
42718 +void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
42719 +{
42720 + assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
42721 +
42722 + if (!dentry->d_inode->i_private
42723 + || !reiser4_inode_get_flag(dentry->d_inode,
42724 + REISER4_GENERIC_PTR_USED))
42725 + return ERR_PTR(RETERR(-EINVAL));
42726 + nd_set_link(nd, dentry->d_inode->i_private);
42727 + return NULL;
42728 +}
42729 +
42730 +/**
42731 + * reiser4_permission_common - permission of inode operations
42732 + * @inode: inode to check permissions for
42733 + * @mask: mode bits to check permissions for
42734 + * @nameidata:
42735 + *
42736 + * Uses generic function to check for rwx permissions.
42737 + */
42738 +int reiser4_permission_common(struct inode *inode, int mask,
42739 + struct nameidata *nameidata)
42740 +{
42741 + return generic_permission(inode, mask, NULL);
42742 +}
42743 +
42744 +static int setattr_reserve(reiser4_tree *);
42745 +
42746 +/* this is common implementation of vfs's setattr method of struct
42747 + inode_operations
42748 +*/
42749 +int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
42750 +{
42751 + reiser4_context *ctx;
42752 + struct inode *inode;
42753 + int result;
42754 +
42755 + inode = dentry->d_inode;
42756 + result = inode_change_ok(inode, attr);
42757 + if (result)
42758 + return result;
42759 +
42760 + ctx = reiser4_init_context(inode->i_sb);
42761 + if (IS_ERR(ctx))
42762 + return PTR_ERR(ctx);
42763 +
42764 + assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
42765 +
42766 + /*
42767 + * grab disk space and call standard inode_setattr().
42768 + */
42769 + result = setattr_reserve(reiser4_tree_by_inode(inode));
42770 + if (!result) {
42771 + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
42772 + || (attr->ia_valid & ATTR_GID
42773 + && attr->ia_gid != inode->i_gid)) {
42774 + result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
42775 + if (result) {
42776 + context_set_commit_async(ctx);
42777 + reiser4_exit_context(ctx);
42778 + return result;
42779 + }
42780 + }
42781 + result = inode_setattr(inode, attr);
42782 + if (!result)
42783 + reiser4_update_sd(inode);
42784 + }
42785 +
42786 + context_set_commit_async(ctx);
42787 + reiser4_exit_context(ctx);
42788 + return result;
42789 +}
42790 +
42791 +/* this is common implementation of vfs's getattr method of struct
42792 + inode_operations
42793 +*/
42794 +int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
42795 + struct dentry *dentry, struct kstat *stat)
42796 +{
42797 + struct inode *obj;
42798 +
42799 + assert("nikita-2298", dentry != NULL);
42800 + assert("nikita-2299", stat != NULL);
42801 + assert("nikita-2300", dentry->d_inode != NULL);
42802 +
42803 + obj = dentry->d_inode;
42804 +
42805 + stat->dev = obj->i_sb->s_dev;
42806 + stat->ino = oid_to_uino(get_inode_oid(obj));
42807 + stat->mode = obj->i_mode;
42808 + /* don't confuse userland with huge nlink. This is not entirely
42809 + * correct, because nlink_t is not necessary 16 bit signed. */
42810 + stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
42811 + stat->uid = obj->i_uid;
42812 + stat->gid = obj->i_gid;
42813 + stat->rdev = obj->i_rdev;
42814 + stat->atime = obj->i_atime;
42815 + stat->mtime = obj->i_mtime;
42816 + stat->ctime = obj->i_ctime;
42817 + stat->size = obj->i_size;
42818 + stat->blocks =
42819 + (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
42820 + /* "preferred" blocksize for efficient file system I/O */
42821 + stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
42822 +
42823 + return 0;
42824 +}
42825 +
42826 +/* Estimate the maximum amount of nodes which might be allocated or changed on
42827 + typical new object creation. Typical creation consists of calling create
42828 + method of file plugin, adding directory entry to parent and update parent
42829 + directory's stat data.
42830 +*/
42831 +static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
42832 + struct inode *object
42833 + /* object */ )
42834 +{
42835 + assert("vpf-309", parent != NULL);
42836 + assert("vpf-307", object != NULL);
42837 +
42838 + return
42839 + /* object creation estimation */
42840 + inode_file_plugin(object)->estimate.create(object) +
42841 + /* stat data of parent directory estimation */
42842 + inode_file_plugin(parent)->estimate.update(parent) +
42843 + /* adding entry estimation */
42844 + inode_dir_plugin(parent)->estimate.add_entry(parent) +
42845 + /* to undo in the case of failure */
42846 + inode_dir_plugin(parent)->estimate.rem_entry(parent);
42847 +}
42848 +
42849 +/* Create child in directory.
42850 +
42851 + . get object's plugin
42852 + . get fresh inode
42853 + . initialize inode
42854 + . add object's stat-data
42855 + . initialize object's directory
42856 + . add entry to the parent
42857 + . instantiate dentry
42858 +
42859 +*/
42860 +static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
42861 + object */
42862 + struct inode **retobj)
42863 +{
42864 + int result;
42865 +
42866 + struct dentry *dentry; /* parent object */
42867 + struct inode *parent; /* new name */
42868 +
42869 + dir_plugin *par_dir; /* directory plugin on the parent */
42870 + dir_plugin *obj_dir; /* directory plugin on the new object */
42871 + file_plugin *obj_plug; /* object plugin on the new object */
42872 + struct inode *object; /* new object */
42873 + reiser4_block_nr reserve;
42874 +
42875 + reiser4_dir_entry_desc entry; /* new directory entry */
42876 +
42877 + assert("nikita-1420", data != NULL);
42878 + parent = data->parent;
42879 + dentry = data->dentry;
42880 +
42881 + assert("nikita-1418", parent != NULL);
42882 + assert("nikita-1419", dentry != NULL);
42883 +
42884 + /* check, that name is acceptable for parent */
42885 + par_dir = inode_dir_plugin(parent);
42886 + if (par_dir->is_name_acceptable &&
42887 + !par_dir->is_name_acceptable(parent,
42888 + dentry->d_name.name,
42889 + (int)dentry->d_name.len))
42890 + return RETERR(-ENAMETOOLONG);
42891 +
42892 + result = 0;
42893 + obj_plug = file_plugin_by_id((int)data->id);
42894 + if (obj_plug == NULL) {
42895 + warning("nikita-430", "Cannot find plugin %i", data->id);
42896 + return RETERR(-ENOENT);
42897 + }
42898 + object = new_inode(parent->i_sb);
42899 + if (object == NULL)
42900 + return RETERR(-ENOMEM);
42901 + /* we'll update i_nlink below */
42902 + object->i_nlink = 0;
42903 + /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
42904 + * to simplify error handling: if some error occurs before i_ino is
42905 + * initialized with oid, i_ino should already be set to some
42906 + * distinguished value. */
42907 + object->i_ino = 0;
42908 +
42909 + /* So that on error iput will be called. */
42910 + *retobj = object;
42911 +
42912 + if (DQUOT_ALLOC_INODE(object)) {
42913 + DQUOT_DROP(object);
42914 + object->i_flags |= S_NOQUOTA;
42915 + return RETERR(-EDQUOT);
42916 + }
42917 +
42918 + memset(&entry, 0, sizeof entry);
42919 + entry.obj = object;
42920 +
42921 + set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
42922 + file_plugin_to_plugin(obj_plug));
42923 + result = obj_plug->set_plug_in_inode(object, parent, data);
42924 + if (result) {
42925 + warning("nikita-431", "Cannot install plugin %i on %llx",
42926 + data->id, (unsigned long long)get_inode_oid(object));
42927 + DQUOT_FREE_INODE(object);
42928 + object->i_flags |= S_NOQUOTA;
42929 + return result;
42930 + }
42931 +
42932 + /* reget plugin after installation */
42933 + obj_plug = inode_file_plugin(object);
42934 +
42935 + if (obj_plug->create_object == NULL) {
42936 + DQUOT_FREE_INODE(object);
42937 + object->i_flags |= S_NOQUOTA;
42938 + return RETERR(-EPERM);
42939 + }
42940 +
42941 + /* if any of hash, tail, sd or permission plugins for newly created
42942 + object are not set yet set them here inheriting them from parent
42943 + directory
42944 + */
42945 + assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
42946 + result = obj_plug->adjust_to_parent(object,
42947 + parent,
42948 + object->i_sb->s_root->d_inode);
42949 + if (result == 0)
42950 + result = finish_pset(object);
42951 + if (result != 0) {
42952 + warning("nikita-432", "Cannot inherit from %llx to %llx",
42953 + (unsigned long long)get_inode_oid(parent),
42954 + (unsigned long long)get_inode_oid(object));
42955 + DQUOT_FREE_INODE(object);
42956 + object->i_flags |= S_NOQUOTA;
42957 + return result;
42958 + }
42959 +
42960 + /* setup inode and file-operations for this inode */
42961 + setup_inode_ops(object, data);
42962 +
42963 + /* call file plugin's method to initialize plugin specific part of
42964 + * inode */
42965 + if (obj_plug->init_inode_data)
42966 + obj_plug->init_inode_data(object, data, 1 /*create */ );
42967 +
42968 + /* obtain directory plugin (if any) for new object. */
42969 + obj_dir = inode_dir_plugin(object);
42970 + if (obj_dir != NULL && obj_dir->init == NULL) {
42971 + DQUOT_FREE_INODE(object);
42972 + object->i_flags |= S_NOQUOTA;
42973 + return RETERR(-EPERM);
42974 + }
42975 +
42976 + reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
42977 +
42978 + reserve = estimate_create_vfs_object(parent, object);
42979 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42980 + DQUOT_FREE_INODE(object);
42981 + object->i_flags |= S_NOQUOTA;
42982 + return RETERR(-ENOSPC);
42983 + }
42984 +
42985 + /* mark inode `immutable'. We disable changes to the file being
42986 + created until valid directory entry for it is inserted. Otherwise,
42987 + if file were expanded and insertion of directory entry fails, we
42988 + have to remove file, but we only alloted enough space in
42989 + transaction to remove _empty_ file. 3.x code used to remove stat
42990 + data in different transaction thus possibly leaking disk space on
42991 + crash. This all only matters if it's possible to access file
42992 + without name, for example, by inode number
42993 + */
42994 + reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
42995 +
42996 + /* create empty object, this includes allocation of new objectid. For
42997 + directories this implies creation of dot and dotdot */
42998 + assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
42999 +
43000 + /* mark inode as `loaded'. From this point onward
43001 + reiser4_delete_inode() will try to remove its stat-data. */
43002 + reiser4_inode_set_flag(object, REISER4_LOADED);
43003 +
43004 + result = obj_plug->create_object(object, parent, data);
43005 + if (result != 0) {
43006 + reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
43007 + if (result != -ENAMETOOLONG && result != -ENOMEM)
43008 + warning("nikita-2219",
43009 + "Failed to create sd for %llu",
43010 + (unsigned long long)get_inode_oid(object));
43011 + DQUOT_FREE_INODE(object);
43012 + object->i_flags |= S_NOQUOTA;
43013 + return result;
43014 + }
43015 +
43016 + if (obj_dir != NULL)
43017 + result = obj_dir->init(object, parent, data);
43018 + if (result == 0) {
43019 + assert("nikita-434", !reiser4_inode_get_flag(object,
43020 + REISER4_NO_SD));
43021 + /* insert inode into VFS hash table */
43022 + insert_inode_hash(object);
43023 + /* create entry */
43024 + result = par_dir->add_entry(parent, dentry, data, &entry);
43025 + if (result == 0) {
43026 + result = reiser4_add_nlink(object, parent, 0);
43027 + /* If O_CREAT is set and the file did not previously
43028 + exist, upon successful completion, open() shall
43029 + mark for update the st_atime, st_ctime, and
43030 + st_mtime fields of the file and the st_ctime and
43031 + st_mtime fields of the parent directory. --SUS
43032 + */
43033 + /* @object times are already updated by
43034 + reiser4_add_nlink() */
43035 + if (result == 0)
43036 + reiser4_update_dir(parent);
43037 + if (result != 0)
43038 + /* cleanup failure to add nlink */
43039 + par_dir->rem_entry(parent, dentry, &entry);
43040 + }
43041 + if (result != 0)
43042 + /* cleanup failure to add entry */
43043 + obj_plug->detach(object, parent);
43044 + } else if (result != -ENOMEM)
43045 + warning("nikita-2219", "Failed to initialize dir for %llu: %i",
43046 + (unsigned long long)get_inode_oid(object), result);
43047 +
43048 + /*
43049 + * update stat-data, committing all pending modifications to the inode
43050 + * fields.
43051 + */
43052 + reiser4_update_sd(object);
43053 + if (result != 0) {
43054 + DQUOT_FREE_INODE(object);
43055 + object->i_flags |= S_NOQUOTA;
43056 + /* if everything was ok (result == 0), parent stat-data is
43057 + * already updated above (update_parent_dir()) */
43058 + reiser4_update_sd(parent);
43059 + /* failure to create entry, remove object */
43060 + obj_plug->delete_object(object);
43061 + }
43062 +
43063 + /* file has name now, clear immutable flag */
43064 + reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
43065 +
43066 + /* on error, iput() will call ->delete_inode(). We should keep track
43067 + of the existence of stat-data for this inode and avoid attempt to
43068 + remove it in reiser4_delete_inode(). This is accomplished through
43069 + REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
43070 + */
43071 + return result;
43072 +}
43073 +
43074 +/* this is helper for common implementations of reiser4_mkdir, reiser4_create,
43075 + reiser4_mknod and reiser4_symlink
43076 +*/
43077 +static int
43078 +create_vfs_object(struct inode *parent,
43079 + struct dentry *dentry, reiser4_object_create_data * data)
43080 +{
43081 + reiser4_context *ctx;
43082 + int result;
43083 + struct inode *child;
43084 +
43085 + ctx = reiser4_init_context(parent->i_sb);
43086 + if (IS_ERR(ctx))
43087 + return PTR_ERR(ctx);
43088 + context_set_commit_async(ctx);
43089 +
43090 + data->parent = parent;
43091 + data->dentry = dentry;
43092 + child = NULL;
43093 + result = do_create_vfs_child(data, &child);
43094 + if (unlikely(result != 0)) {
43095 + if (child != NULL) {
43096 + reiser4_make_bad_inode(child);
43097 + iput(child);
43098 + }
43099 + } else
43100 + d_instantiate(dentry, child);
43101 +
43102 + reiser4_exit_context(ctx);
43103 + return result;
43104 +}
43105 +
43106 +/* helper for link_common. Estimate disk space necessary to add a link
43107 + from @parent to @object
43108 +*/
43109 +static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
43110 + struct inode *object
43111 + /* object to which new link is being cerated */
43112 + )
43113 +{
43114 + reiser4_block_nr res = 0;
43115 + file_plugin *fplug;
43116 + dir_plugin *dplug;
43117 +
43118 + assert("vpf-317", object != NULL);
43119 + assert("vpf-318", parent != NULL);
43120 +
43121 + fplug = inode_file_plugin(object);
43122 + dplug = inode_dir_plugin(parent);
43123 + /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
43124 + /* reiser4_add_nlink(object) */
43125 + res += fplug->estimate.update(object);
43126 + /* add_entry(parent) */
43127 + res += dplug->estimate.add_entry(parent);
43128 + /* reiser4_del_nlink(object) */
43129 + res += fplug->estimate.update(object);
43130 + /* update_dir(parent) */
43131 + res += inode_file_plugin(parent)->estimate.update(parent);
43132 + /* safe-link */
43133 + res += estimate_one_item_removal(reiser4_tree_by_inode(object));
43134 +
43135 + return res;
43136 +}
43137 +
43138 +/* Estimate disk space necessary to remove a link between @parent and
43139 + @object.
43140 +*/
43141 +static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
43142 + struct inode *object
43143 + /* object to which new link is being cerated */
43144 + )
43145 +{
43146 + reiser4_block_nr res = 0;
43147 + file_plugin *fplug;
43148 + dir_plugin *dplug;
43149 +
43150 + assert("vpf-317", object != NULL);
43151 + assert("vpf-318", parent != NULL);
43152 +
43153 + fplug = inode_file_plugin(object);
43154 + dplug = inode_dir_plugin(parent);
43155 +
43156 + /* rem_entry(parent) */
43157 + res += dplug->estimate.rem_entry(parent);
43158 + /* reiser4_del_nlink(object) */
43159 + res += fplug->estimate.update(object);
43160 + /* update_dir(parent) */
43161 + res += inode_file_plugin(parent)->estimate.update(parent);
43162 + /* fplug->unlink */
43163 + res += fplug->estimate.unlink(object, parent);
43164 + /* safe-link */
43165 + res += estimate_one_insert_item(reiser4_tree_by_inode(object));
43166 +
43167 + return res;
43168 +}
43169 +
43170 +/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
43171 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
43172 +{
43173 + file_plugin *fplug;
43174 + struct inode *child;
43175 + int result;
43176 +
43177 + result = 0;
43178 + child = victim->d_inode;
43179 + fplug = inode_file_plugin(child);
43180 +
43181 + /* check for race with create_object() */
43182 + if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
43183 + return RETERR(-E_REPEAT);
43184 + /* object being deleted should have stat data */
43185 + assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
43186 +
43187 + /* ask object plugin */
43188 + if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
43189 + return RETERR(-ENOTEMPTY);
43190 +
43191 + result = (int)estimate_unlink(parent, child);
43192 + if (result < 0)
43193 + return result;
43194 +
43195 + return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
43196 +}
43197 +
43198 +/* helper for reiser4_setattr_common */
43199 +static int setattr_reserve(reiser4_tree * tree)
43200 +{
43201 + assert("vs-1096", is_grab_enabled(get_current_context()));
43202 + return reiser4_grab_space(estimate_one_insert_into_item(tree),
43203 + BA_CAN_COMMIT);
43204 +}
43205 +
43206 +/* helper function. Standards require that for many file-system operations
43207 + on success ctime and mtime of parent directory is to be updated. */
43208 +int reiser4_update_dir(struct inode *dir)
43209 +{
43210 + assert("nikita-2525", dir != NULL);
43211 +
43212 + dir->i_ctime = dir->i_mtime = CURRENT_TIME;
43213 + return reiser4_update_sd(dir);
43214 +}
43215 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.20/fs/reiser4/plugin/inode_ops_rename.c
43216 --- linux-2.6.20.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 03:00:00.000000000 +0300
43217 +++ linux-2.6.20/fs/reiser4/plugin/inode_ops_rename.c 2007-05-06 14:50:43.795005721 +0400
43218 @@ -0,0 +1,914 @@
43219 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
43220 + * reiser4/README */
43221 +
43222 +#include "../inode.h"
43223 +#include "../safe_link.h"
43224 +
43225 +static const char *possible_leak = "Possible disk space leak.";
43226 +
43227 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
43228 +
43229 + Helper function called from hashed_rename() */
43230 +static int replace_name(struct inode *to_inode, /* inode where @from_coord is
43231 + * to be re-targeted at */
43232 + struct inode *from_dir, /* directory where @from_coord
43233 + * lives */
43234 + struct inode *from_inode, /* inode @from_coord
43235 + * originally point to */
43236 + coord_t * from_coord, /* where directory entry is in
43237 + * the tree */
43238 + lock_handle * from_lh /* lock handle on @from_coord */ )
43239 +{
43240 + item_plugin *from_item;
43241 + int result;
43242 + znode *node;
43243 +
43244 + coord_clear_iplug(from_coord);
43245 + node = from_coord->node;
43246 + result = zload(node);
43247 + if (result != 0)
43248 + return result;
43249 + from_item = item_plugin_by_coord(from_coord);
43250 + if (plugin_of_group(item_plugin_by_coord(from_coord),
43251 + DIR_ENTRY_ITEM_TYPE))
43252 + {
43253 + reiser4_key to_key;
43254 +
43255 + build_sd_key(to_inode, &to_key);
43256 +
43257 + /* everything is found and prepared to change directory entry
43258 + at @from_coord to point to @to_inode.
43259 +
43260 + @to_inode is just about to get new name, so bump its link
43261 + counter.
43262 +
43263 + */
43264 + result = reiser4_add_nlink(to_inode, from_dir, 0);
43265 + if (result != 0) {
43266 + /* Don't issue warning: this may be plain -EMLINK */
43267 + zrelse(node);
43268 + return result;
43269 + }
43270 +
43271 + result =
43272 + from_item->s.dir.update_key(from_coord, &to_key, from_lh);
43273 + if (result != 0) {
43274 + reiser4_del_nlink(to_inode, from_dir, 0);
43275 + zrelse(node);
43276 + return result;
43277 + }
43278 +
43279 + /* @from_inode just lost its name, he-he.
43280 +
43281 + If @from_inode was directory, it contained dotdot pointing
43282 + to @from_dir. @from_dir i_nlink will be decreased when
43283 + iput() will be called on @from_inode.
43284 +
43285 + If file-system is not ADG (hard-links are
43286 + supported on directories), iput(from_inode) will not remove
43287 + @from_inode, and thus above is incorrect, but hard-links on
43288 + directories are problematic in many other respects.
43289 + */
43290 + result = reiser4_del_nlink(from_inode, from_dir, 0);
43291 + if (result != 0) {
43292 + warning("nikita-2330",
43293 + "Cannot remove link from source: %i. %s",
43294 + result, possible_leak);
43295 + }
43296 + /* Has to return success, because entry is already
43297 + * modified. */
43298 + result = 0;
43299 +
43300 + /* NOTE-NIKITA consider calling plugin method in stead of
43301 + accessing inode fields directly. */
43302 + from_dir->i_mtime = CURRENT_TIME;
43303 + } else {
43304 + warning("nikita-2326", "Unexpected item type");
43305 + result = RETERR(-EIO);
43306 + }
43307 + zrelse(node);
43308 + return result;
43309 +}
43310 +
43311 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh
43312 +
43313 + Helper function used by hashed_rename(). */
43314 +static int add_name(struct inode *inode, /* inode where @coord is to be
43315 + * re-targeted at */
43316 + struct inode *dir, /* directory where @coord lives */
43317 + struct dentry *name, /* new name */
43318 + coord_t * coord, /* where directory entry is in the tree */
43319 + lock_handle * lh, /* lock handle on @coord */
43320 + int is_dir /* true, if @inode is directory */ )
43321 +{
43322 + int result;
43323 + reiser4_dir_entry_desc entry;
43324 +
43325 + assert("nikita-2333", lh->node == coord->node);
43326 + assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
43327 +
43328 + memset(&entry, 0, sizeof entry);
43329 + entry.obj = inode;
43330 + /* build key of directory entry description */
43331 + inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
43332 +
43333 + /* ext2 does this in different order: first inserts new entry,
43334 + then increases directory nlink. We don't want do this,
43335 + because reiser4_add_nlink() calls ->add_link() plugin
43336 + method that can fail for whatever reason, leaving as with
43337 + cleanup problems.
43338 + */
43339 + /* @inode is getting new name */
43340 + reiser4_add_nlink(inode, dir, 0);
43341 + /* create @new_name in @new_dir pointing to
43342 + @old_inode */
43343 + result = WITH_COORD(coord,
43344 + inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
43345 + coord,
43346 + lh,
43347 + name,
43348 + &entry));
43349 + if (result != 0) {
43350 + int result2;
43351 + result2 = reiser4_del_nlink(inode, dir, 0);
43352 + if (result2 != 0) {
43353 + warning("nikita-2327",
43354 + "Cannot drop link on %lli %i. %s",
43355 + (unsigned long long)get_inode_oid(inode),
43356 + result2, possible_leak);
43357 + }
43358 + } else
43359 + INODE_INC_FIELD(dir, i_size);
43360 + return result;
43361 +}
43362 +
43363 +static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
43364 + struct dentry *old_name, /* old name */
43365 + struct inode *new_dir, /* directory where @new is located */
43366 + struct dentry *new_name /* new name */ )
43367 +{
43368 + reiser4_block_nr res1, res2;
43369 + dir_plugin *p_parent_old, *p_parent_new;
43370 + file_plugin *p_child_old, *p_child_new;
43371 +
43372 + assert("vpf-311", old_dir != NULL);
43373 + assert("vpf-312", new_dir != NULL);
43374 + assert("vpf-313", old_name != NULL);
43375 + assert("vpf-314", new_name != NULL);
43376 +
43377 + p_parent_old = inode_dir_plugin(old_dir);
43378 + p_parent_new = inode_dir_plugin(new_dir);
43379 + p_child_old = inode_file_plugin(old_name->d_inode);
43380 + if (new_name->d_inode)
43381 + p_child_new = inode_file_plugin(new_name->d_inode);
43382 + else
43383 + p_child_new = NULL;
43384 +
43385 + /* find_entry - can insert one leaf. */
43386 + res1 = res2 = 1;
43387 +
43388 + /* replace_name */
43389 + {
43390 + /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
43391 + res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
43392 + /* update key */
43393 + res1 += 1;
43394 + /* reiser4_del_nlink(p_child_new) */
43395 + if (p_child_new)
43396 + res1 += p_child_new->estimate.update(new_name->d_inode);
43397 + }
43398 +
43399 + /* else add_name */
43400 + {
43401 + /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
43402 + res2 +=
43403 + 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
43404 + /* reiser4_add_nlink(p_parent_old) */
43405 + res2 += p_child_old->estimate.update(old_name->d_inode);
43406 + /* add_entry(p_parent_new) */
43407 + res2 += p_parent_new->estimate.add_entry(new_dir);
43408 + /* reiser4_del_nlink(p_parent_old) */
43409 + res2 += p_child_old->estimate.update(old_name->d_inode);
43410 + }
43411 +
43412 + res1 = res1 < res2 ? res2 : res1;
43413 +
43414 + /* reiser4_write_sd(p_parent_new) */
43415 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43416 +
43417 + /* reiser4_write_sd(p_child_new) */
43418 + if (p_child_new)
43419 + res1 += p_child_new->estimate.update(new_name->d_inode);
43420 +
43421 + /* hashed_rem_entry(p_parent_old) */
43422 + res1 += p_parent_old->estimate.rem_entry(old_dir);
43423 +
43424 + /* reiser4_del_nlink(p_child_old) */
43425 + res1 += p_child_old->estimate.update(old_name->d_inode);
43426 +
43427 + /* replace_name */
43428 + {
43429 + /* reiser4_add_nlink(p_parent_dir_new) */
43430 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43431 + /* update_key */
43432 + res1 += 1;
43433 + /* reiser4_del_nlink(p_parent_new) */
43434 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43435 + /* reiser4_del_nlink(p_parent_old) */
43436 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43437 + }
43438 +
43439 + /* reiser4_write_sd(p_parent_old) */
43440 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43441 +
43442 + /* reiser4_write_sd(p_child_old) */
43443 + res1 += p_child_old->estimate.update(old_name->d_inode);
43444 +
43445 + return res1;
43446 +}
43447 +
43448 +static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
43449 + struct dentry *old_name, /* old name */
43450 + struct inode *new_dir, /* directory where @new is located */
43451 + struct dentry *new_name
43452 + /* new name */ )
43453 +{
43454 + reiser4_block_nr reserve;
43455 +
43456 + reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
43457 +
43458 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
43459 + return RETERR(-ENOSPC);
43460 +
43461 + return 0;
43462 +}
43463 +
43464 +/* check whether @old_inode and @new_inode can be moved within file system
43465 + * tree. This singles out attempts to rename pseudo-files, for example. */
43466 +static int can_rename(struct inode *old_dir, struct inode *old_inode,
43467 + struct inode *new_dir, struct inode *new_inode)
43468 +{
43469 + file_plugin *fplug;
43470 + dir_plugin *dplug;
43471 +
43472 + assert("nikita-3370", old_inode != NULL);
43473 +
43474 + dplug = inode_dir_plugin(new_dir);
43475 + fplug = inode_file_plugin(old_inode);
43476 +
43477 + if (dplug == NULL)
43478 + return RETERR(-ENOTDIR);
43479 + else if (new_dir->i_op->create == NULL)
43480 + return RETERR(-EPERM);
43481 + else if (!fplug->can_add_link(old_inode))
43482 + return RETERR(-EMLINK);
43483 + else if (new_inode != NULL) {
43484 + fplug = inode_file_plugin(new_inode);
43485 + if (fplug->can_rem_link != NULL &&
43486 + !fplug->can_rem_link(new_inode))
43487 + return RETERR(-EBUSY);
43488 + }
43489 + return 0;
43490 +}
43491 +
43492 +int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *,
43493 + znode_lock_mode, reiser4_dir_entry_desc *);
43494 +int reiser4_update_dir(struct inode *);
43495 +
43496 +/* this is common implementation of vfs's rename method of struct
43497 + inode_operations
43498 + See comments in the body.
43499 +
43500 + It is arguable that this function can be made generic so, that it
43501 + will be applicable to any kind of directory plugin that deals with
43502 + directories composed out of directory entries. The only obstacle
43503 + here is that we don't have any data-type to represent directory
43504 + entry. This should be re-considered when more than one different
43505 + directory plugin will be implemented.
43506 +*/
43507 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
43508 + * is located */ ,
43509 + struct dentry *old_name /* old name */ ,
43510 + struct inode *new_dir /* directory where @new
43511 + * is located */ ,
43512 + struct dentry *new_name /* new name */ )
43513 +{
43514 + /* From `The Open Group Base Specifications Issue 6'
43515 +
43516 + If either the old or new argument names a symbolic link, rename()
43517 + shall operate on the symbolic link itself, and shall not resolve
43518 + the last component of the argument. If the old argument and the new
43519 + argument resolve to the same existing file, rename() shall return
43520 + successfully and perform no other action.
43521 +
43522 + [this is done by VFS: vfs_rename()]
43523 +
43524 + If the old argument points to the pathname of a file that is not a
43525 + directory, the new argument shall not point to the pathname of a
43526 + directory.
43527 +
43528 + [checked by VFS: vfs_rename->may_delete()]
43529 +
43530 + If the link named by the new argument exists, it shall
43531 + be removed and old renamed to new. In this case, a link named new
43532 + shall remain visible to other processes throughout the renaming
43533 + operation and refer either to the file referred to by new or old
43534 + before the operation began.
43535 +
43536 + [we should assure this]
43537 +
43538 + Write access permission is required for
43539 + both the directory containing old and the directory containing new.
43540 +
43541 + [checked by VFS: vfs_rename->may_delete(), may_create()]
43542 +
43543 + If the old argument points to the pathname of a directory, the new
43544 + argument shall not point to the pathname of a file that is not a
43545 + directory.
43546 +
43547 + [checked by VFS: vfs_rename->may_delete()]
43548 +
43549 + If the directory named by the new argument exists, it
43550 + shall be removed and old renamed to new. In this case, a link named
43551 + new shall exist throughout the renaming operation and shall refer
43552 + either to the directory referred to by new or old before the
43553 + operation began.
43554 +
43555 + [we should assure this]
43556 +
43557 + If new names an existing directory, it shall be
43558 + required to be an empty directory.
43559 +
43560 + [we should check this]
43561 +
43562 + If the old argument points to a pathname of a symbolic link, the
43563 + symbolic link shall be renamed. If the new argument points to a
43564 + pathname of a symbolic link, the symbolic link shall be removed.
43565 +
43566 + The new pathname shall not contain a path prefix that names
43567 + old. Write access permission is required for the directory
43568 + containing old and the directory containing new. If the old
43569 + argument points to the pathname of a directory, write access
43570 + permission may be required for the directory named by old, and, if
43571 + it exists, the directory named by new.
43572 +
43573 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
43574 +
43575 + If the link named by the new argument exists and the file's link
43576 + count becomes 0 when it is removed and no process has the file
43577 + open, the space occupied by the file shall be freed and the file
43578 + shall no longer be accessible. If one or more processes have the
43579 + file open when the last link is removed, the link shall be removed
43580 + before rename() returns, but the removal of the file contents shall
43581 + be postponed until all references to the file are closed.
43582 +
43583 + [iput() handles this, but we can do this manually, a la
43584 + reiser4_unlink()]
43585 +
43586 + Upon successful completion, rename() shall mark for update the
43587 + st_ctime and st_mtime fields of the parent directory of each file.
43588 +
43589 + [N/A]
43590 +
43591 + */
43592 + reiser4_context *ctx;
43593 + int result;
43594 + int is_dir; /* is @old_name directory */
43595 +
43596 + struct inode *old_inode;
43597 + struct inode *new_inode;
43598 + coord_t *new_coord;
43599 +
43600 + reiser4_dentry_fsdata *new_fsdata;
43601 + dir_plugin *dplug;
43602 + file_plugin *fplug;
43603 +
43604 + reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
43605 + lock_handle *new_lh, *dotdot_lh;
43606 + struct dentry *dotdot_name;
43607 + reiser4_dentry_fsdata *dataonstack;
43608 +
43609 + ctx = reiser4_init_context(old_dir->i_sb);
43610 + if (IS_ERR(ctx))
43611 + return PTR_ERR(ctx);
43612 +
43613 + old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43614 + sizeof(*dotdot_name) + sizeof(*dataonstack),
43615 + reiser4_ctx_gfp_mask_get());
43616 + if (old_entry == NULL) {
43617 + context_set_commit_async(ctx);
43618 + reiser4_exit_context(ctx);
43619 + return RETERR(-ENOMEM);
43620 + }
43621 + memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43622 + sizeof(*dotdot_name) + sizeof(*dataonstack));
43623 +
43624 + new_entry = old_entry + 1;
43625 + dotdot_entry = old_entry + 2;
43626 + new_lh = (lock_handle *)(old_entry + 3);
43627 + dotdot_lh = new_lh + 1;
43628 + dotdot_name = (struct dentry *)(new_lh + 2);
43629 + dataonstack = (reiser4_dentry_fsdata *)(dotdot_name + 1);
43630 +
43631 + assert("nikita-2318", old_dir != NULL);
43632 + assert("nikita-2319", new_dir != NULL);
43633 + assert("nikita-2320", old_name != NULL);
43634 + assert("nikita-2321", new_name != NULL);
43635 +
43636 + old_inode = old_name->d_inode;
43637 + new_inode = new_name->d_inode;
43638 +
43639 + dplug = inode_dir_plugin(old_dir);
43640 + fplug = NULL;
43641 +
43642 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
43643 + if (IS_ERR(new_fsdata)) {
43644 + kfree(old_entry);
43645 + context_set_commit_async(ctx);
43646 + reiser4_exit_context(ctx);
43647 + return PTR_ERR(new_fsdata);
43648 + }
43649 +
43650 + new_coord = &new_fsdata->dec.entry_coord;
43651 + coord_clear_iplug(new_coord);
43652 +
43653 + is_dir = S_ISDIR(old_inode->i_mode);
43654 +
43655 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43656 +
43657 + /* if target is existing directory and it's not empty---return error.
43658 +
43659 + This check is done specifically, because is_dir_empty() requires
43660 + tree traversal and have to be done before locks are taken.
43661 + */
43662 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
43663 + kfree(old_entry);
43664 + context_set_commit_async(ctx);
43665 + reiser4_exit_context(ctx);
43666 + return RETERR(-ENOTEMPTY);
43667 + }
43668 +
43669 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
43670 + if (result != 0) {
43671 + kfree(old_entry);
43672 + context_set_commit_async(ctx);
43673 + reiser4_exit_context(ctx);
43674 + return result;
43675 + }
43676 +
43677 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
43678 + new_dir, new_name);
43679 + if (result != 0) {
43680 + kfree(old_entry);
43681 + context_set_commit_async(ctx);
43682 + reiser4_exit_context(ctx);
43683 + return result;
43684 + }
43685 +
43686 + init_lh(new_lh);
43687 +
43688 + /* find entry for @new_name */
43689 + result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
43690 + new_entry);
43691 +
43692 + if (IS_CBKERR(result)) {
43693 + done_lh(new_lh);
43694 + kfree(old_entry);
43695 + context_set_commit_async(ctx);
43696 + reiser4_exit_context(ctx);
43697 + return result;
43698 + }
43699 +
43700 + reiser4_seal_done(&new_fsdata->dec.entry_seal);
43701 +
43702 + /* add or replace name for @old_inode as @new_name */
43703 + if (new_inode != NULL) {
43704 + /* target (@new_name) exists. */
43705 + /* Not clear what to do with objects that are
43706 + both directories and files at the same time. */
43707 + if (result == CBK_COORD_FOUND) {
43708 + result = replace_name(old_inode,
43709 + new_dir,
43710 + new_inode, new_coord, new_lh);
43711 + if (result == 0)
43712 + fplug = inode_file_plugin(new_inode);
43713 + } else if (result == CBK_COORD_NOTFOUND) {
43714 + /* VFS told us that @new_name is bound to existing
43715 + inode, but we failed to find directory entry. */
43716 + warning("nikita-2324", "Target not found");
43717 + result = RETERR(-ENOENT);
43718 + }
43719 + } else {
43720 + /* target (@new_name) doesn't exists. */
43721 + if (result == CBK_COORD_NOTFOUND)
43722 + result = add_name(old_inode,
43723 + new_dir,
43724 + new_name, new_coord, new_lh, is_dir);
43725 + else if (result == CBK_COORD_FOUND) {
43726 + /* VFS told us that @new_name is "negative" dentry,
43727 + but we found directory entry. */
43728 + warning("nikita-2331", "Target found unexpectedly");
43729 + result = RETERR(-EIO);
43730 + }
43731 + }
43732 +
43733 + assert("nikita-3462", ergo(result == 0,
43734 + old_inode->i_nlink >= 2 + !!is_dir));
43735 +
43736 + /* We are done with all modifications to the @new_dir, release lock on
43737 + node. */
43738 + done_lh(new_lh);
43739 +
43740 + if (fplug != NULL) {
43741 + /* detach @new_inode from name-space */
43742 + result = fplug->detach(new_inode, new_dir);
43743 + if (result != 0)
43744 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
43745 + (unsigned long long)get_inode_oid(new_inode),
43746 + result, possible_leak);
43747 + }
43748 +
43749 + if (new_inode != NULL)
43750 + reiser4_update_sd(new_inode);
43751 +
43752 + if (result == 0) {
43753 + old_entry->obj = old_inode;
43754 +
43755 + dplug->build_entry_key(old_dir,
43756 + &old_name->d_name, &old_entry->key);
43757 +
43758 + /* At this stage new name was introduced for
43759 + @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
43760 + counters were updated.
43761 +
43762 + We want to remove @old_name now. If @old_inode wasn't
43763 + directory this is simple.
43764 + */
43765 + result = dplug->rem_entry(old_dir, old_name, old_entry);
43766 + if (result != 0 && result != -ENOMEM) {
43767 + warning("nikita-2335",
43768 + "Cannot remove old name: %i", result);
43769 + } else {
43770 + result = reiser4_del_nlink(old_inode, old_dir, 0);
43771 + if (result != 0 && result != -ENOMEM) {
43772 + warning("nikita-2337",
43773 + "Cannot drop link on old: %i", result);
43774 + }
43775 + }
43776 +
43777 + if (result == 0 && is_dir) {
43778 + /* @old_inode is directory. We also have to update
43779 + dotdot entry. */
43780 + coord_t *dotdot_coord;
43781 +
43782 + memset(dataonstack, 0, sizeof dataonstack);
43783 + memset(dotdot_entry, 0, sizeof dotdot_entry);
43784 + dotdot_entry->obj = old_dir;
43785 + memset(dotdot_name, 0, sizeof dotdot_name);
43786 + dotdot_name->d_name.name = "..";
43787 + dotdot_name->d_name.len = 2;
43788 + /*
43789 + * allocate ->d_fsdata on the stack to avoid using
43790 + * reiser4_get_dentry_fsdata(). Locking is not needed,
43791 + * because dentry is private to the current thread.
43792 + */
43793 + dotdot_name->d_fsdata = dataonstack;
43794 + init_lh(dotdot_lh);
43795 +
43796 + dotdot_coord = &dataonstack->dec.entry_coord;
43797 + coord_clear_iplug(dotdot_coord);
43798 +
43799 + result = reiser4_find_entry(old_inode, dotdot_name,
43800 + dotdot_lh, ZNODE_WRITE_LOCK,
43801 + dotdot_entry);
43802 + if (result == 0) {
43803 + /* replace_name() decreases i_nlink on
43804 + * @old_dir */
43805 + result = replace_name(new_dir,
43806 + old_inode,
43807 + old_dir,
43808 + dotdot_coord, dotdot_lh);
43809 + } else
43810 + result = RETERR(-EIO);
43811 + done_lh(dotdot_lh);
43812 + }
43813 + }
43814 + reiser4_update_dir(new_dir);
43815 + reiser4_update_dir(old_dir);
43816 + reiser4_update_sd(old_inode);
43817 + if (result == 0) {
43818 + file_plugin *fplug;
43819 +
43820 + if (new_inode != NULL) {
43821 + /* add safe-link for target file (in case we removed
43822 + * last reference to the poor fellow */
43823 + fplug = inode_file_plugin(new_inode);
43824 + if (new_inode->i_nlink == 0)
43825 + result = safe_link_add(new_inode, SAFE_UNLINK);
43826 + }
43827 + }
43828 + kfree(old_entry);
43829 + context_set_commit_async(ctx);
43830 + reiser4_exit_context(ctx);
43831 + return result;
43832 +}
43833 +
43834 +#if 0
43835 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
43836 + * is located */ ,
43837 + struct dentry *old_name /* old name */ ,
43838 + struct inode *new_dir /* directory where @new
43839 + * is located */ ,
43840 + struct dentry *new_name /* new name */ )
43841 +{
43842 + /* From `The Open Group Base Specifications Issue 6'
43843 +
43844 + If either the old or new argument names a symbolic link, rename()
43845 + shall operate on the symbolic link itself, and shall not resolve
43846 + the last component of the argument. If the old argument and the new
43847 + argument resolve to the same existing file, rename() shall return
43848 + successfully and perform no other action.
43849 +
43850 + [this is done by VFS: vfs_rename()]
43851 +
43852 + If the old argument points to the pathname of a file that is not a
43853 + directory, the new argument shall not point to the pathname of a
43854 + directory.
43855 +
43856 + [checked by VFS: vfs_rename->may_delete()]
43857 +
43858 + If the link named by the new argument exists, it shall
43859 + be removed and old renamed to new. In this case, a link named new
43860 + shall remain visible to other processes throughout the renaming
43861 + operation and refer either to the file referred to by new or old
43862 + before the operation began.
43863 +
43864 + [we should assure this]
43865 +
43866 + Write access permission is required for
43867 + both the directory containing old and the directory containing new.
43868 +
43869 + [checked by VFS: vfs_rename->may_delete(), may_create()]
43870 +
43871 + If the old argument points to the pathname of a directory, the new
43872 + argument shall not point to the pathname of a file that is not a
43873 + directory.
43874 +
43875 + [checked by VFS: vfs_rename->may_delete()]
43876 +
43877 + If the directory named by the new argument exists, it
43878 + shall be removed and old renamed to new. In this case, a link named
43879 + new shall exist throughout the renaming operation and shall refer
43880 + either to the directory referred to by new or old before the
43881 + operation began.
43882 +
43883 + [we should assure this]
43884 +
43885 + If new names an existing directory, it shall be
43886 + required to be an empty directory.
43887 +
43888 + [we should check this]
43889 +
43890 + If the old argument points to a pathname of a symbolic link, the
43891 + symbolic link shall be renamed. If the new argument points to a
43892 + pathname of a symbolic link, the symbolic link shall be removed.
43893 +
43894 + The new pathname shall not contain a path prefix that names
43895 + old. Write access permission is required for the directory
43896 + containing old and the directory containing new. If the old
43897 + argument points to the pathname of a directory, write access
43898 + permission may be required for the directory named by old, and, if
43899 + it exists, the directory named by new.
43900 +
43901 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
43902 +
43903 + If the link named by the new argument exists and the file's link
43904 + count becomes 0 when it is removed and no process has the file
43905 + open, the space occupied by the file shall be freed and the file
43906 + shall no longer be accessible. If one or more processes have the
43907 + file open when the last link is removed, the link shall be removed
43908 + before rename() returns, but the removal of the file contents shall
43909 + be postponed until all references to the file are closed.
43910 +
43911 + [iput() handles this, but we can do this manually, a la
43912 + reiser4_unlink()]
43913 +
43914 + Upon successful completion, rename() shall mark for update the
43915 + st_ctime and st_mtime fields of the parent directory of each file.
43916 +
43917 + [N/A]
43918 +
43919 + */
43920 + reiser4_context *ctx;
43921 + int result;
43922 + int is_dir; /* is @old_name directory */
43923 + struct inode *old_inode;
43924 + struct inode *new_inode;
43925 + reiser4_dir_entry_desc old_entry;
43926 + reiser4_dir_entry_desc new_entry;
43927 + coord_t *new_coord;
43928 + reiser4_dentry_fsdata *new_fsdata;
43929 + lock_handle new_lh;
43930 + dir_plugin *dplug;
43931 + file_plugin *fplug;
43932 +
43933 + ctx = reiser4_init_context(old_dir->i_sb);
43934 + if (IS_ERR(ctx))
43935 + return PTR_ERR(ctx);
43936 +
43937 + assert("nikita-2318", old_dir != NULL);
43938 + assert("nikita-2319", new_dir != NULL);
43939 + assert("nikita-2320", old_name != NULL);
43940 + assert("nikita-2321", new_name != NULL);
43941 +
43942 + old_inode = old_name->d_inode;
43943 + new_inode = new_name->d_inode;
43944 +
43945 + dplug = inode_dir_plugin(old_dir);
43946 + fplug = NULL;
43947 +
43948 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
43949 + if (IS_ERR(new_fsdata)) {
43950 + result = PTR_ERR(new_fsdata);
43951 + goto exit;
43952 + }
43953 +
43954 + new_coord = &new_fsdata->dec.entry_coord;
43955 + coord_clear_iplug(new_coord);
43956 +
43957 + is_dir = S_ISDIR(old_inode->i_mode);
43958 +
43959 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43960 +
43961 + /* if target is existing directory and it's not empty---return error.
43962 +
43963 + This check is done specifically, because is_dir_empty() requires
43964 + tree traversal and have to be done before locks are taken.
43965 + */
43966 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
43967 + return RETERR(-ENOTEMPTY);
43968 +
43969 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
43970 + if (result != 0)
43971 + goto exit;
43972 +
43973 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
43974 + new_dir, new_name);
43975 + if (result != 0)
43976 + goto exit;
43977 +
43978 + init_lh(&new_lh);
43979 +
43980 + /* find entry for @new_name */
43981 + result = reiser4_find_entry(new_dir, new_name, &new_lh,
43982 + ZNODE_WRITE_LOCK, &new_entry);
43983 +
43984 + if (IS_CBKERR(result)) {
43985 + done_lh(&new_lh);
43986 + goto exit;
43987 + }
43988 +
43989 + reiser4_seal_done(&new_fsdata->dec.entry_seal);
43990 +
43991 + /* add or replace name for @old_inode as @new_name */
43992 + if (new_inode != NULL) {
43993 + /* target (@new_name) exists. */
43994 + /* Not clear what to do with objects that are
43995 + both directories and files at the same time. */
43996 + if (result == CBK_COORD_FOUND) {
43997 + result = replace_name(old_inode,
43998 + new_dir,
43999 + new_inode, new_coord, &new_lh);
44000 + if (result == 0)
44001 + fplug = inode_file_plugin(new_inode);
44002 + } else if (result == CBK_COORD_NOTFOUND) {
44003 + /* VFS told us that @new_name is bound to existing
44004 + inode, but we failed to find directory entry. */
44005 + warning("nikita-2324", "Target not found");
44006 + result = RETERR(-ENOENT);
44007 + }
44008 + } else {
44009 + /* target (@new_name) doesn't exists. */
44010 + if (result == CBK_COORD_NOTFOUND)
44011 + result = add_name(old_inode,
44012 + new_dir,
44013 + new_name, new_coord, &new_lh, is_dir);
44014 + else if (result == CBK_COORD_FOUND) {
44015 + /* VFS told us that @new_name is "negative" dentry,
44016 + but we found directory entry. */
44017 + warning("nikita-2331", "Target found unexpectedly");
44018 + result = RETERR(-EIO);
44019 + }
44020 + }
44021 +
44022 + assert("nikita-3462", ergo(result == 0,
44023 + old_inode->i_nlink >= 2 + !!is_dir));
44024 +
44025 + /* We are done with all modifications to the @new_dir, release lock on
44026 + node. */
44027 + done_lh(&new_lh);
44028 +
44029 + if (fplug != NULL) {
44030 + /* detach @new_inode from name-space */
44031 + result = fplug->detach(new_inode, new_dir);
44032 + if (result != 0)
44033 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
44034 + (unsigned long long)get_inode_oid(new_inode),
44035 + result, possible_leak);
44036 + }
44037 +
44038 + if (new_inode != NULL)
44039 + reiser4_update_sd(new_inode);
44040 +
44041 + if (result == 0) {
44042 + memset(&old_entry, 0, sizeof old_entry);
44043 + old_entry.obj = old_inode;
44044 +
44045 + dplug->build_entry_key(old_dir,
44046 + &old_name->d_name, &old_entry.key);
44047 +
44048 + /* At this stage new name was introduced for
44049 + @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
44050 + counters were updated.
44051 +
44052 + We want to remove @old_name now. If @old_inode wasn't
44053 + directory this is simple.
44054 + */
44055 + result = dplug->rem_entry(old_dir, old_name, &old_entry);
44056 + /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
44057 + if (result != 0 && result != -ENOMEM) {
44058 + warning("nikita-2335",
44059 + "Cannot remove old name: %i", result);
44060 + } else {
44061 + result = reiser4_del_nlink(old_inode, old_dir, 0);
44062 + if (result != 0 && result != -ENOMEM) {
44063 + warning("nikita-2337",
44064 + "Cannot drop link on old: %i", result);
44065 + }
44066 + }
44067 +
44068 + if (result == 0 && is_dir) {
44069 + /* @old_inode is directory. We also have to update
44070 + dotdot entry. */
44071 + coord_t *dotdot_coord;
44072 + lock_handle dotdot_lh;
44073 + struct dentry dotdot_name;
44074 + reiser4_dir_entry_desc dotdot_entry;
44075 + reiser4_dentry_fsdata dataonstack;
44076 + reiser4_dentry_fsdata *fsdata;
44077 +
44078 + memset(&dataonstack, 0, sizeof dataonstack);
44079 + memset(&dotdot_entry, 0, sizeof dotdot_entry);
44080 + dotdot_entry.obj = old_dir;
44081 + memset(&dotdot_name, 0, sizeof dotdot_name);
44082 + dotdot_name.d_name.name = "..";
44083 + dotdot_name.d_name.len = 2;
44084 + /*
44085 + * allocate ->d_fsdata on the stack to avoid using
44086 + * reiser4_get_dentry_fsdata(). Locking is not needed,
44087 + * because dentry is private to the current thread.
44088 + */
44089 + dotdot_name.d_fsdata = &dataonstack;
44090 + init_lh(&dotdot_lh);
44091 +
44092 + fsdata = &dataonstack;
44093 + dotdot_coord = &fsdata->dec.entry_coord;
44094 + coord_clear_iplug(dotdot_coord);
44095 +
44096 + result = reiser4_find_entry(old_inode,
44097 + &dotdot_name,
44098 + &dotdot_lh,
44099 + ZNODE_WRITE_LOCK,
44100 + &dotdot_entry);
44101 + if (result == 0) {
44102 + /* replace_name() decreases i_nlink on
44103 + * @old_dir */
44104 + result = replace_name(new_dir,
44105 + old_inode,
44106 + old_dir,
44107 + dotdot_coord, &dotdot_lh);
44108 + } else
44109 + result = RETERR(-EIO);
44110 + done_lh(&dotdot_lh);
44111 + }
44112 + }
44113 + reiser4_update_dir(new_dir);
44114 + reiser4_update_dir(old_dir);
44115 + reiser4_update_sd(old_inode);
44116 + if (result == 0) {
44117 + file_plugin *fplug;
44118 +
44119 + if (new_inode != NULL) {
44120 + /* add safe-link for target file (in case we removed
44121 + * last reference to the poor fellow */
44122 + fplug = inode_file_plugin(new_inode);
44123 + if (new_inode->i_nlink == 0)
44124 + result = safe_link_add(new_inode, SAFE_UNLINK);
44125 + }
44126 + }
44127 + exit:
44128 + context_set_commit_async(ctx);
44129 + reiser4_exit_context(ctx);
44130 + return result;
44131 +}
44132 +#endif
44133 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/acl.h linux-2.6.20/fs/reiser4/plugin/item/acl.h
44134 --- linux-2.6.20.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 03:00:00.000000000 +0300
44135 +++ linux-2.6.20/fs/reiser4/plugin/item/acl.h 2007-05-06 14:50:43.799006970 +0400
44136 @@ -0,0 +1,66 @@
44137 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44138 +
44139 +/* Directory entry. */
44140 +
44141 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
44142 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
44143 +
44144 +#include "../../forward.h"
44145 +#include "../../dformat.h"
44146 +#include "../../kassign.h"
44147 +#include "../../key.h"
44148 +
44149 +#include <linux/fs.h>
44150 +#include <linux/dcache.h> /* for struct dentry */
44151 +
44152 +typedef struct directory_entry_format {
44153 + /* key of object stat-data. It's not necessary to store whole
44154 + key here, because it's always key of stat-data, so minor
44155 + packing locality and offset can be omitted here. But this
44156 + relies on particular key allocation scheme for stat-data, so,
44157 + for extensibility sake, whole key can be stored here.
44158 +
44159 + We store key as array of bytes, because we don't want 8-byte
44160 + alignment of dir entries.
44161 + */
44162 + obj_key_id id;
44163 + /* file name. Null terminated string. */
44164 + d8 name[0];
44165 +} directory_entry_format;
44166 +
44167 +void print_de(const char *prefix, coord_t * coord);
44168 +int extract_key_de(const coord_t * coord, reiser4_key * key);
44169 +int update_key_de(const coord_t * coord, const reiser4_key * key,
44170 + lock_handle * lh);
44171 +char *extract_name_de(const coord_t * coord, char *buf);
44172 +unsigned extract_file_type_de(const coord_t * coord);
44173 +int add_entry_de(struct inode *dir, coord_t * coord,
44174 + lock_handle * lh, const struct dentry *name,
44175 + reiser4_dir_entry_desc * entry);
44176 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
44177 + lock_handle * lh, reiser4_dir_entry_desc * entry);
44178 +int max_name_len_de(const struct inode *dir);
44179 +
44180 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
44181 +
44182 +char *extract_dent_name(const coord_t * coord,
44183 + directory_entry_format * dent, char *buf);
44184 +
44185 +#if REISER4_LARGE_KEY
44186 +#define DE_NAME_BUF_LEN (24)
44187 +#else
44188 +#define DE_NAME_BUF_LEN (16)
44189 +#endif
44190 +
44191 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
44192 +#endif
44193 +
44194 +/* Make Linus happy.
44195 + Local variables:
44196 + c-indentation-style: "K&R"
44197 + mode-name: "LC"
44198 + c-basic-offset: 8
44199 + tab-width: 8
44200 + fill-column: 120
44201 + End:
44202 +*/
44203 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.20/fs/reiser4/plugin/item/blackbox.c
44204 --- linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 03:00:00.000000000 +0300
44205 +++ linux-2.6.20/fs/reiser4/plugin/item/blackbox.c 2007-05-06 14:50:43.799006970 +0400
44206 @@ -0,0 +1,142 @@
44207 +/* Copyright 2003 by Hans Reiser, licensing governed by
44208 + * reiser4/README */
44209 +
44210 +/* Black box item implementation */
44211 +
44212 +#include "../../forward.h"
44213 +#include "../../debug.h"
44214 +#include "../../dformat.h"
44215 +#include "../../kassign.h"
44216 +#include "../../coord.h"
44217 +#include "../../tree.h"
44218 +#include "../../lock.h"
44219 +
44220 +#include "blackbox.h"
44221 +#include "item.h"
44222 +#include "../plugin.h"
44223 +
44224 +int
44225 +store_black_box(reiser4_tree * tree,
44226 + const reiser4_key * key, void *data, int length)
44227 +{
44228 + int result;
44229 + reiser4_item_data idata;
44230 + coord_t coord;
44231 + lock_handle lh;
44232 +
44233 + memset(&idata, 0, sizeof idata);
44234 +
44235 + idata.data = data;
44236 + idata.user = 0;
44237 + idata.length = length;
44238 + idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
44239 +
44240 + init_lh(&lh);
44241 + result = insert_by_key(tree, key,
44242 + &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
44243 +
44244 + assert("nikita-3413",
44245 + ergo(result == 0,
44246 + WITH_COORD(&coord,
44247 + item_length_by_coord(&coord) == length)));
44248 +
44249 + done_lh(&lh);
44250 + return result;
44251 +}
44252 +
44253 +int
44254 +load_black_box(reiser4_tree * tree,
44255 + reiser4_key * key, void *data, int length, int exact)
44256 +{
44257 + int result;
44258 + coord_t coord;
44259 + lock_handle lh;
44260 +
44261 + init_lh(&lh);
44262 + result = coord_by_key(tree, key,
44263 + &coord, &lh, ZNODE_READ_LOCK,
44264 + exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
44265 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44266 +
44267 + if (result == 0) {
44268 + int ilen;
44269 +
44270 + result = zload(coord.node);
44271 + if (result == 0) {
44272 + ilen = item_length_by_coord(&coord);
44273 + if (ilen <= length) {
44274 + memcpy(data, item_body_by_coord(&coord), ilen);
44275 + unit_key_by_coord(&coord, key);
44276 + } else if (exact) {
44277 + /*
44278 + * item is larger than buffer provided by the
44279 + * user. Only issue a warning if @exact is
44280 + * set. If @exact is false, we are iterating
44281 + * over all safe-links and here we are reaching
44282 + * the end of the iteration.
44283 + */
44284 + warning("nikita-3415",
44285 + "Wrong black box length: %i > %i",
44286 + ilen, length);
44287 + result = RETERR(-EIO);
44288 + }
44289 + zrelse(coord.node);
44290 + }
44291 + }
44292 +
44293 + done_lh(&lh);
44294 + return result;
44295 +
44296 +}
44297 +
44298 +int
44299 +update_black_box(reiser4_tree * tree,
44300 + const reiser4_key * key, void *data, int length)
44301 +{
44302 + int result;
44303 + coord_t coord;
44304 + lock_handle lh;
44305 +
44306 + init_lh(&lh);
44307 + result = coord_by_key(tree, key,
44308 + &coord, &lh, ZNODE_READ_LOCK,
44309 + FIND_EXACT,
44310 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44311 + if (result == 0) {
44312 + int ilen;
44313 +
44314 + result = zload(coord.node);
44315 + if (result == 0) {
44316 + ilen = item_length_by_coord(&coord);
44317 + if (length <= ilen) {
44318 + memcpy(item_body_by_coord(&coord), data,
44319 + length);
44320 + } else {
44321 + warning("nikita-3437",
44322 + "Wrong black box length: %i < %i",
44323 + ilen, length);
44324 + result = RETERR(-EIO);
44325 + }
44326 + zrelse(coord.node);
44327 + }
44328 + }
44329 +
44330 + done_lh(&lh);
44331 + return result;
44332 +
44333 +}
44334 +
44335 +int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
44336 +{
44337 + return reiser4_cut_tree(tree, key, key, NULL, 1);
44338 +}
44339 +
44340 +/* Make Linus happy.
44341 + Local variables:
44342 + c-indentation-style: "K&R"
44343 + mode-name: "LC"
44344 + c-basic-offset: 8
44345 + tab-width: 8
44346 + fill-column: 120
44347 + End:
44348 +*/
44349 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.20/fs/reiser4/plugin/item/blackbox.h
44350 --- linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 03:00:00.000000000 +0300
44351 +++ linux-2.6.20/fs/reiser4/plugin/item/blackbox.h 2007-05-06 14:50:43.799006970 +0400
44352 @@ -0,0 +1,33 @@
44353 +/* Copyright 2003 by Hans Reiser, licensing governed by
44354 + * reiser4/README */
44355 +
44356 +/* "Black box" entry to fixed-width contain user supplied data */
44357 +
44358 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
44359 +#define __FS_REISER4_BLACK_BOX_H__
44360 +
44361 +#include "../../forward.h"
44362 +#include "../../dformat.h"
44363 +#include "../../kassign.h"
44364 +#include "../../key.h"
44365 +
44366 +extern int store_black_box(reiser4_tree * tree,
44367 + const reiser4_key * key, void *data, int length);
44368 +extern int load_black_box(reiser4_tree * tree,
44369 + reiser4_key * key, void *data, int length, int exact);
44370 +extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
44371 +extern int update_black_box(reiser4_tree * tree,
44372 + const reiser4_key * key, void *data, int length);
44373 +
44374 +/* __FS_REISER4_BLACK_BOX_H__ */
44375 +#endif
44376 +
44377 +/* Make Linus happy.
44378 + Local variables:
44379 + c-indentation-style: "K&R"
44380 + mode-name: "LC"
44381 + c-basic-offset: 8
44382 + tab-width: 8
44383 + fill-column: 120
44384 + End:
44385 +*/
44386 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/cde.c linux-2.6.20/fs/reiser4/plugin/item/cde.c
44387 --- linux-2.6.20.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 03:00:00.000000000 +0300
44388 +++ linux-2.6.20/fs/reiser4/plugin/item/cde.c 2007-05-06 14:50:43.799006970 +0400
44389 @@ -0,0 +1,1008 @@
44390 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44391 +
44392 +/* Directory entry implementation */
44393 +
44394 +/* DESCRIPTION:
44395 +
44396 + This is "compound" directory item plugin implementation. This directory
44397 + item type is compound (as opposed to the "simple directory item" in
44398 + fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
44399 + entries.
44400 +
44401 + The reason behind this decision is disk space efficiency: all directory
44402 + entries inside the same directory have identical fragment in their
44403 + keys. This, of course, depends on key assignment policy. In our default key
44404 + assignment policy, all directory entries have the same locality which is
44405 + equal to the object id of their directory.
44406 +
44407 + Composing directory item out of several directory entries for the same
44408 + directory allows us to store said key fragment only once. That is, this is
44409 + some ad hoc form of key compression (stem compression) that is implemented
44410 + here, because general key compression is not supposed to be implemented in
44411 + v4.0.
44412 +
44413 + Another decision that was made regarding all directory item plugins, is
44414 + that they will store entry keys unaligned. This is for that sake of disk
44415 + space efficiency again.
44416 +
44417 + In should be noted, that storing keys unaligned increases CPU consumption,
44418 + at least on some architectures.
44419 +
44420 + Internal on-disk structure of the compound directory item is the following:
44421 +
44422 + HEADER cde_item_format. Here number of entries is stored.
44423 + ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
44424 + ENTRY_HEADER_1 offset of entry body are stored.
44425 + ENTRY_HEADER_2 (basically two last parts of key)
44426 + ...
44427 + ENTRY_HEADER_N
44428 + ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
44429 + ENTRY_BODY_1 NUL-terminated name are stored.
44430 + ENTRY_BODY_2 (part of statadta key in the
44431 + sence that since all SDs have
44432 + zero offset, this offset is not
44433 + stored on disk).
44434 + ...
44435 + ENTRY_BODY_N
44436 +
44437 + When it comes to the balancing, each directory entry in compound directory
44438 + item is unit, that is, something that can be cut from one item and pasted
44439 + into another item of the same type. Handling of unit cut and paste is major
44440 + reason for the complexity of code below.
44441 +
44442 +*/
44443 +
44444 +#include "../../forward.h"
44445 +#include "../../debug.h"
44446 +#include "../../dformat.h"
44447 +#include "../../kassign.h"
44448 +#include "../../key.h"
44449 +#include "../../coord.h"
44450 +#include "sde.h"
44451 +#include "cde.h"
44452 +#include "item.h"
44453 +#include "../node/node.h"
44454 +#include "../plugin.h"
44455 +#include "../../znode.h"
44456 +#include "../../carry.h"
44457 +#include "../../tree.h"
44458 +#include "../../inode.h"
44459 +
44460 +#include <linux/fs.h> /* for struct inode */
44461 +#include <linux/dcache.h> /* for struct dentry */
44462 +#include <linux/quotaops.h>
44463 +
44464 +#if 0
44465 +#define CHECKME(coord) \
44466 +({ \
44467 + const char *message; \
44468 + coord_t dup; \
44469 + \
44470 + coord_dup_nocheck(&dup, (coord)); \
44471 + dup.unit_pos = 0; \
44472 + assert("nikita-2871", cde_check(&dup, &message) == 0); \
44473 +})
44474 +#else
44475 +#define CHECKME(coord) noop
44476 +#endif
44477 +
44478 +/* return body of compound directory item at @coord */
44479 +static inline cde_item_format *formatted_at(const coord_t * coord)
44480 +{
44481 + assert("nikita-1282", coord != NULL);
44482 + return item_body_by_coord(coord);
44483 +}
44484 +
44485 +/* return entry header at @coord */
44486 +static inline cde_unit_header *header_at(const coord_t *
44487 + coord /* coord of item */ ,
44488 + int idx /* index of unit */ )
44489 +{
44490 + assert("nikita-1283", coord != NULL);
44491 + return &formatted_at(coord)->entry[idx];
44492 +}
44493 +
44494 +/* return number of units in compound directory item at @coord */
44495 +static int units(const coord_t * coord /* coord of item */ )
44496 +{
44497 + return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
44498 +}
44499 +
44500 +/* return offset of the body of @idx-th entry in @coord */
44501 +static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
44502 + int idx /* index of unit */ )
44503 +{
44504 + if (idx < units(coord))
44505 + return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
44506 + else if (idx == units(coord))
44507 + return item_length_by_coord(coord);
44508 + else
44509 + impossible("nikita-1308", "Wrong idx");
44510 + return 0;
44511 +}
44512 +
44513 +/* set offset of the body of @idx-th entry in @coord */
44514 +static void set_offset(const coord_t * coord /* coord of item */ ,
44515 + int idx /* index of unit */ ,
44516 + unsigned int offset /* new offset */ )
44517 +{
44518 + put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
44519 +}
44520 +
44521 +static void adj_offset(const coord_t * coord /* coord of item */ ,
44522 + int idx /* index of unit */ ,
44523 + int delta /* offset change */ )
44524 +{
44525 + d16 *doffset;
44526 + __u16 offset;
44527 +
44528 + doffset = &header_at(coord, idx)->offset;
44529 + offset = le16_to_cpu(get_unaligned(doffset));
44530 + offset += delta;
44531 + put_unaligned(cpu_to_le16((__u16) offset), doffset);
44532 +}
44533 +
44534 +/* return pointer to @offset-th byte from the beginning of @coord */
44535 +static char *address(const coord_t * coord /* coord of item */ ,
44536 + int offset)
44537 +{
44538 + return ((char *)item_body_by_coord(coord)) + offset;
44539 +}
44540 +
44541 +/* return pointer to the body of @idx-th entry in @coord */
44542 +static directory_entry_format *entry_at(const coord_t * coord /* coord of
44543 + * item */ ,
44544 + int idx /* index of unit */ )
44545 +{
44546 + return (directory_entry_format *) address(coord,
44547 + (int)offset_of(coord, idx));
44548 +}
44549 +
44550 +/* return number of unit referenced by @coord */
44551 +static int idx_of(const coord_t * coord /* coord of item */ )
44552 +{
44553 + assert("nikita-1285", coord != NULL);
44554 + return coord->unit_pos;
44555 +}
44556 +
44557 +/* find position where entry with @entry_key would be inserted into @coord */
44558 +static int find(const coord_t * coord /* coord of item */ ,
44559 + const reiser4_key * entry_key /* key to look for */ ,
44560 + cmp_t * last /* result of last comparison */ )
44561 +{
44562 + int entries;
44563 +
44564 + int left;
44565 + int right;
44566 +
44567 + cde_unit_header *header;
44568 +
44569 + assert("nikita-1295", coord != NULL);
44570 + assert("nikita-1296", entry_key != NULL);
44571 + assert("nikita-1297", last != NULL);
44572 +
44573 + entries = units(coord);
44574 + left = 0;
44575 + right = entries - 1;
44576 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
44577 + int median;
44578 +
44579 + median = (left + right) >> 1;
44580 +
44581 + header = header_at(coord, median);
44582 + *last = de_id_key_cmp(&header->hash, entry_key);
44583 + switch (*last) {
44584 + case LESS_THAN:
44585 + left = median;
44586 + break;
44587 + case GREATER_THAN:
44588 + right = median;
44589 + break;
44590 + case EQUAL_TO:{
44591 + do {
44592 + median--;
44593 + header--;
44594 + } while (median >= 0 &&
44595 + de_id_key_cmp(&header->hash,
44596 + entry_key) == EQUAL_TO);
44597 + return median + 1;
44598 + }
44599 + }
44600 + }
44601 + header = header_at(coord, left);
44602 + for (; left < entries; ++left, ++header) {
44603 + prefetch(header + 1);
44604 + *last = de_id_key_cmp(&header->hash, entry_key);
44605 + if (*last != LESS_THAN)
44606 + break;
44607 + }
44608 + if (left < entries)
44609 + return left;
44610 + else
44611 + return RETERR(-ENOENT);
44612 +
44613 +}
44614 +
44615 +/* expand @coord as to accommodate for insertion of @no new entries starting
44616 + from @pos, with total bodies size @size. */
44617 +static int expand_item(const coord_t * coord /* coord of item */ ,
44618 + int pos /* unit position */ , int no /* number of new
44619 + * units*/ ,
44620 + int size /* total size of new units' data */ ,
44621 + unsigned int data_size /* free space already reserved
44622 + * in the item for insertion */ )
44623 +{
44624 + int entries;
44625 + cde_unit_header *header;
44626 + char *dent;
44627 + int i;
44628 +
44629 + assert("nikita-1310", coord != NULL);
44630 + assert("nikita-1311", pos >= 0);
44631 + assert("nikita-1312", no > 0);
44632 + assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
44633 + assert("nikita-1343",
44634 + item_length_by_coord(coord) >=
44635 + (int)(size + data_size + no * sizeof *header));
44636 +
44637 + entries = units(coord);
44638 +
44639 + if (pos == entries)
44640 + dent = address(coord, size);
44641 + else
44642 + dent = (char *)entry_at(coord, pos);
44643 + /* place where new header will be in */
44644 + header = header_at(coord, pos);
44645 + /* free space for new entry headers */
44646 + memmove(header + no, header,
44647 + (unsigned)(address(coord, size) - (char *)header));
44648 + /* if adding to the end initialise first new header */
44649 + if (pos == entries) {
44650 + set_offset(coord, pos, (unsigned)size);
44651 + }
44652 +
44653 + /* adjust entry pointer and size */
44654 + dent = dent + no * sizeof *header;
44655 + size += no * sizeof *header;
44656 + /* free space for new entries */
44657 + memmove(dent + data_size, dent,
44658 + (unsigned)(address(coord, size) - dent));
44659 +
44660 + /* increase counter */
44661 + entries += no;
44662 + put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
44663 +
44664 + /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
44665 + bytes. */
44666 + for (i = 0; i <= pos; ++i)
44667 + adj_offset(coord, i, no * sizeof *header);
44668 + /* [ pos + no ... +\infty ) entries were shifted by ( no *
44669 + sizeof *header + data_size ) bytes */
44670 + for (i = pos + no; i < entries; ++i)
44671 + adj_offset(coord, i, no * sizeof *header + data_size);
44672 + return 0;
44673 +}
44674 +
44675 +/* insert new @entry into item */
44676 +static int expand(const coord_t * coord /* coord of item */ ,
44677 + cde_entry * entry /* entry to insert */ ,
44678 + int len /* length of @entry data */ ,
44679 + int *pos /* position to insert */ ,
44680 + reiser4_dir_entry_desc * dir_entry /* parameters for new
44681 + * entry */ )
44682 +{
44683 + cmp_t cmp_res;
44684 + int datasize;
44685 +
44686 + *pos = find(coord, &dir_entry->key, &cmp_res);
44687 + if (*pos < 0)
44688 + *pos = units(coord);
44689 +
44690 + datasize = sizeof(directory_entry_format);
44691 + if (is_longname(entry->name->name, entry->name->len))
44692 + datasize += entry->name->len + 1;
44693 +
44694 + expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
44695 + datasize);
44696 + return 0;
44697 +}
44698 +
44699 +/* paste body of @entry into item */
44700 +static int paste_entry(const coord_t * coord /* coord of item */ ,
44701 + cde_entry * entry /* new entry */ ,
44702 + int pos /* position to insert */ ,
44703 + reiser4_dir_entry_desc * dir_entry /* parameters for
44704 + * new entry */ )
44705 +{
44706 + cde_unit_header *header;
44707 + directory_entry_format *dent;
44708 + const char *name;
44709 + int len;
44710 +
44711 + header = header_at(coord, pos);
44712 + dent = entry_at(coord, pos);
44713 +
44714 + build_de_id_by_key(&dir_entry->key, &header->hash);
44715 + build_inode_key_id(entry->obj, &dent->id);
44716 + /* AUDIT unsafe strcpy() operation! It should be replaced with
44717 + much less CPU hungry
44718 + memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
44719 +
44720 + Also a more major thing is that there should be a way to figure out
44721 + amount of space in dent -> name and be able to check that we are
44722 + not going to overwrite more than we supposed to */
44723 + name = entry->name->name;
44724 + len = entry->name->len;
44725 + if (is_longname(name, len)) {
44726 + strcpy((unsigned char *)dent->name, name);
44727 + put_unaligned(0, &dent->name[len]);
44728 + }
44729 + return 0;
44730 +}
44731 +
44732 +/* estimate how much space is necessary in item to insert/paste set of entries
44733 + described in @data. */
44734 +int estimate_cde(const coord_t * coord /* coord of item */ ,
44735 + const reiser4_item_data * data /* parameters for new item */ )
44736 +{
44737 + cde_entry_data *e;
44738 + int result;
44739 + int i;
44740 +
44741 + e = (cde_entry_data *) data->data;
44742 +
44743 + assert("nikita-1288", e != NULL);
44744 + assert("nikita-1289", e->num_of_entries >= 0);
44745 +
44746 + if (coord == NULL)
44747 + /* insert */
44748 + result = sizeof(cde_item_format);
44749 + else
44750 + /* paste */
44751 + result = 0;
44752 +
44753 + result += e->num_of_entries *
44754 + (sizeof(cde_unit_header) + sizeof(directory_entry_format));
44755 + for (i = 0; i < e->num_of_entries; ++i) {
44756 + const char *name;
44757 + int len;
44758 +
44759 + name = e->entry[i].name->name;
44760 + len = e->entry[i].name->len;
44761 + assert("nikita-2054", strlen(name) == len);
44762 + if (is_longname(name, len))
44763 + result += len + 1;
44764 + }
44765 + ((reiser4_item_data *) data)->length = result;
44766 + return result;
44767 +}
44768 +
44769 +/* ->nr_units() method for this item plugin. */
44770 +pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
44771 +{
44772 + return units(coord);
44773 +}
44774 +
44775 +/* ->unit_key() method for this item plugin. */
44776 +reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
44777 + reiser4_key * key /* resulting key */ )
44778 +{
44779 + assert("nikita-1452", coord != NULL);
44780 + assert("nikita-1345", idx_of(coord) < units(coord));
44781 + assert("nikita-1346", key != NULL);
44782 +
44783 + item_key_by_coord(coord, key);
44784 + extract_key_from_de_id(extract_dir_id_from_key(key),
44785 + &header_at(coord, idx_of(coord))->hash, key);
44786 + return key;
44787 +}
44788 +
44789 +/* mergeable_cde(): implementation of ->mergeable() item method.
44790 +
44791 + Two directory items are mergeable iff they are from the same
44792 + directory. That simple.
44793 +
44794 +*/
44795 +int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
44796 + const coord_t * p2 /* coord of second item */ )
44797 +{
44798 + reiser4_key k1;
44799 + reiser4_key k2;
44800 +
44801 + assert("nikita-1339", p1 != NULL);
44802 + assert("nikita-1340", p2 != NULL);
44803 +
44804 + return
44805 + (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
44806 + (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
44807 + extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
44808 +
44809 +}
44810 +
44811 +/* ->max_key_inside() method for this item plugin. */
44812 +reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
44813 + reiser4_key * result /* resulting key */ )
44814 +{
44815 + assert("nikita-1342", coord != NULL);
44816 +
44817 + item_key_by_coord(coord, result);
44818 + set_key_ordering(result, get_key_ordering(reiser4_max_key()));
44819 + set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
44820 + set_key_offset(result, get_key_offset(reiser4_max_key()));
44821 + return result;
44822 +}
44823 +
44824 +/* @data contains data which are to be put into tree */
44825 +int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
44826 + const reiser4_key * key /* key to check */ ,
44827 + const reiser4_item_data * data /* parameters of new
44828 + * item/unit being
44829 + * created */ )
44830 +{
44831 + reiser4_key item_key;
44832 +
44833 + /* FIXME-VS: do not rely on anything but iplug field of @data. Only
44834 + data->iplug is initialized */
44835 + assert("vs-457", data && data->iplug);
44836 +/* assert( "vs-553", data -> user == 0 );*/
44837 + item_key_by_coord(coord, &item_key);
44838 +
44839 + return (item_plugin_by_coord(coord) == data->iplug) &&
44840 + (extract_dir_id_from_key(&item_key) ==
44841 + extract_dir_id_from_key(key));
44842 +}
44843 +
44844 +#if REISER4_DEBUG
44845 +/* cde_check ->check() method for compressed directory items
44846 +
44847 + used for debugging, every item should have here the most complete
44848 + possible check of the consistency of the item that the inventor can
44849 + construct
44850 +*/
44851 +int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
44852 + const char **error /* where to store error message */)
44853 +{
44854 + int i;
44855 + int result;
44856 + char *item_start;
44857 + char *item_end;
44858 + reiser4_key key;
44859 +
44860 + coord_t c;
44861 +
44862 + assert("nikita-1357", coord != NULL);
44863 + assert("nikita-1358", error != NULL);
44864 +
44865 + if (!ergo(coord->item_pos != 0,
44866 + is_dot_key(item_key_by_coord(coord, &key)))) {
44867 + *error = "CDE doesn't start with dot";
44868 + return -1;
44869 + }
44870 + item_start = item_body_by_coord(coord);
44871 + item_end = item_start + item_length_by_coord(coord);
44872 +
44873 + coord_dup(&c, coord);
44874 + result = 0;
44875 + for (i = 0; i < units(coord); ++i) {
44876 + directory_entry_format *entry;
44877 +
44878 + if ((char *)(header_at(coord, i) + 1) >
44879 + item_end - units(coord) * sizeof *entry) {
44880 + *error = "CDE header is out of bounds";
44881 + result = -1;
44882 + break;
44883 + }
44884 + entry = entry_at(coord, i);
44885 + if ((char *)entry < item_start + sizeof(cde_item_format)) {
44886 + *error = "CDE header is too low";
44887 + result = -1;
44888 + break;
44889 + }
44890 + if ((char *)(entry + 1) > item_end) {
44891 + *error = "CDE header is too high";
44892 + result = -1;
44893 + break;
44894 + }
44895 + }
44896 +
44897 + return result;
44898 +}
44899 +#endif
44900 +
44901 +/* ->init() method for this item plugin. */
44902 +int init_cde(coord_t * coord /* coord of item */ ,
44903 + coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
44904 + UNUSED_ARG)
44905 +{
44906 + put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
44907 + return 0;
44908 +}
44909 +
44910 +/* ->lookup() method for this item plugin. */
44911 +lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
44912 + lookup_bias bias /* search bias */ ,
44913 + coord_t * coord /* coord of item to lookup in */ )
44914 +{
44915 + cmp_t last_comp;
44916 + int pos;
44917 +
44918 + reiser4_key utmost_key;
44919 +
44920 + assert("nikita-1293", coord != NULL);
44921 + assert("nikita-1294", key != NULL);
44922 +
44923 + CHECKME(coord);
44924 +
44925 + if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
44926 + coord->unit_pos = 0;
44927 + coord->between = BEFORE_UNIT;
44928 + return CBK_COORD_NOTFOUND;
44929 + }
44930 + pos = find(coord, key, &last_comp);
44931 + if (pos >= 0) {
44932 + coord->unit_pos = (int)pos;
44933 + switch (last_comp) {
44934 + case EQUAL_TO:
44935 + coord->between = AT_UNIT;
44936 + return CBK_COORD_FOUND;
44937 + case GREATER_THAN:
44938 + coord->between = BEFORE_UNIT;
44939 + return RETERR(-ENOENT);
44940 + case LESS_THAN:
44941 + default:
44942 + impossible("nikita-1298", "Broken find");
44943 + return RETERR(-EIO);
44944 + }
44945 + } else {
44946 + coord->unit_pos = units(coord) - 1;
44947 + coord->between = AFTER_UNIT;
44948 + return (bias ==
44949 + FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
44950 + CBK_COORD_NOTFOUND;
44951 + }
44952 +}
44953 +
44954 +/* ->paste() method for this item plugin. */
44955 +int paste_cde(coord_t * coord /* coord of item */ ,
44956 + reiser4_item_data * data /* parameters of new unit being
44957 + * inserted */ ,
44958 + carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
44959 +{
44960 + cde_entry_data *e;
44961 + int result;
44962 + int i;
44963 +
44964 + CHECKME(coord);
44965 + e = (cde_entry_data *) data->data;
44966 +
44967 + result = 0;
44968 + for (i = 0; i < e->num_of_entries; ++i) {
44969 + int pos;
44970 + int phantom_size;
44971 +
44972 + phantom_size = data->length;
44973 + if (units(coord) == 0)
44974 + phantom_size -= sizeof(cde_item_format);
44975 +
44976 + result =
44977 + expand(coord, e->entry + i, phantom_size, &pos, data->arg);
44978 + if (result != 0)
44979 + break;
44980 + result = paste_entry(coord, e->entry + i, pos, data->arg);
44981 + if (result != 0)
44982 + break;
44983 + }
44984 + CHECKME(coord);
44985 + return result;
44986 +}
44987 +
44988 +/* amount of space occupied by all entries starting from @idx both headers and
44989 + bodies. */
44990 +static unsigned int part_size(const coord_t * coord /* coord of item */ ,
44991 + int idx /* index of unit */ )
44992 +{
44993 + assert("nikita-1299", coord != NULL);
44994 + assert("nikita-1300", idx < (int)units(coord));
44995 +
44996 + return sizeof(cde_item_format) +
44997 + (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
44998 + idx + 1) -
44999 + offset_of(coord, 0);
45000 +}
45001 +
45002 +/* how many but not more than @want units of @source can be merged with
45003 + item in @target node. If pend == append - we try to append last item
45004 + of @target by first units of @source. If pend == prepend - we try to
45005 + "prepend" first item in @target by last units of @source. @target
45006 + node has @free_space bytes of free space. Total size of those units
45007 + are returned via @size */
45008 +int can_shift_cde(unsigned free_space /* free space in item */ ,
45009 + coord_t * coord /* coord of source item */ ,
45010 + znode * target /* target node */ ,
45011 + shift_direction pend /* shift direction */ ,
45012 + unsigned *size /* resulting number of shifted bytes */ ,
45013 + unsigned want /* maximal number of bytes to shift */ )
45014 +{
45015 + int shift;
45016 +
45017 + CHECKME(coord);
45018 + if (want == 0) {
45019 + *size = 0;
45020 + return 0;
45021 + }
45022 +
45023 + /* pend == SHIFT_LEFT <==> shifting to the left */
45024 + if (pend == SHIFT_LEFT) {
45025 + for (shift = min((int)want - 1, units(coord)); shift >= 0;
45026 + --shift) {
45027 + *size = part_size(coord, shift);
45028 + if (target != NULL)
45029 + *size -= sizeof(cde_item_format);
45030 + if (*size <= free_space)
45031 + break;
45032 + }
45033 + shift = shift + 1;
45034 + } else {
45035 + int total_size;
45036 +
45037 + assert("nikita-1301", pend == SHIFT_RIGHT);
45038 +
45039 + total_size = item_length_by_coord(coord);
45040 + for (shift = units(coord) - want - 1; shift < units(coord) - 1;
45041 + ++shift) {
45042 + *size = total_size - part_size(coord, shift);
45043 + if (target == NULL)
45044 + *size += sizeof(cde_item_format);
45045 + if (*size <= free_space)
45046 + break;
45047 + }
45048 + shift = units(coord) - shift - 1;
45049 + }
45050 + if (shift == 0)
45051 + *size = 0;
45052 + CHECKME(coord);
45053 + return shift;
45054 +}
45055 +
45056 +/* ->copy_units() method for this item plugin. */
45057 +void copy_units_cde(coord_t * target /* coord of target item */ ,
45058 + coord_t * source /* coord of source item */ ,
45059 + unsigned from /* starting unit */ ,
45060 + unsigned count /* how many units to copy */ ,
45061 + shift_direction where_is_free_space /* shift direction */ ,
45062 + unsigned free_space /* free space in item */ )
45063 +{
45064 + char *header_from;
45065 + char *header_to;
45066 +
45067 + char *entry_from;
45068 + char *entry_to;
45069 +
45070 + int pos_in_target;
45071 + int data_size;
45072 + int data_delta;
45073 + int i;
45074 +
45075 + assert("nikita-1303", target != NULL);
45076 + assert("nikita-1304", source != NULL);
45077 + assert("nikita-1305", (int)from < units(source));
45078 + assert("nikita-1307", (int)(from + count) <= units(source));
45079 +
45080 + if (where_is_free_space == SHIFT_LEFT) {
45081 + assert("nikita-1453", from == 0);
45082 + pos_in_target = units(target);
45083 + } else {
45084 + assert("nikita-1309", (int)(from + count) == units(source));
45085 + pos_in_target = 0;
45086 + memmove(item_body_by_coord(target),
45087 + (char *)item_body_by_coord(target) + free_space,
45088 + item_length_by_coord(target) - free_space);
45089 + }
45090 +
45091 + CHECKME(target);
45092 + CHECKME(source);
45093 +
45094 + /* expand @target */
45095 + data_size =
45096 + offset_of(source, (int)(from + count)) - offset_of(source,
45097 + (int)from);
45098 +
45099 + if (units(target) == 0)
45100 + free_space -= sizeof(cde_item_format);
45101 +
45102 + expand_item(target, pos_in_target, (int)count,
45103 + (int)(item_length_by_coord(target) - free_space),
45104 + (unsigned)data_size);
45105 +
45106 + /* copy first @count units of @source into @target */
45107 + data_delta =
45108 + offset_of(target, pos_in_target) - offset_of(source, (int)from);
45109 +
45110 + /* copy entries */
45111 + entry_from = (char *)entry_at(source, (int)from);
45112 + entry_to = (char *)entry_at(source, (int)(from + count));
45113 + memmove(entry_at(target, pos_in_target), entry_from,
45114 + (unsigned)(entry_to - entry_from));
45115 +
45116 + /* copy headers */
45117 + header_from = (char *)header_at(source, (int)from);
45118 + header_to = (char *)header_at(source, (int)(from + count));
45119 + memmove(header_at(target, pos_in_target), header_from,
45120 + (unsigned)(header_to - header_from));
45121 +
45122 + /* update offsets */
45123 + for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
45124 + adj_offset(target, i, data_delta);
45125 + CHECKME(target);
45126 + CHECKME(source);
45127 +}
45128 +
45129 +/* ->cut_units() method for this item plugin. */
45130 +int cut_units_cde(coord_t * coord /* coord of item */ ,
45131 + pos_in_node_t from /* start unit pos */ ,
45132 + pos_in_node_t to /* stop unit pos */ ,
45133 + struct carry_cut_data *cdata UNUSED_ARG,
45134 + reiser4_key * smallest_removed, reiser4_key * new_first)
45135 +{
45136 + char *header_from;
45137 + char *header_to;
45138 +
45139 + char *entry_from;
45140 + char *entry_to;
45141 +
45142 + int size;
45143 + int entry_delta;
45144 + int header_delta;
45145 + int i;
45146 +
45147 + unsigned count;
45148 +
45149 + CHECKME(coord);
45150 +
45151 + count = to - from + 1;
45152 +
45153 + assert("nikita-1454", coord != NULL);
45154 + assert("nikita-1455", (int)(from + count) <= units(coord));
45155 +
45156 + if (smallest_removed)
45157 + unit_key_by_coord(coord, smallest_removed);
45158 +
45159 + if (new_first) {
45160 + coord_t next;
45161 +
45162 + /* not everything is cut from item head */
45163 + assert("vs-1527", from == 0);
45164 + assert("vs-1528", to < units(coord) - 1);
45165 +
45166 + coord_dup(&next, coord);
45167 + next.unit_pos++;
45168 + unit_key_by_coord(&next, new_first);
45169 + }
45170 +
45171 + size = item_length_by_coord(coord);
45172 + if (count == (unsigned)units(coord)) {
45173 + return size;
45174 + }
45175 +
45176 + header_from = (char *)header_at(coord, (int)from);
45177 + header_to = (char *)header_at(coord, (int)(from + count));
45178 +
45179 + entry_from = (char *)entry_at(coord, (int)from);
45180 + entry_to = (char *)entry_at(coord, (int)(from + count));
45181 +
45182 + /* move headers */
45183 + memmove(header_from, header_to,
45184 + (unsigned)(address(coord, size) - header_to));
45185 +
45186 + header_delta = header_to - header_from;
45187 +
45188 + entry_from -= header_delta;
45189 + entry_to -= header_delta;
45190 + size -= header_delta;
45191 +
45192 + /* copy entries */
45193 + memmove(entry_from, entry_to,
45194 + (unsigned)(address(coord, size) - entry_to));
45195 +
45196 + entry_delta = entry_to - entry_from;
45197 + size -= entry_delta;
45198 +
45199 + /* update offsets */
45200 +
45201 + for (i = 0; i < (int)from; ++i)
45202 + adj_offset(coord, i, -header_delta);
45203 +
45204 + for (i = from; i < units(coord) - (int)count; ++i)
45205 + adj_offset(coord, i, -header_delta - entry_delta);
45206 +
45207 + put_unaligned(cpu_to_le16((__u16) units(coord) - count),
45208 + &formatted_at(coord)->num_of_entries);
45209 +
45210 + if (from == 0) {
45211 + /* entries from head were removed - move remaining to right */
45212 + memmove((char *)item_body_by_coord(coord) +
45213 + header_delta + entry_delta, item_body_by_coord(coord),
45214 + (unsigned)size);
45215 + if (REISER4_DEBUG)
45216 + memset(item_body_by_coord(coord), 0,
45217 + (unsigned)header_delta + entry_delta);
45218 + } else {
45219 + /* freed space is already at the end of item */
45220 + if (REISER4_DEBUG)
45221 + memset((char *)item_body_by_coord(coord) + size, 0,
45222 + (unsigned)header_delta + entry_delta);
45223 + }
45224 +
45225 + return header_delta + entry_delta;
45226 +}
45227 +
45228 +int kill_units_cde(coord_t * coord /* coord of item */ ,
45229 + pos_in_node_t from /* start unit pos */ ,
45230 + pos_in_node_t to /* stop unit pos */ ,
45231 + struct carry_kill_data *kdata UNUSED_ARG,
45232 + reiser4_key * smallest_removed, reiser4_key * new_first)
45233 +{
45234 + return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
45235 +}
45236 +
45237 +/* ->s.dir.extract_key() method for this item plugin. */
45238 +int extract_key_cde(const coord_t * coord /* coord of item */ ,
45239 + reiser4_key * key /* resulting key */ )
45240 +{
45241 + directory_entry_format *dent;
45242 +
45243 + assert("nikita-1155", coord != NULL);
45244 + assert("nikita-1156", key != NULL);
45245 +
45246 + dent = entry_at(coord, idx_of(coord));
45247 + return extract_key_from_id(&dent->id, key);
45248 +}
45249 +
45250 +int
45251 +update_key_cde(const coord_t * coord, const reiser4_key * key,
45252 + lock_handle * lh UNUSED_ARG)
45253 +{
45254 + directory_entry_format *dent;
45255 + obj_key_id obj_id;
45256 + int result;
45257 +
45258 + assert("nikita-2344", coord != NULL);
45259 + assert("nikita-2345", key != NULL);
45260 +
45261 + dent = entry_at(coord, idx_of(coord));
45262 + result = build_obj_key_id(key, &obj_id);
45263 + if (result == 0) {
45264 + dent->id = obj_id;
45265 + znode_make_dirty(coord->node);
45266 + }
45267 + return 0;
45268 +}
45269 +
45270 +/* ->s.dir.extract_name() method for this item plugin. */
45271 +char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
45272 +{
45273 + directory_entry_format *dent;
45274 +
45275 + assert("nikita-1157", coord != NULL);
45276 +
45277 + dent = entry_at(coord, idx_of(coord));
45278 + return extract_dent_name(coord, dent, buf);
45279 +}
45280 +
45281 +static int cde_bytes(int pasting, const reiser4_item_data * data)
45282 +{
45283 + int result;
45284 +
45285 + result = data->length;
45286 + if (!pasting)
45287 + result -= sizeof(cde_item_format);
45288 + return result;
45289 +}
45290 +
45291 +/* ->s.dir.add_entry() method for this item plugin */
45292 +int add_entry_cde(struct inode *dir /* directory object */ ,
45293 + coord_t * coord /* coord of item */ ,
45294 + lock_handle * lh /* lock handle for insertion */ ,
45295 + const struct dentry *name /* name to insert */ ,
45296 + reiser4_dir_entry_desc * dir_entry /* parameters of new
45297 + * directory entry */ )
45298 +{
45299 + reiser4_item_data data;
45300 + cde_entry entry;
45301 + cde_entry_data edata;
45302 + int result;
45303 +
45304 + assert("nikita-1656", coord->node == lh->node);
45305 + assert("nikita-1657", znode_is_write_locked(coord->node));
45306 +
45307 + edata.num_of_entries = 1;
45308 + edata.entry = &entry;
45309 +
45310 + entry.dir = dir;
45311 + entry.obj = dir_entry->obj;
45312 + entry.name = &name->d_name;
45313 +
45314 + data.data = (char *)&edata;
45315 + data.user = 0; /* &edata is not user space */
45316 + data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
45317 + data.arg = dir_entry;
45318 + assert("nikita-1302", data.iplug != NULL);
45319 +
45320 + result = is_dot_key(&dir_entry->key);
45321 + data.length = estimate_cde(result ? coord : NULL, &data);
45322 +
45323 + /* NOTE-NIKITA quota plugin? */
45324 + if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
45325 + return RETERR(-EDQUOT);
45326 +
45327 + if (result)
45328 + result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
45329 + else
45330 + result = reiser4_resize_item(coord, &data, &dir_entry->key,
45331 + lh, 0);
45332 + return result;
45333 +}
45334 +
45335 +/* ->s.dir.rem_entry() */
45336 +int rem_entry_cde(struct inode *dir /* directory of item */ ,
45337 + const struct qstr *name, coord_t * coord /* coord of item */ ,
45338 + lock_handle * lh UNUSED_ARG /* lock handle for
45339 + * removal */ ,
45340 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
45341 + * directory entry
45342 + * being removed */ )
45343 +{
45344 + coord_t shadow;
45345 + int result;
45346 + int length;
45347 + ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
45348 +
45349 + assert("nikita-2870", strlen(name->name) == name->len);
45350 + assert("nikita-2869",
45351 + !strcmp(name->name, extract_name_cde(coord, buf)));
45352 +
45353 + length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
45354 + if (is_longname(name->name, name->len))
45355 + length += name->len + 1;
45356 +
45357 + if (inode_get_bytes(dir) < length) {
45358 + warning("nikita-2628", "Dir is broke: %llu: %llu",
45359 + (unsigned long long)get_inode_oid(dir),
45360 + inode_get_bytes(dir));
45361 +
45362 + return RETERR(-EIO);
45363 + }
45364 +
45365 + /* cut_node() is supposed to take pointers to _different_
45366 + coords, because it will modify them without respect to
45367 + possible aliasing. To work around this, create temporary copy
45368 + of @coord.
45369 + */
45370 + coord_dup(&shadow, coord);
45371 + result =
45372 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
45373 + if (result == 0) {
45374 + /* NOTE-NIKITA quota plugin? */
45375 + DQUOT_FREE_SPACE_NODIRTY(dir, length);
45376 + }
45377 + return result;
45378 +}
45379 +
45380 +/* ->s.dir.max_name_len() method for this item plugin */
45381 +int max_name_len_cde(const struct inode *dir /* directory */ )
45382 +{
45383 + return
45384 + reiser4_tree_by_inode(dir)->nplug->max_item_size() -
45385 + sizeof(directory_entry_format) - sizeof(cde_item_format) -
45386 + sizeof(cde_unit_header) - 2;
45387 +}
45388 +
45389 +/* Make Linus happy.
45390 + Local variables:
45391 + c-indentation-style: "K&R"
45392 + mode-name: "LC"
45393 + c-basic-offset: 8
45394 + tab-width: 8
45395 + fill-column: 120
45396 + End:
45397 +*/
45398 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/cde.h linux-2.6.20/fs/reiser4/plugin/item/cde.h
45399 --- linux-2.6.20.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 03:00:00.000000000 +0300
45400 +++ linux-2.6.20/fs/reiser4/plugin/item/cde.h 2007-05-06 14:50:43.803008220 +0400
45401 @@ -0,0 +1,87 @@
45402 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45403 +
45404 +/* Compound directory item. See cde.c for description. */
45405 +
45406 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
45407 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
45408 +
45409 +#include "../../forward.h"
45410 +#include "../../kassign.h"
45411 +#include "../../dformat.h"
45412 +
45413 +#include <linux/fs.h> /* for struct inode */
45414 +#include <linux/dcache.h> /* for struct dentry, etc */
45415 +
45416 +typedef struct cde_unit_header {
45417 + de_id hash;
45418 + d16 offset;
45419 +} cde_unit_header;
45420 +
45421 +typedef struct cde_item_format {
45422 + d16 num_of_entries;
45423 + cde_unit_header entry[0];
45424 +} cde_item_format;
45425 +
45426 +typedef struct cde_entry {
45427 + const struct inode *dir;
45428 + const struct inode *obj;
45429 + const struct qstr *name;
45430 +} cde_entry;
45431 +
45432 +typedef struct cde_entry_data {
45433 + int num_of_entries;
45434 + cde_entry *entry;
45435 +} cde_entry_data;
45436 +
45437 +/* plugin->item.b.* */
45438 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
45439 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
45440 + const reiser4_item_data *);
45441 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
45442 +pos_in_node_t nr_units_cde(const coord_t * coord);
45443 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
45444 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
45445 +void print_cde(const char *prefix, coord_t * coord);
45446 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
45447 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
45448 + coord_t * coord);
45449 +int paste_cde(coord_t * coord, reiser4_item_data * data,
45450 + carry_plugin_info * info UNUSED_ARG);
45451 +int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
45452 + shift_direction pend, unsigned *size, unsigned want);
45453 +void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
45454 + unsigned count, shift_direction where_is_free_space,
45455 + unsigned free_space);
45456 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45457 + struct carry_cut_data *, reiser4_key * smallest_removed,
45458 + reiser4_key * new_first);
45459 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45460 + struct carry_kill_data *, reiser4_key * smallest_removed,
45461 + reiser4_key * new_first);
45462 +void print_cde(const char *prefix, coord_t * coord);
45463 +int reiser4_check_cde(const coord_t * coord, const char **error);
45464 +
45465 +/* plugin->u.item.s.dir.* */
45466 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
45467 +int update_key_cde(const coord_t * coord, const reiser4_key * key,
45468 + lock_handle * lh);
45469 +char *extract_name_cde(const coord_t * coord, char *buf);
45470 +int add_entry_cde(struct inode *dir, coord_t * coord,
45471 + lock_handle * lh, const struct dentry *name,
45472 + reiser4_dir_entry_desc * entry);
45473 +int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
45474 + lock_handle * lh, reiser4_dir_entry_desc * entry);
45475 +int max_name_len_cde(const struct inode *dir);
45476 +
45477 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
45478 +#endif
45479 +
45480 +/* Make Linus happy.
45481 + Local variables:
45482 + c-indentation-style: "K&R"
45483 + mode-name: "LC"
45484 + c-basic-offset: 8
45485 + tab-width: 8
45486 + fill-column: 120
45487 + End:
45488 +*/
45489 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.20/fs/reiser4/plugin/item/ctail.c
45490 --- linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 03:00:00.000000000 +0300
45491 +++ linux-2.6.20/fs/reiser4/plugin/item/ctail.c 2007-05-06 14:50:43.803008220 +0400
45492 @@ -0,0 +1,1570 @@
45493 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45494 +
45495 +/* ctails (aka "clustered tails") are items for cryptcompress objects */
45496 +
45497 +/* DESCRIPTION:
45498 +
45499 +Each cryptcompress object is stored on disk as a set of clusters sliced
45500 +into ctails.
45501 +
45502 +Internal on-disk structure:
45503 +
45504 + HEADER (1) Here stored disk cluster shift
45505 + BODY
45506 +*/
45507 +
45508 +#include "../../forward.h"
45509 +#include "../../debug.h"
45510 +#include "../../dformat.h"
45511 +#include "../../kassign.h"
45512 +#include "../../key.h"
45513 +#include "../../coord.h"
45514 +#include "item.h"
45515 +#include "../node/node.h"
45516 +#include "../plugin.h"
45517 +#include "../object.h"
45518 +#include "../../znode.h"
45519 +#include "../../carry.h"
45520 +#include "../../tree.h"
45521 +#include "../../inode.h"
45522 +#include "../../super.h"
45523 +#include "../../context.h"
45524 +#include "../../page_cache.h"
45525 +#include "../cluster.h"
45526 +#include "../../flush.h"
45527 +#include "../../tree_walk.h"
45528 +
45529 +#include <linux/pagevec.h>
45530 +#include <linux/swap.h>
45531 +#include <linux/fs.h>
45532 +
45533 +/* return body of ctail item at @coord */
45534 +static ctail_item_format *ctail_formatted_at(const coord_t * coord)
45535 +{
45536 + assert("edward-60", coord != NULL);
45537 + return item_body_by_coord(coord);
45538 +}
45539 +
45540 +static int cluster_shift_by_coord(const coord_t * coord)
45541 +{
45542 + return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
45543 +}
45544 +
45545 +static inline void dclust_set_extension_shift(hint_t * hint)
45546 +{
45547 + assert("edward-1270",
45548 + item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
45549 + hint->ext_coord.extension.ctail.shift =
45550 + cluster_shift_by_coord(&hint->ext_coord.coord);
45551 +}
45552 +
45553 +static loff_t off_by_coord(const coord_t * coord)
45554 +{
45555 + reiser4_key key;
45556 + return get_key_offset(item_key_by_coord(coord, &key));
45557 +}
45558 +
45559 +int coord_is_unprepped_ctail(const coord_t * coord)
45560 +{
45561 + assert("edward-1233", coord != NULL);
45562 + assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
45563 + assert("edward-1235",
45564 + ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
45565 + nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
45566 +
45567 + return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
45568 +}
45569 +
45570 +static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
45571 +{
45572 + int shift;
45573 +
45574 + if (inode != NULL) {
45575 + shift = inode_cluster_shift(inode);
45576 + assert("edward-1236",
45577 + ergo(!coord_is_unprepped_ctail(coord),
45578 + shift == cluster_shift_by_coord(coord)));
45579 + } else {
45580 + assert("edward-1237", !coord_is_unprepped_ctail(coord));
45581 + shift = cluster_shift_by_coord(coord);
45582 + }
45583 + return off_by_coord(coord) >> shift;
45584 +}
45585 +
45586 +static int disk_cluster_size(const coord_t * coord)
45587 +{
45588 + assert("edward-1156",
45589 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
45590 + /* calculation of disk cluster size
45591 + is meaningless if ctail is unprepped */
45592 + assert("edward-1238", !coord_is_unprepped_ctail(coord));
45593 +
45594 + return 1 << cluster_shift_by_coord(coord);
45595 +}
45596 +
45597 +/* true if the key is of first disk cluster item */
45598 +static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
45599 +{
45600 + assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
45601 +
45602 + return coord_is_unprepped_ctail(coord) ||
45603 + ((get_key_offset(key) &
45604 + ((loff_t) disk_cluster_size(coord) - 1)) == 0);
45605 +}
45606 +
45607 +static char *first_unit(coord_t * coord)
45608 +{
45609 + /* FIXME: warning: pointer of type `void *' used in arithmetic */
45610 + return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
45611 +}
45612 +
45613 +/* plugin->u.item.b.max_key_inside :
45614 + tail_max_key_inside */
45615 +
45616 +/* plugin->u.item.b.can_contain_key */
45617 +int
45618 +can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
45619 + const reiser4_item_data * data)
45620 +{
45621 + reiser4_key item_key;
45622 +
45623 + if (item_plugin_by_coord(coord) != data->iplug)
45624 + return 0;
45625 +
45626 + item_key_by_coord(coord, &item_key);
45627 + if (get_key_locality(key) != get_key_locality(&item_key) ||
45628 + get_key_objectid(key) != get_key_objectid(&item_key))
45629 + return 0;
45630 + if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
45631 + get_key_offset(key))
45632 + return 0;
45633 + if (is_disk_cluster_key(key, coord))
45634 + return 0;
45635 + return 1;
45636 +}
45637 +
45638 +/* plugin->u.item.b.mergeable
45639 + c-tails of different clusters are not mergeable */
45640 +int mergeable_ctail(const coord_t * p1, const coord_t * p2)
45641 +{
45642 + reiser4_key key1, key2;
45643 +
45644 + assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
45645 + assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
45646 + UNIX_FILE_METADATA_ITEM_TYPE));
45647 +
45648 + if (item_id_by_coord(p2) != CTAIL_ID) {
45649 + /* second item is of another type */
45650 + return 0;
45651 + }
45652 +
45653 + item_key_by_coord(p1, &key1);
45654 + item_key_by_coord(p2, &key2);
45655 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
45656 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
45657 + get_key_type(&key1) != get_key_type(&key2)) {
45658 + /* items of different objects */
45659 + return 0;
45660 + }
45661 + if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
45662 + /* not adjacent items */
45663 + return 0;
45664 + if (is_disk_cluster_key(&key2, p2))
45665 + return 0;
45666 + return 1;
45667 +}
45668 +
45669 +/* plugin->u.item.b.nr_units */
45670 +pos_in_node_t nr_units_ctail(const coord_t * coord)
45671 +{
45672 + return (item_length_by_coord(coord) -
45673 + sizeof(ctail_formatted_at(coord)->cluster_shift));
45674 +}
45675 +
45676 +/* plugin->u.item.b.estimate:
45677 + estimate how much space is needed to insert/paste @data->length bytes
45678 + into ctail at @coord */
45679 +int estimate_ctail(const coord_t * coord /* coord of item */ ,
45680 + const reiser4_item_data *
45681 + data /* parameters for new item */ )
45682 +{
45683 + if (coord == NULL)
45684 + /* insert */
45685 + return (sizeof(ctail_item_format) + data->length);
45686 + else
45687 + /* paste */
45688 + return data->length;
45689 +}
45690 +
45691 +/* ->init() method for this item plugin. */
45692 +int init_ctail(coord_t * to /* coord of item */ ,
45693 + coord_t * from /* old_item */ ,
45694 + reiser4_item_data * data /* structure used for insertion */ )
45695 +{
45696 + int cluster_shift; /* cpu value to convert */
45697 +
45698 + if (data) {
45699 + assert("edward-463", data->length > sizeof(ctail_item_format));
45700 + cluster_shift = *((int *)(data->arg));
45701 + data->length -= sizeof(ctail_item_format);
45702 + } else {
45703 + assert("edward-464", from != NULL);
45704 + assert("edward-855", ctail_ok(from));
45705 + cluster_shift = (int)(cluster_shift_by_coord(from));
45706 + }
45707 + put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
45708 + assert("edward-856", ctail_ok(to));
45709 + return 0;
45710 +}
45711 +
45712 +/* plugin->u.item.b.lookup:
45713 + NULL: We are looking for item keys only */
45714 +
45715 +#if REISER4_DEBUG
45716 +int ctail_ok(const coord_t * coord)
45717 +{
45718 + return coord_is_unprepped_ctail(coord) ||
45719 + cluster_shift_ok(cluster_shift_by_coord(coord));
45720 +}
45721 +
45722 +/* plugin->u.item.b.check */
45723 +int check_ctail(const coord_t * coord, const char **error)
45724 +{
45725 + if (!ctail_ok(coord)) {
45726 + if (error)
45727 + *error = "bad cluster shift in ctail";
45728 + return 1;
45729 + }
45730 + return 0;
45731 +}
45732 +#endif
45733 +
45734 +/* plugin->u.item.b.paste */
45735 +int
45736 +paste_ctail(coord_t * coord, reiser4_item_data * data,
45737 + carry_plugin_info * info UNUSED_ARG)
45738 +{
45739 + unsigned old_nr_units;
45740 +
45741 + assert("edward-268", data->data != NULL);
45742 + /* copy only from kernel space */
45743 + assert("edward-66", data->user == 0);
45744 +
45745 + old_nr_units =
45746 + item_length_by_coord(coord) - sizeof(ctail_item_format) -
45747 + data->length;
45748 +
45749 + /* ctail items never get pasted in the middle */
45750 +
45751 + if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
45752 +
45753 + /* paste at the beginning when create new item */
45754 + assert("edward-450",
45755 + item_length_by_coord(coord) ==
45756 + data->length + sizeof(ctail_item_format));
45757 + assert("edward-451", old_nr_units == 0);
45758 + } else if (coord->unit_pos == old_nr_units - 1
45759 + && coord->between == AFTER_UNIT) {
45760 +
45761 + /* paste at the end */
45762 + coord->unit_pos++;
45763 + } else
45764 + impossible("edward-453", "bad paste position");
45765 +
45766 + memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
45767 +
45768 + assert("edward-857", ctail_ok(coord));
45769 +
45770 + return 0;
45771 +}
45772 +
45773 +/* plugin->u.item.b.fast_paste */
45774 +
45775 +/* plugin->u.item.b.can_shift
45776 + number of units is returned via return value, number of bytes via @size. For
45777 + ctail items they coincide */
45778 +int
45779 +can_shift_ctail(unsigned free_space, coord_t * source,
45780 + znode * target, shift_direction direction UNUSED_ARG,
45781 + unsigned *size /* number of bytes */ , unsigned want)
45782 +{
45783 + /* make sure that we do not want to shift more than we have */
45784 + assert("edward-68", want > 0 && want <= nr_units_ctail(source));
45785 +
45786 + *size = min(want, free_space);
45787 +
45788 + if (!target) {
45789 + /* new item will be created */
45790 + if (*size <= sizeof(ctail_item_format)) {
45791 + *size = 0;
45792 + return 0;
45793 + }
45794 + return *size - sizeof(ctail_item_format);
45795 + }
45796 + return *size;
45797 +}
45798 +
45799 +/* plugin->u.item.b.copy_units
45800 + cooperates with ->can_shift() */
45801 +void
45802 +copy_units_ctail(coord_t * target, coord_t * source,
45803 + unsigned from, unsigned count /* units */ ,
45804 + shift_direction where_is_free_space,
45805 + unsigned free_space /* bytes */ )
45806 +{
45807 + /* make sure that item @target is expanded already */
45808 + assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
45809 + assert("edward-70", free_space == count || free_space == count + 1);
45810 +
45811 + assert("edward-858", ctail_ok(source));
45812 +
45813 + if (where_is_free_space == SHIFT_LEFT) {
45814 + /* append item @target with @count first bytes of @source:
45815 + this restriction came from ordinary tails */
45816 + assert("edward-71", from == 0);
45817 + assert("edward-860", ctail_ok(target));
45818 +
45819 + memcpy(first_unit(target) + nr_units_ctail(target) - count,
45820 + first_unit(source), count);
45821 + } else {
45822 + /* target item is moved to right already */
45823 + reiser4_key key;
45824 +
45825 + assert("edward-72", nr_units_ctail(source) == from + count);
45826 +
45827 + if (free_space == count) {
45828 + init_ctail(target, source, NULL);
45829 + } else {
45830 + /* new item has been created */
45831 + assert("edward-862", ctail_ok(target));
45832 + }
45833 + memcpy(first_unit(target), first_unit(source) + from, count);
45834 +
45835 + assert("edward-863", ctail_ok(target));
45836 +
45837 + /* new units are inserted before first unit in an item,
45838 + therefore, we have to update item key */
45839 + item_key_by_coord(source, &key);
45840 + set_key_offset(&key, get_key_offset(&key) + from);
45841 +
45842 + node_plugin_by_node(target->node)->update_item_key(target, &key,
45843 + NULL /*info */);
45844 + }
45845 +}
45846 +
45847 +/* plugin->u.item.b.create_hook */
45848 +int create_hook_ctail(const coord_t * coord, void *arg)
45849 +{
45850 + assert("edward-864", znode_is_loaded(coord->node));
45851 +
45852 + znode_set_convertible(coord->node);
45853 + return 0;
45854 +}
45855 +
45856 +/* plugin->u.item.b.kill_hook */
45857 +int
45858 +kill_hook_ctail(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
45859 + carry_kill_data * kdata)
45860 +{
45861 + struct inode *inode;
45862 +
45863 + assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
45864 + assert("edward-291", znode_is_write_locked(coord->node));
45865 +
45866 + inode = kdata->inode;
45867 + if (inode) {
45868 + reiser4_key key;
45869 + item_key_by_coord(coord, &key);
45870 +
45871 + if (from == 0 && is_disk_cluster_key(&key, coord)) {
45872 + /* disk cluster is killed */
45873 + cloff_t start =
45874 + off_to_clust(get_key_offset(&key), inode);
45875 + truncate_page_cluster_cryptcompress(inode, start,
45876 + kdata->params.truncate);
45877 + inode_sub_bytes(inode, inode_cluster_size(inode));
45878 + }
45879 + }
45880 + return 0;
45881 +}
45882 +
45883 +/* for shift_hook_ctail(),
45884 + return true if the first disk cluster item has dirty child
45885 +*/
45886 +static int ctail_convertible(const coord_t * coord)
45887 +{
45888 + int result;
45889 + reiser4_key key;
45890 + jnode *child = NULL;
45891 +
45892 + assert("edward-477", coord != NULL);
45893 + assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
45894 +
45895 + if (coord_is_unprepped_ctail(coord))
45896 + /* unprepped ctail should be converted */
45897 + return 1;
45898 +
45899 + item_key_by_coord(coord, &key);
45900 + child = jlookup(current_tree,
45901 + get_key_objectid(&key),
45902 + off_to_pg(off_by_coord(coord)));
45903 + if (!child)
45904 + return 0;
45905 + result = JF_ISSET(child, JNODE_DIRTY);
45906 + jput(child);
45907 + return result;
45908 +}
45909 +
45910 +/* FIXME-EDWARD */
45911 +/* plugin->u.item.b.shift_hook */
45912 +int shift_hook_ctail(const coord_t * item /* coord of item */ ,
45913 + unsigned from UNUSED_ARG /* start unit */ ,
45914 + unsigned count UNUSED_ARG /* stop unit */ ,
45915 + znode * old_node /* old parent */ )
45916 +{
45917 + assert("edward-479", item != NULL);
45918 + assert("edward-480", item->node != old_node);
45919 +
45920 + if (!znode_convertible(old_node) || znode_convertible(item->node))
45921 + return 0;
45922 + if (ctail_convertible(item))
45923 + znode_set_convertible(item->node);
45924 + return 0;
45925 +}
45926 +
45927 +static int
45928 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45929 + int cut, void *p, reiser4_key * smallest_removed,
45930 + reiser4_key * new_first)
45931 +{
45932 + pos_in_node_t count; /* number of units to cut */
45933 + char *item;
45934 +
45935 + count = to - from + 1;
45936 + item = item_body_by_coord(coord);
45937 +
45938 + assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
45939 +
45940 + if (smallest_removed) {
45941 + /* store smallest key removed */
45942 + item_key_by_coord(coord, smallest_removed);
45943 + set_key_offset(smallest_removed,
45944 + get_key_offset(smallest_removed) + from);
45945 + }
45946 +
45947 + if (new_first) {
45948 + assert("vs-1531", from == 0);
45949 +
45950 + item_key_by_coord(coord, new_first);
45951 + set_key_offset(new_first,
45952 + get_key_offset(new_first) + from + count);
45953 + }
45954 +
45955 + if (!cut)
45956 + kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
45957 +
45958 + if (from == 0) {
45959 + if (count != nr_units_ctail(coord)) {
45960 + /* part of item is removed, so move free space at the beginning
45961 + of the item and update item key */
45962 + reiser4_key key;
45963 + memcpy(item + to + 1, item, sizeof(ctail_item_format));
45964 + item_key_by_coord(coord, &key);
45965 + set_key_offset(&key, get_key_offset(&key) + count);
45966 + node_plugin_by_node(coord->node)->update_item_key(coord,
45967 + &key,
45968 + NULL);
45969 + } else {
45970 + /* cut_units should not be called to cut everything */
45971 + assert("vs-1532", ergo(cut, 0));
45972 + /* whole item is cut, so more than the amount of space occupied
45973 + by units is freed */
45974 + count += sizeof(ctail_item_format);
45975 + }
45976 + if (REISER4_DEBUG)
45977 + memset(item, 0, count);
45978 + } else if (REISER4_DEBUG)
45979 + memset(item + sizeof(ctail_item_format) + from, 0, count);
45980 + return count;
45981 +}
45982 +
45983 +/* plugin->u.item.b.cut_units */
45984 +int
45985 +cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45986 + carry_cut_data * cdata, reiser4_key * smallest_removed,
45987 + reiser4_key * new_first)
45988 +{
45989 + return cut_or_kill_ctail_units(item, from, to, 1, NULL,
45990 + smallest_removed, new_first);
45991 +}
45992 +
45993 +/* plugin->u.item.b.kill_units */
45994 +int
45995 +kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45996 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
45997 + reiser4_key * new_first)
45998 +{
45999 + return cut_or_kill_ctail_units(item, from, to, 0, kdata,
46000 + smallest_removed, new_first);
46001 +}
46002 +
46003 +/* plugin->u.item.s.file.read */
46004 +int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
46005 +{
46006 + uf_coord_t *uf_coord;
46007 + coord_t *coord;
46008 +
46009 + uf_coord = &hint->ext_coord;
46010 + coord = &uf_coord->coord;
46011 + assert("edward-127", f->user == 0);
46012 + assert("edward-129", coord && coord->node);
46013 + assert("edward-130", coord_is_existing_unit(coord));
46014 + assert("edward-132", znode_is_loaded(coord->node));
46015 +
46016 + /* start read only from the beginning of ctail */
46017 + assert("edward-133", coord->unit_pos == 0);
46018 + /* read only whole ctails */
46019 + assert("edward-135", nr_units_ctail(coord) <= f->length);
46020 +
46021 + assert("edward-136", reiser4_schedulable());
46022 + assert("edward-886", ctail_ok(coord));
46023 +
46024 + if (f->data)
46025 + memcpy(f->data, (char *)first_unit(coord),
46026 + (size_t) nr_units_ctail(coord));
46027 +
46028 + dclust_set_extension_shift(hint);
46029 + mark_page_accessed(znode_page(coord->node));
46030 + move_flow_forward(f, nr_units_ctail(coord));
46031 +
46032 + return 0;
46033 +}
46034 +
46035 +/* Reads a disk cluster consisting of ctail items,
46036 + attaches a transform stream with plain text */
46037 +int ctail_read_disk_cluster(reiser4_cluster_t * clust, struct inode *inode,
46038 + znode_lock_mode mode)
46039 +{
46040 + int result;
46041 + assert("edward-1450", mode == ZNODE_READ_LOCK || mode == ZNODE_WRITE_LOCK);
46042 + assert("edward-671", clust->hint != NULL);
46043 + assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
46044 + assert("edward-672", cryptcompress_inode_ok(inode));
46045 +
46046 + /* set input stream */
46047 + result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
46048 + if (result)
46049 + return result;
46050 +
46051 + result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
46052 + assert("edward-1340", !result);
46053 + if (result)
46054 + return result;
46055 + if (mode == ZNODE_READ_LOCK)
46056 + /* write still needs the lock to insert unprepped
46057 + items, etc... */
46058 + put_hint_cluster(clust, inode, ZNODE_READ_LOCK);
46059 +
46060 + if (clust->dstat == FAKE_DISK_CLUSTER ||
46061 + clust->dstat == UNPR_DISK_CLUSTER) {
46062 + tfm_cluster_set_uptodate(&clust->tc);
46063 + return 0;
46064 + }
46065 + result = grab_coa(&clust->tc, inode_compression_plugin(inode));
46066 + if (result)
46067 + return result;
46068 + result = reiser4_inflate_cluster(clust, inode);
46069 + if (result)
46070 + return result;
46071 + tfm_cluster_set_uptodate(&clust->tc);
46072 + return 0;
46073 +}
46074 +
46075 +/* read one locked page */
46076 +int do_readpage_ctail(struct inode * inode, reiser4_cluster_t * clust,
46077 + struct page *page, znode_lock_mode mode)
46078 +{
46079 + int ret;
46080 + unsigned cloff;
46081 + char *data;
46082 + size_t pgcnt;
46083 + tfm_cluster_t *tc = &clust->tc;
46084 +
46085 + assert("edward-212", PageLocked(page));
46086 +
46087 + if (PageUptodate(page))
46088 + goto exit;
46089 +
46090 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
46091 + clust->index = pg_to_clust(page->index, inode);
46092 + unlock_page(page);
46093 + ret = ctail_read_disk_cluster(clust, inode, mode);
46094 + lock_page(page);
46095 + if (ret)
46096 + return ret;
46097 + }
46098 + if (PageUptodate(page))
46099 + /* races with another read/write */
46100 + goto exit;
46101 +
46102 + /* bytes in the page */
46103 + pgcnt = cnt_to_pgcnt(i_size_read(inode), page->index);
46104 +
46105 + if (pgcnt == 0) {
46106 + assert("edward-1290", 0);
46107 + return RETERR(-EINVAL);
46108 + }
46109 + assert("edward-119", tfm_cluster_is_uptodate(tc));
46110 +
46111 + switch (clust->dstat) {
46112 + case UNPR_DISK_CLUSTER:
46113 + assert("edward-1285", 0);
46114 +#if REISER4_DEBUG
46115 + warning("edward-1168",
46116 + "page %lu is not uptodate and disk cluster %lu (inode %llu) is unprepped\n",
46117 + page->index, clust->index,
46118 + (unsigned long long)get_inode_oid(inode));
46119 +#endif
46120 + case FAKE_DISK_CLUSTER:
46121 + /* fill the page by zeroes */
46122 + data = kmap_atomic(page, KM_USER0);
46123 +
46124 + memset(data, 0, PAGE_CACHE_SIZE);
46125 + flush_dcache_page(page);
46126 + kunmap_atomic(data, KM_USER0);
46127 + SetPageUptodate(page);
46128 + break;
46129 + case PREP_DISK_CLUSTER:
46130 + /* fill the page by transformed data */
46131 + assert("edward-1058", !PageUptodate(page));
46132 + assert("edward-120", tc->len <= inode_cluster_size(inode));
46133 +
46134 + /* start page offset in the cluster */
46135 + cloff = pg_to_off_to_cloff(page->index, inode);
46136 +
46137 + data = kmap(page);
46138 + memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, pgcnt);
46139 + memset(data + pgcnt, 0, (size_t) PAGE_CACHE_SIZE - pgcnt);
46140 + flush_dcache_page(page);
46141 + kunmap(page);
46142 + SetPageUptodate(page);
46143 + break;
46144 + default:
46145 + impossible("edward-1169", "bad disk cluster state");
46146 + }
46147 + exit:
46148 + return 0;
46149 +}
46150 +
46151 +/* plugin->u.item.s.file.readpage */
46152 +int readpage_ctail(void *vp, struct page *page)
46153 +{
46154 + int result;
46155 + hint_t *hint;
46156 + reiser4_cluster_t *clust = vp;
46157 +
46158 + assert("edward-114", clust != NULL);
46159 + assert("edward-115", PageLocked(page));
46160 + assert("edward-116", !PageUptodate(page));
46161 + assert("edward-117", !jprivate(page) && !PagePrivate(page));
46162 + assert("edward-118", page->mapping && page->mapping->host);
46163 + assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
46164 +
46165 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
46166 + if (hint == NULL) {
46167 + unlock_page(page);
46168 + return RETERR(-ENOMEM);
46169 + }
46170 + clust->hint = hint;
46171 + result = load_file_hint(clust->file, hint);
46172 + if (result) {
46173 + kfree(hint);
46174 + unlock_page(page);
46175 + return result;
46176 + }
46177 + assert("vs-25", hint->ext_coord.lh == &hint->lh);
46178 + result = do_readpage_ctail(page->mapping->host, clust, page,
46179 + ZNODE_READ_LOCK);
46180 +
46181 + assert("edward-213", PageLocked(page));
46182 + assert("edward-1163", ergo(!result, PageUptodate(page)));
46183 + assert("edward-868",
46184 + ergo(!result, tfm_cluster_is_uptodate(&clust->tc)));
46185 +
46186 + unlock_page(page);
46187 + done_lh(&hint->lh);
46188 + hint->ext_coord.valid = 0;
46189 + save_file_hint(clust->file, hint);
46190 + kfree(hint);
46191 + tfm_cluster_clr_uptodate(&clust->tc);
46192 +
46193 + return result;
46194 +}
46195 +
46196 +/* Helper function for ->readpages() */
46197 +static int
46198 +ctail_read_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
46199 +{
46200 + int i;
46201 + int result;
46202 + assert("edward-779", clust != NULL);
46203 + assert("edward-1059", clust->win == NULL);
46204 + assert("edward-780", inode != NULL);
46205 +
46206 + result = prepare_page_cluster(inode, clust, 0 /* do not capture */ );
46207 + if (result)
46208 + return result;
46209 + result = ctail_read_disk_cluster(clust, inode, ZNODE_READ_LOCK);
46210 + if (result)
46211 + goto out;
46212 + /* at this point stream with valid plain text is attached */
46213 + assert("edward-781", tfm_cluster_is_uptodate(&clust->tc));
46214 +
46215 + for (i = 0; i < clust->nr_pages; i++) {
46216 + struct page *page = clust->pages[i];
46217 + lock_page(page);
46218 + result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
46219 + unlock_page(page);
46220 + if (result)
46221 + break;
46222 + }
46223 + tfm_cluster_clr_uptodate(&clust->tc);
46224 + out:
46225 + reiser4_release_cluster_pages(clust);
46226 + return result;
46227 +}
46228 +
46229 +/* filler for read_cache_pages() */
46230 +static int ctail_readpages_filler(void * data, struct page * page)
46231 +{
46232 + int ret = 0;
46233 + reiser4_cluster_t * clust = data;
46234 + struct inode * inode = clust->file->f_dentry->d_inode;
46235 +
46236 + if (PageUptodate(page)) {
46237 + unlock_page(page);
46238 + return 0;
46239 + }
46240 + unlock_page(page);
46241 + move_cluster_forward(clust, inode, page->index);
46242 + ret = ctail_read_page_cluster(clust, inode);
46243 + if (ret)
46244 + return ret;
46245 + assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
46246 +
46247 + lock_page(page);
46248 + ret = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
46249 + assert("edward-1061", ergo(!ret, PageUptodate(page)));
46250 + unlock_page(page);
46251 +
46252 + return ret;
46253 +}
46254 +
46255 +/* We populate a bit more than upper readahead suggests:
46256 + with each nominated page we read the whole page cluster
46257 + this page belongs to. */
46258 +int readpages_ctail(struct file *file, struct address_space *mapping,
46259 + struct list_head *pages)
46260 +{
46261 + int ret = 0;
46262 + hint_t *hint;
46263 + reiser4_cluster_t clust;
46264 + struct inode *inode = mapping->host;
46265 +
46266 + assert("edward-1521", inode == file->f_dentry->d_inode);
46267 +
46268 + cluster_init_read(&clust, NULL);
46269 + clust.file = file;
46270 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
46271 + if (hint == NULL) {
46272 + warning("vs-28", "failed to allocate hint");
46273 + ret = RETERR(-ENOMEM);
46274 + goto exit1;
46275 + }
46276 + clust.hint = hint;
46277 + ret = load_file_hint(clust.file, hint);
46278 + if (ret) {
46279 + warning("edward-1522", "failed to load hint");
46280 + goto exit2;
46281 + }
46282 + assert("vs-26", hint->ext_coord.lh == &hint->lh);
46283 + ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
46284 + if (ret) {
46285 + warning("edward-1523", "failed to alloc pgset");
46286 + goto exit3;
46287 + }
46288 + ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
46289 +
46290 + assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
46291 + exit3:
46292 + done_lh(&hint->lh);
46293 + save_file_hint(file, hint);
46294 + hint->ext_coord.valid = 0;
46295 + exit2:
46296 + kfree(hint);
46297 + exit1:
46298 + put_cluster_handle(&clust);
46299 + return ret;
46300 +}
46301 +
46302 +/*
46303 + plugin->u.item.s.file.append_key
46304 + key of the first item of the next disk cluster
46305 +*/
46306 +reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
46307 +{
46308 + assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
46309 + assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
46310 +
46311 + item_key_by_coord(coord, key);
46312 + set_key_offset(key,
46313 + ((__u64) (clust_by_coord(coord, NULL)) +
46314 + 1) << cluster_shift_by_coord(coord));
46315 + return key;
46316 +}
46317 +
46318 +static int
46319 +insert_unprepped_ctail(reiser4_cluster_t * clust, struct inode *inode)
46320 +{
46321 + int result;
46322 + char buf[UCTAIL_NR_UNITS];
46323 + reiser4_item_data data;
46324 + reiser4_key key;
46325 + int shift = (int)UCTAIL_SHIFT;
46326 +
46327 + memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
46328 + result = key_by_inode_cryptcompress(inode,
46329 + clust_to_off(clust->index, inode),
46330 + &key);
46331 + if (result)
46332 + return result;
46333 + data.user = 0;
46334 + data.iplug = item_plugin_by_id(CTAIL_ID);
46335 + data.arg = &shift;
46336 + data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
46337 + data.data = buf;
46338 +
46339 + result = insert_by_coord(&clust->hint->ext_coord.coord,
46340 + &data, &key, clust->hint->ext_coord.lh, 0);
46341 + return result;
46342 +}
46343 +
46344 +static int
46345 +insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
46346 + struct inode *inode)
46347 +{
46348 + int result;
46349 + carry_pool *pool;
46350 + carry_level *lowest_level;
46351 + reiser4_item_data *data;
46352 + carry_op *op;
46353 + int cluster_shift = inode_cluster_shift(inode);
46354 +
46355 + pool =
46356 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
46357 + sizeof(*data));
46358 + if (IS_ERR(pool))
46359 + return PTR_ERR(pool);
46360 + lowest_level = (carry_level *) (pool + 1);
46361 + init_carry_level(lowest_level, pool);
46362 + data = (reiser4_item_data *) (lowest_level + 3);
46363 +
46364 + assert("edward-466", coord->between == AFTER_ITEM
46365 + || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
46366 + || coord->between == EMPTY_NODE
46367 + || coord->between == BEFORE_UNIT);
46368 +
46369 + if (coord->between == AFTER_UNIT) {
46370 + coord->unit_pos = 0;
46371 + coord->between = AFTER_ITEM;
46372 + }
46373 + op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
46374 + 0 /* operate directly on coord -> node */);
46375 + if (IS_ERR(op) || (op == NULL)) {
46376 + done_carry_pool(pool);
46377 + return RETERR(op ? PTR_ERR(op) : -EIO);
46378 + }
46379 + data->user = 0;
46380 + data->iplug = item_plugin_by_id(CTAIL_ID);
46381 + data->arg = &cluster_shift;
46382 +
46383 + data->length = 0;
46384 + data->data = NULL;
46385 +
46386 + op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
46387 + op->u.insert_flow.insert_point = coord;
46388 + op->u.insert_flow.flow = f;
46389 + op->u.insert_flow.data = data;
46390 + op->u.insert_flow.new_nodes = 0;
46391 +
46392 + lowest_level->track_type = CARRY_TRACK_CHANGE;
46393 + lowest_level->tracked = lh;
46394 +
46395 + result = reiser4_carry(lowest_level, NULL);
46396 + done_carry_pool(pool);
46397 +
46398 + return result;
46399 +}
46400 +
46401 +/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
46402 +static int insert_cryptcompress_flow_in_place(coord_t * coord,
46403 + lock_handle * lh, flow_t * f,
46404 + struct inode *inode)
46405 +{
46406 + int ret;
46407 + coord_t pos;
46408 + lock_handle lock;
46409 +
46410 + assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
46411 + assert("edward-484", coord->between == AT_UNIT
46412 + || coord->between == AFTER_ITEM);
46413 + assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
46414 +
46415 + coord_dup(&pos, coord);
46416 + pos.unit_pos = 0;
46417 + pos.between = AFTER_ITEM;
46418 +
46419 + init_lh(&lock);
46420 + copy_lh(&lock, lh);
46421 +
46422 + ret = insert_cryptcompress_flow(&pos, &lock, f, inode);
46423 + done_lh(&lock);
46424 + assert("edward-1347", znode_is_write_locked(lh->node));
46425 + assert("edward-1228", !ret);
46426 + return ret;
46427 +}
46428 +
46429 +/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
46430 +static int overwrite_ctail(coord_t * coord, flow_t * f)
46431 +{
46432 + unsigned count;
46433 +
46434 + assert("edward-269", f->user == 0);
46435 + assert("edward-270", f->data != NULL);
46436 + assert("edward-271", f->length > 0);
46437 + assert("edward-272", coord_is_existing_unit(coord));
46438 + assert("edward-273", coord->unit_pos == 0);
46439 + assert("edward-274", znode_is_write_locked(coord->node));
46440 + assert("edward-275", reiser4_schedulable());
46441 + assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
46442 + assert("edward-1243", ctail_ok(coord));
46443 +
46444 + count = nr_units_ctail(coord);
46445 +
46446 + if (count > f->length)
46447 + count = f->length;
46448 + memcpy(first_unit(coord), f->data, count);
46449 + move_flow_forward(f, count);
46450 + coord->unit_pos += count;
46451 + return 0;
46452 +}
46453 +
46454 +/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
46455 + cut ctail (part or whole) starting from next unit position */
46456 +static int cut_ctail(coord_t * coord)
46457 +{
46458 + coord_t stop;
46459 +
46460 + assert("edward-435", coord->between == AT_UNIT &&
46461 + coord->item_pos < coord_num_items(coord) &&
46462 + coord->unit_pos <= coord_num_units(coord));
46463 +
46464 + if (coord->unit_pos == coord_num_units(coord))
46465 + /* nothing to cut */
46466 + return 0;
46467 + coord_dup(&stop, coord);
46468 + stop.unit_pos = coord_last_unit_pos(coord);
46469 +
46470 + return cut_node_content(coord, &stop, NULL, NULL, NULL);
46471 +}
46472 +
46473 +int
46474 +ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
46475 +{
46476 + int result;
46477 + assert("edward-1244", inode != NULL);
46478 + assert("edward-1245", clust->hint != NULL);
46479 + assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
46480 + assert("edward-1247", clust->reserved == 1);
46481 +
46482 + result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
46483 + if (cbk_errored(result))
46484 + return result;
46485 + assert("edward-1249", result == CBK_COORD_NOTFOUND);
46486 + assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
46487 +
46488 + assert("edward-1295",
46489 + clust->hint->ext_coord.lh->node ==
46490 + clust->hint->ext_coord.coord.node);
46491 +
46492 + coord_set_between_clusters(&clust->hint->ext_coord.coord);
46493 +
46494 + result = insert_unprepped_ctail(clust, inode);
46495 + all_grabbed2free();
46496 +
46497 + assert("edward-1251", !result);
46498 + assert("edward-1252", cryptcompress_inode_ok(inode));
46499 + assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
46500 + assert("edward-1254",
46501 + reiser4_clustered_blocks(reiser4_get_current_sb()));
46502 + assert("edward-1255",
46503 + znode_convertible(clust->hint->ext_coord.coord.node));
46504 +
46505 + return result;
46506 +}
46507 +
46508 +static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
46509 +{
46510 + int result = 0;
46511 + convert_item_info_t *info;
46512 +
46513 + assert("edward-468", pos != NULL);
46514 + assert("edward-469", pos->sq != NULL);
46515 + assert("edward-845", item_convert_data(pos) != NULL);
46516 +
46517 + info = item_convert_data(pos);
46518 + assert("edward-679", info->flow.data != NULL);
46519 +
46520 + switch (mode) {
46521 + case CRC_APPEND_ITEM:
46522 + assert("edward-1229", info->flow.length != 0);
46523 + assert("edward-1256",
46524 + cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
46525 + result =
46526 + insert_cryptcompress_flow_in_place(&pos->coord,
46527 + &pos->lock,
46528 + &info->flow,
46529 + info->inode);
46530 + break;
46531 + case CRC_OVERWRITE_ITEM:
46532 + assert("edward-1230", info->flow.length != 0);
46533 + overwrite_ctail(&pos->coord, &info->flow);
46534 + if (info->flow.length != 0)
46535 + break;
46536 + case CRC_CUT_ITEM:
46537 + assert("edward-1231", info->flow.length == 0);
46538 + result = cut_ctail(&pos->coord);
46539 + break;
46540 + default:
46541 + result = RETERR(-EIO);
46542 + impossible("edward-244", "bad convert mode");
46543 + }
46544 + return result;
46545 +}
46546 +
46547 +/* plugin->u.item.f.scan */
46548 +int scan_ctail(flush_scan * scan)
46549 +{
46550 + int result = 0;
46551 + struct page *page;
46552 + struct inode *inode;
46553 + jnode *node = scan->node;
46554 +
46555 + assert("edward-227", scan->node != NULL);
46556 + assert("edward-228", jnode_is_cluster_page(scan->node));
46557 + assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
46558 +
46559 + page = jnode_page(node);
46560 + inode = page->mapping->host;
46561 +
46562 + if (!reiser4_scanning_left(scan))
46563 + return result;
46564 + if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
46565 + znode_make_dirty(scan->parent_lock.node);
46566 +
46567 + if (!znode_convertible(scan->parent_lock.node)) {
46568 + if (JF_ISSET(scan->node, JNODE_DIRTY))
46569 + znode_set_convertible(scan->parent_lock.node);
46570 + else {
46571 + warning("edward-681",
46572 + "cluster page is already processed");
46573 + return -EAGAIN;
46574 + }
46575 + }
46576 + return result;
46577 +}
46578 +
46579 +/* If true, this function attaches children */
46580 +static int should_attach_convert_idata(flush_pos_t * pos)
46581 +{
46582 + int result;
46583 + assert("edward-431", pos != NULL);
46584 + assert("edward-432", pos->child == NULL);
46585 + assert("edward-619", znode_is_write_locked(pos->coord.node));
46586 + assert("edward-470",
46587 + item_plugin_by_coord(&pos->coord) ==
46588 + item_plugin_by_id(CTAIL_ID));
46589 +
46590 + /* check for leftmost child */
46591 + utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
46592 +
46593 + if (!pos->child)
46594 + return 0;
46595 + spin_lock_jnode(pos->child);
46596 + result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
46597 + pos->child->atom == ZJNODE(pos->coord.node)->atom);
46598 + spin_unlock_jnode(pos->child);
46599 + if (!result && pos->child) {
46600 + /* existing child isn't to attach, clear up this one */
46601 + jput(pos->child);
46602 + pos->child = NULL;
46603 + }
46604 + return result;
46605 +}
46606 +
46607 +/* plugin->init_convert_data() */
46608 +static int
46609 +init_convert_data_ctail(convert_item_info_t * idata, struct inode *inode)
46610 +{
46611 + assert("edward-813", idata != NULL);
46612 + assert("edward-814", inode != NULL);
46613 +
46614 + idata->inode = inode;
46615 + idata->d_cur = DC_FIRST_ITEM;
46616 + idata->d_next = DC_INVALID_STATE;
46617 +
46618 + return 0;
46619 +}
46620 +
46621 +static int alloc_item_convert_data(convert_info_t * sq)
46622 +{
46623 + assert("edward-816", sq != NULL);
46624 + assert("edward-817", sq->itm == NULL);
46625 +
46626 + sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
46627 + if (sq->itm == NULL)
46628 + return RETERR(-ENOMEM);
46629 + return 0;
46630 +}
46631 +
46632 +static void free_item_convert_data(convert_info_t * sq)
46633 +{
46634 + assert("edward-818", sq != NULL);
46635 + assert("edward-819", sq->itm != NULL);
46636 + assert("edward-820", sq->iplug != NULL);
46637 +
46638 + kfree(sq->itm);
46639 + sq->itm = NULL;
46640 + return;
46641 +}
46642 +
46643 +static int alloc_convert_data(flush_pos_t * pos)
46644 +{
46645 + assert("edward-821", pos != NULL);
46646 + assert("edward-822", pos->sq == NULL);
46647 +
46648 + pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
46649 + if (!pos->sq)
46650 + return RETERR(-ENOMEM);
46651 + memset(pos->sq, 0, sizeof(*pos->sq));
46652 + cluster_init_write(&pos->sq->clust, 0);
46653 + return 0;
46654 +}
46655 +
46656 +void free_convert_data(flush_pos_t * pos)
46657 +{
46658 + convert_info_t *sq;
46659 +
46660 + assert("edward-823", pos != NULL);
46661 + assert("edward-824", pos->sq != NULL);
46662 +
46663 + sq = pos->sq;
46664 + if (sq->itm)
46665 + free_item_convert_data(sq);
46666 + put_cluster_handle(&sq->clust);
46667 + kfree(pos->sq);
46668 + pos->sq = NULL;
46669 + return;
46670 +}
46671 +
46672 +static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
46673 +{
46674 + convert_info_t *sq;
46675 +
46676 + assert("edward-825", pos != NULL);
46677 + assert("edward-826", pos->sq != NULL);
46678 + assert("edward-827", item_convert_data(pos) != NULL);
46679 + assert("edward-828", inode != NULL);
46680 +
46681 + sq = pos->sq;
46682 +
46683 + memset(sq->itm, 0, sizeof(*sq->itm));
46684 +
46685 + /* iplug->init_convert_data() */
46686 + return init_convert_data_ctail(sq->itm, inode);
46687 +}
46688 +
46689 +/* create and attach disk cluster info used by 'convert' phase of the flush
46690 + squalloc() */
46691 +static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
46692 +{
46693 + int ret = 0;
46694 + convert_item_info_t *info;
46695 + reiser4_cluster_t *clust;
46696 + file_plugin *fplug = inode_file_plugin(inode);
46697 + compression_plugin *cplug = inode_compression_plugin(inode);
46698 +
46699 + assert("edward-248", pos != NULL);
46700 + assert("edward-249", pos->child != NULL);
46701 + assert("edward-251", inode != NULL);
46702 + assert("edward-682", cryptcompress_inode_ok(inode));
46703 + assert("edward-252",
46704 + fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
46705 + assert("edward-473",
46706 + item_plugin_by_coord(&pos->coord) ==
46707 + item_plugin_by_id(CTAIL_ID));
46708 +
46709 + if (!pos->sq) {
46710 + ret = alloc_convert_data(pos);
46711 + if (ret)
46712 + return ret;
46713 + }
46714 + clust = &pos->sq->clust;
46715 + ret = grab_coa(&clust->tc, cplug);
46716 + if (ret)
46717 + goto err;
46718 + ret = set_cluster_by_page(clust,
46719 + jnode_page(pos->child),
46720 + MAX_CLUSTER_NRPAGES);
46721 + if (ret)
46722 + goto err;
46723 +
46724 + assert("edward-829", pos->sq != NULL);
46725 + assert("edward-250", item_convert_data(pos) == NULL);
46726 +
46727 + pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
46728 +
46729 + ret = alloc_item_convert_data(pos->sq);
46730 + if (ret)
46731 + goto err;
46732 + ret = init_item_convert_data(pos, inode);
46733 + if (ret)
46734 + goto err;
46735 + info = item_convert_data(pos);
46736 +
46737 + ret = flush_cluster_pages(clust, pos->child, inode);
46738 + if (ret)
46739 + goto err;
46740 +
46741 + reiser4_deflate_cluster(clust, inode);
46742 + inc_item_convert_count(pos);
46743 +
46744 + /* make flow by transformed stream */
46745 + fplug->flow_by_inode(info->inode,
46746 + (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
46747 + 0 /* kernel space */ ,
46748 + clust->tc.len,
46749 + clust_to_off(clust->index, inode),
46750 + WRITE_OP, &info->flow);
46751 + jput(pos->child);
46752 +
46753 + assert("edward-683", cryptcompress_inode_ok(inode));
46754 + return 0;
46755 + err:
46756 + jput(pos->child);
46757 + free_convert_data(pos);
46758 + return ret;
46759 +}
46760 +
46761 +/* clear up disk cluster info */
46762 +static void detach_convert_idata(convert_info_t * sq)
46763 +{
46764 + convert_item_info_t *info;
46765 +
46766 + assert("edward-253", sq != NULL);
46767 + assert("edward-840", sq->itm != NULL);
46768 +
46769 + info = sq->itm;
46770 + assert("edward-255", info->inode != NULL);
46771 + assert("edward-1212", info->flow.length == 0);
46772 +
46773 + free_item_convert_data(sq);
46774 + return;
46775 +}
46776 +
46777 +/* plugin->u.item.f.utmost_child */
46778 +
46779 +/* This function sets leftmost child for a first cluster item,
46780 + if the child exists, and NULL in other cases.
46781 + NOTE-EDWARD: Do not call this for RIGHT_SIDE */
46782 +
46783 +int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
46784 +{
46785 + reiser4_key key;
46786 +
46787 + item_key_by_coord(coord, &key);
46788 +
46789 + assert("edward-257", coord != NULL);
46790 + assert("edward-258", child != NULL);
46791 + assert("edward-259", side == LEFT_SIDE);
46792 + assert("edward-260",
46793 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
46794 +
46795 + if (!is_disk_cluster_key(&key, coord))
46796 + *child = NULL;
46797 + else
46798 + *child = jlookup(current_tree,
46799 + get_key_objectid(item_key_by_coord
46800 + (coord, &key)),
46801 + off_to_pg(get_key_offset(&key)));
46802 + return 0;
46803 +}
46804 +
46805 +/* Returns true if @p2 is the next item to @p1
46806 + in the _same_ disk cluster.
46807 + Disk cluster is a set of items. If ->clustered() != NULL,
46808 + with each item the whole disk cluster should be read/modified
46809 +*/
46810 +static int clustered_ctail(const coord_t * p1, const coord_t * p2)
46811 +{
46812 + return mergeable_ctail(p1, p2);
46813 +}
46814 +
46815 +/* Go rightward and check for next disk cluster item, set
46816 + d_next to DC_CHAINED_ITEM, if the last one exists.
46817 + If the current position is last item, go to right neighbor.
46818 + Skip empty nodes. Note, that right neighbors may be not in
46819 + the slum because of races. If so, make it dirty and
46820 + convertible.
46821 +*/
46822 +static int next_item_dc_stat(flush_pos_t * pos)
46823 +{
46824 + int ret = 0;
46825 + int stop = 0;
46826 + znode *cur;
46827 + coord_t coord;
46828 + lock_handle lh;
46829 + lock_handle right_lock;
46830 +
46831 + assert("edward-1232", !node_is_empty(pos->coord.node));
46832 + assert("edward-1014",
46833 + pos->coord.item_pos < coord_num_items(&pos->coord));
46834 + assert("edward-1015", chaining_data_present(pos));
46835 + assert("edward-1017",
46836 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
46837 +
46838 + item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
46839 +
46840 + if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
46841 + return ret;
46842 + if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
46843 + return ret;
46844 +
46845 + /* check next slum item */
46846 + init_lh(&right_lock);
46847 + cur = pos->coord.node;
46848 +
46849 + while (!stop) {
46850 + init_lh(&lh);
46851 + ret = reiser4_get_right_neighbor(&lh,
46852 + cur,
46853 + ZNODE_WRITE_LOCK,
46854 + GN_CAN_USE_UPPER_LEVELS);
46855 + if (ret)
46856 + break;
46857 + ret = zload(lh.node);
46858 + if (ret) {
46859 + done_lh(&lh);
46860 + break;
46861 + }
46862 + coord_init_before_first_item(&coord, lh.node);
46863 +
46864 + if (node_is_empty(lh.node)) {
46865 + znode_make_dirty(lh.node);
46866 + znode_set_convertible(lh.node);
46867 + stop = 0;
46868 + } else if (clustered_ctail(&pos->coord, &coord)) {
46869 +
46870 + item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
46871 +
46872 + if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
46873 + /*
46874 + warning("edward-1024",
46875 + "next slum item mergeable, "
46876 + "but znode %p isn't dirty\n",
46877 + lh.node);
46878 + */
46879 + znode_make_dirty(lh.node);
46880 + }
46881 + if (!znode_convertible(lh.node)) {
46882 + /*
46883 + warning("edward-1272",
46884 + "next slum item mergeable, "
46885 + "but znode %p isn't convertible\n",
46886 + lh.node);
46887 + */
46888 + znode_set_convertible(lh.node);
46889 + }
46890 + stop = 1;
46891 + } else
46892 + stop = 1;
46893 + zrelse(lh.node);
46894 + done_lh(&right_lock);
46895 + copy_lh(&right_lock, &lh);
46896 + done_lh(&lh);
46897 + cur = right_lock.node;
46898 + }
46899 + done_lh(&right_lock);
46900 +
46901 + if (ret == -E_NO_NEIGHBOR)
46902 + ret = 0;
46903 + return ret;
46904 +}
46905 +
46906 +static int
46907 +assign_convert_mode(convert_item_info_t * idata,
46908 + cryptcompress_write_mode_t * mode)
46909 +{
46910 + int result = 0;
46911 +
46912 + assert("edward-1025", idata != NULL);
46913 +
46914 + if (idata->flow.length) {
46915 + /* append or overwrite */
46916 + switch (idata->d_cur) {
46917 + case DC_FIRST_ITEM:
46918 + case DC_CHAINED_ITEM:
46919 + *mode = CRC_OVERWRITE_ITEM;
46920 + break;
46921 + case DC_AFTER_CLUSTER:
46922 + *mode = CRC_APPEND_ITEM;
46923 + break;
46924 + default:
46925 + impossible("edward-1018", "wrong current item state");
46926 + }
46927 + } else {
46928 + /* cut or invalidate */
46929 + switch (idata->d_cur) {
46930 + case DC_FIRST_ITEM:
46931 + case DC_CHAINED_ITEM:
46932 + *mode = CRC_CUT_ITEM;
46933 + break;
46934 + case DC_AFTER_CLUSTER:
46935 + result = 1;
46936 + break;
46937 + default:
46938 + impossible("edward-1019", "wrong current item state");
46939 + }
46940 + }
46941 + return result;
46942 +}
46943 +
46944 +/* plugin->u.item.f.convert */
46945 +/* write ctail in guessed mode */
46946 +int convert_ctail(flush_pos_t * pos)
46947 +{
46948 + int result;
46949 + int nr_items;
46950 + cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
46951 +
46952 + assert("edward-1020", pos != NULL);
46953 + assert("edward-1213", coord_num_items(&pos->coord) != 0);
46954 + assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
46955 + assert("edward-1258", ctail_ok(&pos->coord));
46956 + assert("edward-261", pos->coord.node != NULL);
46957 +
46958 + nr_items = coord_num_items(&pos->coord);
46959 + if (!chaining_data_present(pos)) {
46960 + if (should_attach_convert_idata(pos)) {
46961 + /* attach convert item info */
46962 + struct inode *inode;
46963 +
46964 + assert("edward-264", pos->child != NULL);
46965 + assert("edward-265", jnode_page(pos->child) != NULL);
46966 + assert("edward-266",
46967 + jnode_page(pos->child)->mapping != NULL);
46968 +
46969 + inode = jnode_page(pos->child)->mapping->host;
46970 +
46971 + assert("edward-267", inode != NULL);
46972 +
46973 + /* attach item convert info by child and put the last one */
46974 + result = attach_convert_idata(pos, inode);
46975 + pos->child = NULL;
46976 + if (result == -E_REPEAT) {
46977 + /* jnode became clean, or there is no dirty
46978 + pages (nothing to update in disk cluster) */
46979 + warning("edward-1021",
46980 + "convert_ctail: nothing to attach");
46981 + return 0;
46982 + }
46983 + if (result != 0)
46984 + return result;
46985 + } else
46986 + /* unconvertible */
46987 + return 0;
46988 + } else {
46989 + /* use old convert info */
46990 +
46991 + convert_item_info_t *idata;
46992 +
46993 + idata = item_convert_data(pos);
46994 +
46995 + result = assign_convert_mode(idata, &mode);
46996 + if (result) {
46997 + /* disk cluster is over,
46998 + nothing to update anymore */
46999 + detach_convert_idata(pos->sq);
47000 + return 0;
47001 + }
47002 + }
47003 +
47004 + assert("edward-433", chaining_data_present(pos));
47005 + assert("edward-1022",
47006 + pos->coord.item_pos < coord_num_items(&pos->coord));
47007 +
47008 + result = next_item_dc_stat(pos);
47009 + if (result) {
47010 + detach_convert_idata(pos->sq);
47011 + return result;
47012 + }
47013 + result = do_convert_ctail(pos, mode);
47014 + if (result) {
47015 + detach_convert_idata(pos->sq);
47016 + return result;
47017 + }
47018 + switch (mode) {
47019 + case CRC_CUT_ITEM:
47020 + assert("edward-1214", item_convert_data(pos)->flow.length == 0);
47021 + assert("edward-1215",
47022 + coord_num_items(&pos->coord) == nr_items ||
47023 + coord_num_items(&pos->coord) == nr_items - 1);
47024 + if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
47025 + break;
47026 + if (coord_num_items(&pos->coord) != nr_items) {
47027 + /* the item was killed, no more chained items */
47028 + detach_convert_idata(pos->sq);
47029 + if (!node_is_empty(pos->coord.node))
47030 + /* make sure the next item will be scanned */
47031 + coord_init_before_item(&pos->coord);
47032 + break;
47033 + }
47034 + case CRC_APPEND_ITEM:
47035 + assert("edward-434", item_convert_data(pos)->flow.length == 0);
47036 + detach_convert_idata(pos->sq);
47037 + break;
47038 + case CRC_OVERWRITE_ITEM:
47039 + if (coord_is_unprepped_ctail(&pos->coord)) {
47040 + /* convert unpprepped ctail to prepped one */
47041 + int shift;
47042 + shift =
47043 + inode_cluster_shift(item_convert_data(pos)->inode);
47044 + assert("edward-1259", cluster_shift_ok(shift));
47045 + put_unaligned((d8)shift,
47046 + &ctail_formatted_at(&pos->coord)->
47047 + cluster_shift);
47048 + }
47049 + break;
47050 + }
47051 + return result;
47052 +}
47053 +
47054 +/* Make Linus happy.
47055 + Local variables:
47056 + c-indentation-style: "K&R"
47057 + mode-name: "LC"
47058 + c-basic-offset: 8
47059 + tab-width: 8
47060 + fill-column: 120
47061 + End:
47062 +*/
47063 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.20/fs/reiser4/plugin/item/ctail.h
47064 --- linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 03:00:00.000000000 +0300
47065 +++ linux-2.6.20/fs/reiser4/plugin/item/ctail.h 2007-05-06 14:50:43.803008220 +0400
47066 @@ -0,0 +1,97 @@
47067 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47068 +
47069 +#if !defined( __FS_REISER4_CTAIL_H__ )
47070 +#define __FS_REISER4_CTAIL_H__
47071 +
47072 +/* Disk format of ctail item */
47073 +typedef struct ctail_item_format {
47074 + /* packed shift; size of (prepped) disk cluster
47075 + is calculated as (1 << cluster_shift) */
47076 + d8 cluster_shift;
47077 + /* ctail body */
47078 + d8 body[0];
47079 +} __attribute__ ((packed)) ctail_item_format;
47080 +
47081 +/* Unprepped disk cluster is represented by a single ctail item
47082 + with the following "magic" attributes: */
47083 +/* "magic" cluster_shift */
47084 +#define UCTAIL_SHIFT 0xff
47085 +/* How many units unprepped ctail item has */
47086 +#define UCTAIL_NR_UNITS 1
47087 +
47088 +/* The following is a set of various item states in a disk cluster.
47089 + Disk cluster is a set of items whose keys belong to the interval
47090 + [dc_key , dc_key + disk_cluster_size - 1] */
47091 +typedef enum {
47092 + DC_INVALID_STATE = 0,
47093 + DC_FIRST_ITEM = 1,
47094 + DC_CHAINED_ITEM = 2,
47095 + DC_AFTER_CLUSTER = 3
47096 +} dc_item_stat;
47097 +
47098 +/* ctail-specific extension.
47099 + In particular this describes parameters of disk cluster an item belongs to */
47100 +typedef struct {
47101 + int shift; /* this contains cluster_shift extracted from
47102 + ctail_item_format (above), or UCTAIL_SHIFT
47103 + (the last one is the "magic" of unprepped disk clusters)*/
47104 + int dsize; /* size of a prepped disk cluster */
47105 + int ncount; /* count of nodes occupied by a disk cluster */
47106 +} ctail_coord_extension_t;
47107 +
47108 +struct cut_list;
47109 +
47110 +/* plugin->item.b.* */
47111 +int can_contain_key_ctail(const coord_t *, const reiser4_key *,
47112 + const reiser4_item_data *);
47113 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
47114 +pos_in_node_t nr_units_ctail(const coord_t * coord);
47115 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
47116 +void print_ctail(const char *prefix, coord_t * coord);
47117 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
47118 +
47119 +int paste_ctail(coord_t * coord, reiser4_item_data * data,
47120 + carry_plugin_info * info UNUSED_ARG);
47121 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
47122 +int can_shift_ctail(unsigned free_space, coord_t * coord,
47123 + znode * target, shift_direction pend, unsigned *size,
47124 + unsigned want);
47125 +void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
47126 + unsigned count, shift_direction where_is_free_space,
47127 + unsigned free_space);
47128 +int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47129 + carry_cut_data *, reiser4_key * smallest_removed,
47130 + reiser4_key * new_first);
47131 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47132 + carry_kill_data *, reiser4_key * smallest_removed,
47133 + reiser4_key * new_first);
47134 +int ctail_ok(const coord_t * coord);
47135 +int check_ctail(const coord_t * coord, const char **error);
47136 +
47137 +/* plugin->u.item.s.* */
47138 +int read_ctail(struct file *, flow_t *, hint_t *);
47139 +int readpage_ctail(void *, struct page *);
47140 +int readpages_ctail(struct file *, struct address_space *, struct list_head *);
47141 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
47142 +int create_hook_ctail(const coord_t * coord, void *arg);
47143 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
47144 + carry_kill_data *);
47145 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
47146 +
47147 +/* plugin->u.item.f */
47148 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
47149 +int scan_ctail(flush_scan *);
47150 +int convert_ctail(flush_pos_t *);
47151 +size_t inode_scaled_cluster_size(struct inode *);
47152 +
47153 +#endif /* __FS_REISER4_CTAIL_H__ */
47154 +
47155 +/* Make Linus happy.
47156 + Local variables:
47157 + c-indentation-style: "K&R"
47158 + mode-name: "LC"
47159 + c-basic-offset: 8
47160 + tab-width: 8
47161 + fill-column: 120
47162 + End:
47163 +*/
47164 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent.c linux-2.6.20/fs/reiser4/plugin/item/extent.c
47165 --- linux-2.6.20.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 03:00:00.000000000 +0300
47166 +++ linux-2.6.20/fs/reiser4/plugin/item/extent.c 2007-05-06 14:50:43.807009470 +0400
47167 @@ -0,0 +1,197 @@
47168 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47169 +
47170 +#include "item.h"
47171 +#include "../../key.h"
47172 +#include "../../super.h"
47173 +#include "../../carry.h"
47174 +#include "../../inode.h"
47175 +#include "../../page_cache.h"
47176 +#include "../../flush.h"
47177 +#include "../object.h"
47178 +
47179 +/* prepare structure reiser4_item_data. It is used to put one extent unit into tree */
47180 +/* Audited by: green(2002.06.13) */
47181 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47182 + int nr_extents)
47183 +{
47184 + data->data = ext_unit;
47185 + /* data->data is kernel space */
47186 + data->user = 0;
47187 + data->length = sizeof(reiser4_extent) * nr_extents;
47188 + data->arg = NULL;
47189 + data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
47190 + return data;
47191 +}
47192 +
47193 +/* how many bytes are addressed by @nr first extents of the extent item */
47194 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
47195 +{
47196 + pos_in_node_t i;
47197 + reiser4_block_nr blocks;
47198 + reiser4_extent *ext;
47199 +
47200 + ext = item_body_by_coord(coord);
47201 + assert("vs-263", nr <= nr_units_extent(coord));
47202 +
47203 + blocks = 0;
47204 + for (i = 0; i < nr; i++, ext++) {
47205 + blocks += extent_get_width(ext);
47206 + }
47207 +
47208 + return blocks * current_blocksize;
47209 +}
47210 +
47211 +extent_state state_of_extent(reiser4_extent * ext)
47212 +{
47213 + switch ((int)extent_get_start(ext)) {
47214 + case 0:
47215 + return HOLE_EXTENT;
47216 + case 1:
47217 + return UNALLOCATED_EXTENT;
47218 + default:
47219 + break;
47220 + }
47221 + return ALLOCATED_EXTENT;
47222 +}
47223 +
47224 +int extent_is_unallocated(const coord_t * item)
47225 +{
47226 + assert("jmacd-5133", item_is_extent(item));
47227 +
47228 + return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
47229 +}
47230 +
47231 +/* set extent's start and width */
47232 +void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
47233 + reiser4_block_nr width)
47234 +{
47235 + extent_set_start(ext, start);
47236 + extent_set_width(ext, width);
47237 +}
47238 +
47239 +/**
47240 + * reiser4_replace_extent - replace extent and paste 1 or 2 after it
47241 + * @un_extent: coordinate of extent to be overwritten
47242 + * @lh: need better comment
47243 + * @key: need better comment
47244 + * @exts_to_add: data prepared for insertion into tree
47245 + * @replace: need better comment
47246 + * @flags: need better comment
47247 + * @return_insert_position: need better comment
47248 + *
47249 + * Overwrites one extent, pastes 1 or 2 more ones after overwritten one. If
47250 + * @return_inserted_position is 1 - @un_extent and @lh are returned set to
47251 + * first of newly inserted units, if it is 0 - @un_extent and @lh are returned
47252 + * set to extent which was overwritten.
47253 + */
47254 +int reiser4_replace_extent(struct replace_handle *h,
47255 + int return_inserted_position)
47256 +{
47257 + int result;
47258 + znode *orig_znode;
47259 + /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
47260 +
47261 + assert("vs-990", coord_is_existing_unit(h->coord));
47262 + assert("vs-1375", znode_is_write_locked(h->coord->node));
47263 + assert("vs-1426", extent_get_width(&h->overwrite) != 0);
47264 + assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
47265 + assert("vs-1427", ergo(h->nr_new_extents == 2,
47266 + extent_get_width(&h->new_extents[1]) != 0));
47267 +
47268 + /* compose structure for paste */
47269 + init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
47270 +
47271 + coord_dup(&h->coord_after, h->coord);
47272 + init_lh(&h->lh_after);
47273 + copy_lh(&h->lh_after, h->lh);
47274 + reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
47275 + reiser4_tap_monitor(&h->watch);
47276 +
47277 + ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
47278 + orig_znode = h->coord->node;
47279 +
47280 +#if REISER4_DEBUG
47281 + /* make sure that key is set properly */
47282 + unit_key_by_coord(h->coord, &h->tmp);
47283 + set_key_offset(&h->tmp,
47284 + get_key_offset(&h->tmp) +
47285 + extent_get_width(&h->overwrite) * current_blocksize);
47286 + assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
47287 +#endif
47288 +
47289 + /* set insert point after unit to be replaced */
47290 + h->coord->between = AFTER_UNIT;
47291 +
47292 + result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
47293 + &h->paste_key, &h->item, h->flags);
47294 + if (!result) {
47295 + /* now we have to replace the unit after which new units were
47296 + inserted. Its position is tracked by @watch */
47297 + reiser4_extent *ext;
47298 + znode *node;
47299 +
47300 + node = h->coord_after.node;
47301 + if (node != orig_znode) {
47302 + coord_clear_iplug(&h->coord_after);
47303 + result = zload(node);
47304 + }
47305 +
47306 + if (likely(!result)) {
47307 + ext = extent_by_coord(&h->coord_after);
47308 +
47309 + assert("vs-987", znode_is_loaded(node));
47310 + assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
47311 +
47312 + /* overwrite extent unit */
47313 + memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
47314 + znode_make_dirty(node);
47315 +
47316 + if (node != orig_znode)
47317 + zrelse(node);
47318 +
47319 + if (return_inserted_position == 0) {
47320 + /* coord and lh are to be set to overwritten
47321 + extent */
47322 + assert("vs-1662",
47323 + WITH_DATA(node, !memcmp(&h->overwrite,
47324 + extent_by_coord(
47325 + &h->coord_after),
47326 + sizeof(reiser4_extent))));
47327 +
47328 + *h->coord = h->coord_after;
47329 + done_lh(h->lh);
47330 + copy_lh(h->lh, &h->lh_after);
47331 + } else {
47332 + /* h->coord and h->lh are to be set to first of
47333 + inserted units */
47334 + assert("vs-1663",
47335 + WITH_DATA(h->coord->node,
47336 + !memcmp(&h->new_extents[0],
47337 + extent_by_coord(h->coord),
47338 + sizeof(reiser4_extent))));
47339 + assert("vs-1664", h->lh->node == h->coord->node);
47340 + }
47341 + }
47342 + }
47343 + reiser4_tap_done(&h->watch);
47344 +
47345 + return result;
47346 +}
47347 +
47348 +lock_handle *znode_lh(znode *node)
47349 +{
47350 + assert("vs-1371", znode_is_write_locked(node));
47351 + assert("vs-1372", znode_is_wlocked_once(node));
47352 + return list_entry(node->lock.owners.next, lock_handle, owners_link);
47353 +}
47354 +
47355 +/*
47356 + * Local variables:
47357 + * c-indentation-style: "K&R"
47358 + * mode-name: "LC"
47359 + * c-basic-offset: 8
47360 + * tab-width: 8
47361 + * fill-column: 79
47362 + * scroll-step: 1
47363 + * End:
47364 + */
47365 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.20/fs/reiser4/plugin/item/extent_file_ops.c
47366 --- linux-2.6.20.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 03:00:00.000000000 +0300
47367 +++ linux-2.6.20/fs/reiser4/plugin/item/extent_file_ops.c 2007-05-06 14:50:43.807009470 +0400
47368 @@ -0,0 +1,1443 @@
47369 +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47370 +
47371 +#include "item.h"
47372 +#include "../../inode.h"
47373 +#include "../../page_cache.h"
47374 +#include "../object.h"
47375 +
47376 +#include <linux/quotaops.h>
47377 +#include <linux/swap.h>
47378 +#include "../../../../mm/filemap.h"
47379 +
47380 +static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
47381 +{
47382 + reiser4_extent *ext;
47383 +
47384 + ext = (reiser4_extent *) (zdata(node) + offset);
47385 + return ext;
47386 +}
47387 +
47388 +/**
47389 + * check_uf_coord - verify coord extension
47390 + * @uf_coord:
47391 + * @key:
47392 + *
47393 + * Makes sure that all fields of @uf_coord are set properly. If @key is
47394 + * specified - check whether @uf_coord is set correspondingly.
47395 + */
47396 +static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
47397 +{
47398 +#if REISER4_DEBUG
47399 + const coord_t *coord;
47400 + const extent_coord_extension_t *ext_coord;
47401 + reiser4_extent *ext;
47402 +
47403 + coord = &uf_coord->coord;
47404 + ext_coord = &uf_coord->extension.extent;
47405 + ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
47406 +
47407 + assert("",
47408 + WITH_DATA(coord->node,
47409 + (uf_coord->valid == 1 &&
47410 + coord_is_iplug_set(coord) &&
47411 + item_is_extent(coord) &&
47412 + ext_coord->nr_units == nr_units_extent(coord) &&
47413 + ext == extent_by_coord(coord) &&
47414 + ext_coord->width == extent_get_width(ext) &&
47415 + coord->unit_pos < ext_coord->nr_units &&
47416 + ext_coord->pos_in_unit < ext_coord->width &&
47417 + memcmp(ext, &ext_coord->extent,
47418 + sizeof(reiser4_extent)) == 0)));
47419 + if (key) {
47420 + reiser4_key coord_key;
47421 +
47422 + unit_key_by_coord(&uf_coord->coord, &coord_key);
47423 + set_key_offset(&coord_key,
47424 + get_key_offset(&coord_key) +
47425 + (uf_coord->extension.extent.
47426 + pos_in_unit << PAGE_CACHE_SHIFT));
47427 + assert("", keyeq(key, &coord_key));
47428 + }
47429 +#endif
47430 +}
47431 +
47432 +static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
47433 +{
47434 + check_uf_coord(uf_coord, NULL);
47435 +
47436 + return ext_by_offset(uf_coord->coord.node,
47437 + uf_coord->extension.extent.ext_offset);
47438 +}
47439 +
47440 +#if REISER4_DEBUG
47441 +
47442 +/**
47443 + * offset_is_in_unit
47444 + *
47445 + *
47446 + *
47447 + */
47448 +/* return 1 if offset @off is inside of extent unit pointed to by @coord. Set
47449 + pos_in_unit inside of unit correspondingly */
47450 +static int offset_is_in_unit(const coord_t *coord, loff_t off)
47451 +{
47452 + reiser4_key unit_key;
47453 + __u64 unit_off;
47454 + reiser4_extent *ext;
47455 +
47456 + ext = extent_by_coord(coord);
47457 +
47458 + unit_key_extent(coord, &unit_key);
47459 + unit_off = get_key_offset(&unit_key);
47460 + if (off < unit_off)
47461 + return 0;
47462 + if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
47463 + return 0;
47464 + return 1;
47465 +}
47466 +
47467 +static int
47468 +coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
47469 +{
47470 + reiser4_key item_key;
47471 +
47472 + assert("vs-771", coord_is_existing_unit(coord));
47473 + assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
47474 + assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
47475 +
47476 + return offset_is_in_unit(coord, get_key_offset(key));
47477 +}
47478 +
47479 +#endif
47480 +
47481 +/**
47482 + * can_append -
47483 + * @key:
47484 + * @coord:
47485 + *
47486 + * Returns 1 if @key is equal to an append key of item @coord is set to
47487 + */
47488 +static int can_append(const reiser4_key *key, const coord_t *coord)
47489 +{
47490 + reiser4_key append_key;
47491 +
47492 + return keyeq(key, append_key_extent(coord, &append_key));
47493 +}
47494 +
47495 +/**
47496 + * append_hole
47497 + * @coord:
47498 + * @lh:
47499 + * @key:
47500 + *
47501 + */
47502 +static int append_hole(coord_t *coord, lock_handle *lh,
47503 + const reiser4_key *key)
47504 +{
47505 + reiser4_key append_key;
47506 + reiser4_block_nr hole_width;
47507 + reiser4_extent *ext, new_ext;
47508 + reiser4_item_data idata;
47509 +
47510 + /* last item of file may have to be appended with hole */
47511 + assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
47512 + assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
47513 +
47514 + /* key of first byte which is not addressed by this extent */
47515 + append_key_extent(coord, &append_key);
47516 +
47517 + assert("", keyle(&append_key, key));
47518 +
47519 + /*
47520 + * extent item has to be appended with hole. Calculate length of that
47521 + * hole
47522 + */
47523 + hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
47524 + current_blocksize - 1) >> current_blocksize_bits);
47525 + assert("vs-954", hole_width > 0);
47526 +
47527 + /* set coord after last unit */
47528 + coord_init_after_item_end(coord);
47529 +
47530 + /* get last extent in the item */
47531 + ext = extent_by_coord(coord);
47532 + if (state_of_extent(ext) == HOLE_EXTENT) {
47533 + /*
47534 + * last extent of a file is hole extent. Widen that extent by
47535 + * @hole_width blocks. Note that we do not worry about
47536 + * overflowing - extent width is 64 bits
47537 + */
47538 + reiser4_set_extent(ext, HOLE_EXTENT_START,
47539 + extent_get_width(ext) + hole_width);
47540 + znode_make_dirty(coord->node);
47541 + return 0;
47542 + }
47543 +
47544 + /* append last item of the file with hole extent unit */
47545 + assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
47546 + state_of_extent(ext) == UNALLOCATED_EXTENT));
47547 +
47548 + reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47549 + init_new_extent(&idata, &new_ext, 1);
47550 + return insert_into_item(coord, lh, &append_key, &idata, 0);
47551 +}
47552 +
47553 +/**
47554 + * check_jnodes
47555 + * @twig: longterm locked twig node
47556 + * @key:
47557 + *
47558 + */
47559 +static void check_jnodes(znode *twig, const reiser4_key *key, int count)
47560 +{
47561 +#if REISER4_DEBUG
47562 + coord_t c;
47563 + reiser4_key node_key, jnode_key;
47564 +
47565 + jnode_key = *key;
47566 +
47567 + assert("", twig != NULL);
47568 + assert("", znode_get_level(twig) == TWIG_LEVEL);
47569 + assert("", znode_is_write_locked(twig));
47570 +
47571 + zload(twig);
47572 + /* get the smallest key in twig node */
47573 + coord_init_first_unit(&c, twig);
47574 + unit_key_by_coord(&c, &node_key);
47575 + assert("", keyle(&node_key, &jnode_key));
47576 +
47577 + coord_init_last_unit(&c, twig);
47578 + unit_key_by_coord(&c, &node_key);
47579 + if (item_plugin_by_coord(&c)->s.file.append_key)
47580 + item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
47581 + set_key_offset(&jnode_key,
47582 + get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
47583 + assert("", keylt(&jnode_key, &node_key));
47584 + zrelse(twig);
47585 +#endif
47586 +}
47587 +
47588 +/**
47589 + * append_last_extent - append last file item
47590 + * @uf_coord: coord to start insertion from
47591 + * @jnodes: array of jnodes
47592 + * @count: number of jnodes in the array
47593 + *
47594 + * There is already at least one extent item of file @inode in the tree. Append
47595 + * the last of them with unallocated extent unit of width @count. Assign
47596 + * fake block numbers to jnodes corresponding to the inserted extent.
47597 + */
47598 +static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47599 + jnode **jnodes, int count)
47600 +{
47601 + int result;
47602 + reiser4_extent new_ext;
47603 + reiser4_item_data idata;
47604 + coord_t *coord;
47605 + extent_coord_extension_t *ext_coord;
47606 + reiser4_extent *ext;
47607 + reiser4_block_nr block;
47608 + jnode *node;
47609 + int i;
47610 +
47611 + coord = &uf_coord->coord;
47612 + ext_coord = &uf_coord->extension.extent;
47613 + ext = ext_by_ext_coord(uf_coord);
47614 +
47615 + /* check correctness of position in the item */
47616 + assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
47617 + assert("vs-1311", coord->between == AFTER_UNIT);
47618 + assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
47619 +
47620 + if (!can_append(key, coord)) {
47621 + /* hole extent has to be inserted */
47622 + result = append_hole(coord, uf_coord->lh, key);
47623 + uf_coord->valid = 0;
47624 + return result;
47625 + }
47626 +
47627 + if (count == 0)
47628 + return 0;
47629 +
47630 + assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
47631 +
47632 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
47633 + count);
47634 + BUG_ON(result != 0);
47635 +
47636 + switch (state_of_extent(ext)) {
47637 + case UNALLOCATED_EXTENT:
47638 + /*
47639 + * last extent unit of the file is unallocated one. Increase
47640 + * its width by @count
47641 + */
47642 + reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
47643 + extent_get_width(ext) + count);
47644 + znode_make_dirty(coord->node);
47645 +
47646 + /* update coord extension */
47647 + ext_coord->width += count;
47648 + ON_DEBUG(extent_set_width
47649 + (&uf_coord->extension.extent.extent,
47650 + ext_coord->width));
47651 + break;
47652 +
47653 + case HOLE_EXTENT:
47654 + case ALLOCATED_EXTENT:
47655 + /*
47656 + * last extent unit of the file is either hole or allocated
47657 + * one. Append one unallocated extent of width @count
47658 + */
47659 + reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47660 + init_new_extent(&idata, &new_ext, 1);
47661 + result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
47662 + uf_coord->valid = 0;
47663 + if (result)
47664 + return result;
47665 + break;
47666 +
47667 + default:
47668 + return RETERR(-EIO);
47669 + }
47670 +
47671 + /*
47672 + * make sure that we hold long term locked twig node containing all
47673 + * jnodes we are about to capture
47674 + */
47675 + check_jnodes(uf_coord->lh->node, key, count);
47676 +
47677 + /*
47678 + * assign fake block numbers to all jnodes. FIXME: make sure whether
47679 + * twig node containing inserted extent item is locked
47680 + */
47681 + block = fake_blocknr_unformatted(count);
47682 + for (i = 0; i < count; i ++, block ++) {
47683 + node = jnodes[i];
47684 + spin_lock_jnode(node);
47685 + JF_SET(node, JNODE_CREATED);
47686 + jnode_set_block(node, &block);
47687 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47688 + BUG_ON(result != 0);
47689 + jnode_make_dirty_locked(node);
47690 + spin_unlock_jnode(node);
47691 + }
47692 + return count;
47693 +}
47694 +
47695 +/**
47696 + * insert_first_hole - inser hole extent into tree
47697 + * @coord:
47698 + * @lh:
47699 + * @key:
47700 + *
47701 + *
47702 + */
47703 +static int insert_first_hole(coord_t *coord, lock_handle *lh,
47704 + const reiser4_key *key)
47705 +{
47706 + reiser4_extent new_ext;
47707 + reiser4_item_data idata;
47708 + reiser4_key item_key;
47709 + reiser4_block_nr hole_width;
47710 +
47711 + /* @coord must be set for inserting of new item */
47712 + assert("vs-711", coord_is_between_items(coord));
47713 +
47714 + item_key = *key;
47715 + set_key_offset(&item_key, 0ull);
47716 +
47717 + hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
47718 + current_blocksize_bits);
47719 + assert("vs-710", hole_width > 0);
47720 +
47721 + /* compose body of hole extent and insert item into tree */
47722 + reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47723 + init_new_extent(&idata, &new_ext, 1);
47724 + return insert_extent_by_coord(coord, &idata, &item_key, lh);
47725 +}
47726 +
47727 +
47728 +/**
47729 + * insert_first_extent - insert first file item
47730 + * @inode: inode of file
47731 + * @uf_coord: coord to start insertion from
47732 + * @jnodes: array of jnodes
47733 + * @count: number of jnodes in the array
47734 + * @inode:
47735 + *
47736 + * There are no items of file @inode in the tree yet. Insert unallocated extent
47737 + * of width @count into tree or hole extent if writing not to the
47738 + * beginning. Assign fake block numbers to jnodes corresponding to the inserted
47739 + * unallocated extent. Returns number of jnodes or error code.
47740 + */
47741 +static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47742 + jnode **jnodes, int count,
47743 + struct inode *inode)
47744 +{
47745 + int result;
47746 + int i;
47747 + reiser4_extent new_ext;
47748 + reiser4_item_data idata;
47749 + reiser4_block_nr block;
47750 + unix_file_info_t *uf_info;
47751 + jnode *node;
47752 +
47753 + /* first extent insertion starts at leaf level */
47754 + assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
47755 + assert("vs-711", coord_is_between_items(&uf_coord->coord));
47756 +
47757 + if (get_key_offset(key) != 0) {
47758 + result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
47759 + uf_coord->valid = 0;
47760 + uf_info = unix_file_inode_data(inode);
47761 +
47762 + /*
47763 + * first item insertion is only possible when writing to empty
47764 + * file or performing tail conversion
47765 + */
47766 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
47767 + (reiser4_inode_get_flag(inode,
47768 + REISER4_PART_MIXED) &&
47769 + reiser4_inode_get_flag(inode,
47770 + REISER4_PART_IN_CONV))));
47771 + /* if file was empty - update its state */
47772 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
47773 + uf_info->container = UF_CONTAINER_EXTENTS;
47774 + return result;
47775 + }
47776 +
47777 + if (count == 0)
47778 + return 0;
47779 +
47780 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
47781 + BUG_ON(result != 0);
47782 +
47783 + /*
47784 + * prepare for tree modification: compose body of item and item data
47785 + * structure needed for insertion
47786 + */
47787 + reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47788 + init_new_extent(&idata, &new_ext, 1);
47789 +
47790 + /* insert extent item into the tree */
47791 + result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
47792 + uf_coord->lh);
47793 + if (result)
47794 + return result;
47795 +
47796 + /*
47797 + * make sure that we hold long term locked twig node containing all
47798 + * jnodes we are about to capture
47799 + */
47800 + check_jnodes(uf_coord->lh->node, key, count);
47801 + /*
47802 + * assign fake block numbers to all jnodes, capture and mark them dirty
47803 + */
47804 + block = fake_blocknr_unformatted(count);
47805 + for (i = 0; i < count; i ++, block ++) {
47806 + node = jnodes[i];
47807 + spin_lock_jnode(node);
47808 + JF_SET(node, JNODE_CREATED);
47809 + jnode_set_block(node, &block);
47810 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47811 + BUG_ON(result != 0);
47812 + jnode_make_dirty_locked(node);
47813 + spin_unlock_jnode(node);
47814 + }
47815 +
47816 + /*
47817 + * invalidate coordinate, research must be performed to continue
47818 + * because write will continue on twig level
47819 + */
47820 + uf_coord->valid = 0;
47821 + return count;
47822 +}
47823 +
47824 +/**
47825 + * plug_hole - replace hole extent with unallocated and holes
47826 + * @uf_coord:
47827 + * @key:
47828 + * @node:
47829 + * @h: structure containing coordinate, lock handle, key, etc
47830 + *
47831 + * Creates an unallocated extent of width 1 within a hole. In worst case two
47832 + * additional extents can be created.
47833 + */
47834 +static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
47835 +{
47836 + struct replace_handle rh;
47837 + reiser4_extent *ext;
47838 + reiser4_block_nr width, pos_in_unit;
47839 + coord_t *coord;
47840 + extent_coord_extension_t *ext_coord;
47841 + int return_inserted_position;
47842 +
47843 + check_uf_coord(uf_coord, key);
47844 +
47845 + rh.coord = coord_by_uf_coord(uf_coord);
47846 + rh.lh = uf_coord->lh;
47847 + rh.flags = 0;
47848 +
47849 + coord = coord_by_uf_coord(uf_coord);
47850 + ext_coord = ext_coord_by_uf_coord(uf_coord);
47851 + ext = ext_by_ext_coord(uf_coord);
47852 +
47853 + width = ext_coord->width;
47854 + pos_in_unit = ext_coord->pos_in_unit;
47855 +
47856 + *how = 0;
47857 + if (width == 1) {
47858 + reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
47859 + znode_make_dirty(coord->node);
47860 + /* update uf_coord */
47861 + ON_DEBUG(ext_coord->extent = *ext);
47862 + *how = 1;
47863 + return 0;
47864 + } else if (pos_in_unit == 0) {
47865 + /* we deal with first element of extent */
47866 + if (coord->unit_pos) {
47867 + /* there is an extent to the left */
47868 + if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
47869 + /*
47870 + * left neighboring unit is an unallocated
47871 + * extent. Increase its width and decrease
47872 + * width of hole
47873 + */
47874 + extent_set_width(ext - 1,
47875 + extent_get_width(ext - 1) + 1);
47876 + extent_set_width(ext, width - 1);
47877 + znode_make_dirty(coord->node);
47878 +
47879 + /* update coord extension */
47880 + coord->unit_pos--;
47881 + ext_coord->width = extent_get_width(ext - 1);
47882 + ext_coord->pos_in_unit = ext_coord->width - 1;
47883 + ext_coord->ext_offset -= sizeof(reiser4_extent);
47884 + ON_DEBUG(ext_coord->extent =
47885 + *extent_by_coord(coord));
47886 + *how = 2;
47887 + return 0;
47888 + }
47889 + }
47890 + /* extent for replace */
47891 + reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
47892 + /* extent to be inserted */
47893 + reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
47894 + width - 1);
47895 + rh.nr_new_extents = 1;
47896 +
47897 + /* have reiser4_replace_extent to return with @coord and
47898 + @uf_coord->lh set to unit which was replaced */
47899 + return_inserted_position = 0;
47900 + *how = 3;
47901 + } else if (pos_in_unit == width - 1) {
47902 + /* we deal with last element of extent */
47903 + if (coord->unit_pos < nr_units_extent(coord) - 1) {
47904 + /* there is an extent unit to the right */
47905 + if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
47906 + /*
47907 + * right neighboring unit is an unallocated
47908 + * extent. Increase its width and decrease
47909 + * width of hole
47910 + */
47911 + extent_set_width(ext + 1,
47912 + extent_get_width(ext + 1) + 1);
47913 + extent_set_width(ext, width - 1);
47914 + znode_make_dirty(coord->node);
47915 +
47916 + /* update coord extension */
47917 + coord->unit_pos++;
47918 + ext_coord->width = extent_get_width(ext + 1);
47919 + ext_coord->pos_in_unit = 0;
47920 + ext_coord->ext_offset += sizeof(reiser4_extent);
47921 + ON_DEBUG(ext_coord->extent =
47922 + *extent_by_coord(coord));
47923 + *how = 4;
47924 + return 0;
47925 + }
47926 + }
47927 + /* extent for replace */
47928 + reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
47929 + /* extent to be inserted */
47930 + reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
47931 + 1);
47932 + rh.nr_new_extents = 1;
47933 +
47934 + /* have reiser4_replace_extent to return with @coord and
47935 + @uf_coord->lh set to unit which was inserted */
47936 + return_inserted_position = 1;
47937 + *how = 5;
47938 + } else {
47939 + /* extent for replace */
47940 + reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
47941 + pos_in_unit);
47942 + /* extents to be inserted */
47943 + reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
47944 + 1);
47945 + reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
47946 + width - pos_in_unit - 1);
47947 + rh.nr_new_extents = 2;
47948 +
47949 + /* have reiser4_replace_extent to return with @coord and
47950 + @uf_coord->lh set to first of units which were inserted */
47951 + return_inserted_position = 1;
47952 + *how = 6;
47953 + }
47954 + unit_key_by_coord(coord, &rh.paste_key);
47955 + set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
47956 + extent_get_width(&rh.overwrite) * current_blocksize);
47957 +
47958 + uf_coord->valid = 0;
47959 + return reiser4_replace_extent(&rh, return_inserted_position);
47960 +}
47961 +
47962 +/**
47963 + * overwrite_one_block -
47964 + * @uf_coord:
47965 + * @key:
47966 + * @node:
47967 + *
47968 + * If @node corresponds to hole extent - create unallocated extent for it and
47969 + * assign fake block number. If @node corresponds to allocated extent - assign
47970 + * block number of jnode
47971 + */
47972 +static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
47973 + jnode *node, int *hole_plugged)
47974 +{
47975 + int result;
47976 + extent_coord_extension_t *ext_coord;
47977 + reiser4_extent *ext;
47978 + reiser4_block_nr block;
47979 + int how;
47980 +
47981 + assert("vs-1312", uf_coord->coord.between == AT_UNIT);
47982 +
47983 + result = 0;
47984 + ext_coord = ext_coord_by_uf_coord(uf_coord);
47985 + ext = ext_by_ext_coord(uf_coord);
47986 + assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
47987 +
47988 + switch (state_of_extent(ext)) {
47989 + case ALLOCATED_EXTENT:
47990 + block = extent_get_start(ext) + ext_coord->pos_in_unit;
47991 + break;
47992 +
47993 + case HOLE_EXTENT:
47994 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
47995 + BUG_ON(result != 0);
47996 + result = plug_hole(uf_coord, key, &how);
47997 + if (result)
47998 + return result;
47999 + block = fake_blocknr_unformatted(1);
48000 + if (hole_plugged)
48001 + *hole_plugged = 1;
48002 + JF_SET(node, JNODE_CREATED);
48003 + break;
48004 +
48005 + default:
48006 + return RETERR(-EIO);
48007 + }
48008 +
48009 + jnode_set_block(node, &block);
48010 + return 0;
48011 +}
48012 +
48013 +/**
48014 + * move_coord - move coordinate forward
48015 + * @uf_coord:
48016 + *
48017 + * Move coordinate one data block pointer forward. Return 1 if coord is set to
48018 + * the last one already or is invalid.
48019 + */
48020 +static int move_coord(uf_coord_t *uf_coord)
48021 +{
48022 + extent_coord_extension_t *ext_coord;
48023 +
48024 + if (uf_coord->valid == 0)
48025 + return 1;
48026 + ext_coord = &uf_coord->extension.extent;
48027 + ext_coord->pos_in_unit ++;
48028 + if (ext_coord->pos_in_unit < ext_coord->width)
48029 + /* coordinate moved within the unit */
48030 + return 0;
48031 +
48032 + /* end of unit is reached. Try to move to next unit */
48033 + ext_coord->pos_in_unit = 0;
48034 + uf_coord->coord.unit_pos ++;
48035 + if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
48036 + /* coordinate moved to next unit */
48037 + ext_coord->ext_offset += sizeof(reiser4_extent);
48038 + ext_coord->width =
48039 + extent_get_width(ext_by_offset
48040 + (uf_coord->coord.node,
48041 + ext_coord->ext_offset));
48042 + ON_DEBUG(ext_coord->extent =
48043 + *ext_by_offset(uf_coord->coord.node,
48044 + ext_coord->ext_offset));
48045 + return 0;
48046 + }
48047 + /* end of item is reached */
48048 + uf_coord->valid = 0;
48049 + return 1;
48050 +}
48051 +
48052 +/**
48053 + * overwrite_extent -
48054 + * @inode:
48055 + *
48056 + * Returns number of handled jnodes.
48057 + */
48058 +static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48059 + jnode **jnodes, int count, int *plugged_hole)
48060 +{
48061 + int result;
48062 + reiser4_key k;
48063 + int i;
48064 + jnode *node;
48065 +
48066 + k = *key;
48067 + for (i = 0; i < count; i ++) {
48068 + node = jnodes[i];
48069 + if (*jnode_get_block(node) == 0) {
48070 + result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
48071 + if (result)
48072 + return result;
48073 + }
48074 + /*
48075 + * make sure that we hold long term locked twig node containing
48076 + * all jnodes we are about to capture
48077 + */
48078 + check_jnodes(uf_coord->lh->node, &k, 1);
48079 + /*
48080 + * assign fake block numbers to all jnodes, capture and mark
48081 + * them dirty
48082 + */
48083 + spin_lock_jnode(node);
48084 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
48085 + BUG_ON(result != 0);
48086 + jnode_make_dirty_locked(node);
48087 + spin_unlock_jnode(node);
48088 +
48089 + if (uf_coord->valid == 0)
48090 + return i + 1;
48091 +
48092 + check_uf_coord(uf_coord, &k);
48093 +
48094 + if (move_coord(uf_coord)) {
48095 + /*
48096 + * failed to move to the next node pointer. Either end
48097 + * of file or end of twig node is reached. In the later
48098 + * case we might go to the right neighbor.
48099 + */
48100 + uf_coord->valid = 0;
48101 + return i + 1;
48102 + }
48103 + set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
48104 + }
48105 +
48106 + return count;
48107 +}
48108 +
48109 +/**
48110 + * reiser4_update_extent
48111 + * @file:
48112 + * @jnodes:
48113 + * @count:
48114 + * @off:
48115 + *
48116 + */
48117 +int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
48118 + int *plugged_hole)
48119 +{
48120 + int result;
48121 + znode *loaded;
48122 + uf_coord_t uf_coord;
48123 + coord_t *coord;
48124 + lock_handle lh;
48125 + reiser4_key key;
48126 +
48127 + assert("", reiser4_lock_counters()->d_refs == 0);
48128 +
48129 + key_by_inode_and_offset_common(inode, pos, &key);
48130 +
48131 + init_uf_coord(&uf_coord, &lh);
48132 + coord = &uf_coord.coord;
48133 + result = find_file_item_nohint(coord, &lh, &key,
48134 + ZNODE_WRITE_LOCK, inode);
48135 + if (IS_CBKERR(result)) {
48136 + assert("", reiser4_lock_counters()->d_refs == 0);
48137 + return result;
48138 + }
48139 +
48140 + result = zload(coord->node);
48141 + BUG_ON(result != 0);
48142 + loaded = coord->node;
48143 +
48144 + if (coord->between == AFTER_UNIT) {
48145 + /*
48146 + * append existing extent item with unallocated extent of width
48147 + * nr_jnodes
48148 + */
48149 + init_coord_extension_extent(&uf_coord,
48150 + get_key_offset(&key));
48151 + result = append_last_extent(&uf_coord, &key,
48152 + &node, 1);
48153 + } else if (coord->between == AT_UNIT) {
48154 + /*
48155 + * overwrite
48156 + * not optimal yet. Will be optimized if new write will show
48157 + * performance win.
48158 + */
48159 + init_coord_extension_extent(&uf_coord,
48160 + get_key_offset(&key));
48161 + result = overwrite_extent(&uf_coord, &key,
48162 + &node, 1, plugged_hole);
48163 + } else {
48164 + /*
48165 + * there are no items of this file in the tree yet. Create
48166 + * first item of the file inserting one unallocated extent of
48167 + * width nr_jnodes
48168 + */
48169 + result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
48170 + }
48171 + assert("", result == 1 || result < 0);
48172 + zrelse(loaded);
48173 + done_lh(&lh);
48174 + assert("", reiser4_lock_counters()->d_refs == 0);
48175 + return (result == 1) ? 0 : result;
48176 +}
48177 +
48178 +/**
48179 + * update_extents
48180 + * @file:
48181 + * @jnodes:
48182 + * @count:
48183 + * @off:
48184 + *
48185 + */
48186 +static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
48187 +{
48188 + struct inode *inode;
48189 + struct hint hint;
48190 + reiser4_key key;
48191 + int result;
48192 + znode *loaded;
48193 +
48194 + result = load_file_hint(file, &hint);
48195 + BUG_ON(result != 0);
48196 +
48197 + inode = file->f_dentry->d_inode;
48198 + if (count != 0)
48199 + /*
48200 + * count == 0 is special case: expanding truncate
48201 + */
48202 + pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
48203 + key_by_inode_and_offset_common(inode, pos, &key);
48204 +
48205 + assert("", reiser4_lock_counters()->d_refs == 0);
48206 +
48207 + do {
48208 + result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
48209 + if (IS_CBKERR(result)) {
48210 + assert("", reiser4_lock_counters()->d_refs == 0);
48211 + return result;
48212 + }
48213 +
48214 + result = zload(hint.ext_coord.coord.node);
48215 + BUG_ON(result != 0);
48216 + loaded = hint.ext_coord.coord.node;
48217 +
48218 + if (hint.ext_coord.coord.between == AFTER_UNIT) {
48219 + /*
48220 + * append existing extent item with unallocated extent
48221 + * of width nr_jnodes
48222 + */
48223 + if (hint.ext_coord.valid == 0)
48224 + /* NOTE: get statistics on this */
48225 + init_coord_extension_extent(&hint.ext_coord,
48226 + get_key_offset(&key));
48227 + result = append_last_extent(&hint.ext_coord, &key,
48228 + jnodes, count);
48229 + } else if (hint.ext_coord.coord.between == AT_UNIT) {
48230 + /*
48231 + * overwrite
48232 + * not optimal yet. Will be optimized if new write will
48233 + * show performance win.
48234 + */
48235 + if (hint.ext_coord.valid == 0)
48236 + /* NOTE: get statistics on this */
48237 + init_coord_extension_extent(&hint.ext_coord,
48238 + get_key_offset(&key));
48239 + result = overwrite_extent(&hint.ext_coord, &key,
48240 + jnodes, count, NULL);
48241 + } else {
48242 + /*
48243 + * there are no items of this file in the tree
48244 + * yet. Create first item of the file inserting one
48245 + * unallocated extent of * width nr_jnodes
48246 + */
48247 + result = insert_first_extent(&hint.ext_coord, &key,
48248 + jnodes, count, inode);
48249 + }
48250 + zrelse(loaded);
48251 + if (result < 0) {
48252 + done_lh(hint.ext_coord.lh);
48253 + break;
48254 + }
48255 +
48256 + jnodes += result;
48257 + count -= result;
48258 + set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
48259 +
48260 + /* seal and unlock znode */
48261 + if (hint.ext_coord.valid)
48262 + reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
48263 + else
48264 + reiser4_unset_hint(&hint);
48265 +
48266 + } while (count > 0);
48267 +
48268 + save_file_hint(file, &hint);
48269 + assert("", reiser4_lock_counters()->d_refs == 0);
48270 + return result;
48271 +}
48272 +
48273 +/**
48274 + * write_extent_reserve_space - reserve space for extent write operation
48275 + * @inode:
48276 + *
48277 + * Estimates and reserves space which may be required for writing
48278 + * WRITE_GRANULARITY pages of file.
48279 + */
48280 +static int write_extent_reserve_space(struct inode *inode)
48281 +{
48282 + __u64 count;
48283 + reiser4_tree *tree;
48284 +
48285 + /*
48286 + * to write WRITE_GRANULARITY pages to a file by extents we have to
48287 + * reserve disk space for:
48288 +
48289 + * 1. find_file_item may have to insert empty node to the tree (empty
48290 + * leaf node between two extent items). This requires 1 block and
48291 + * number of blocks which are necessary to perform insertion of an
48292 + * internal item into twig level.
48293 +
48294 + * 2. for each of written pages there might be needed 1 block and
48295 + * number of blocks which might be necessary to perform insertion of or
48296 + * paste to an extent item.
48297 +
48298 + * 3. stat data update
48299 + */
48300 + tree = reiser4_tree_by_inode(inode);
48301 + count = estimate_one_insert_item(tree) +
48302 + WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
48303 + estimate_one_insert_item(tree);
48304 + grab_space_enable();
48305 + return reiser4_grab_space(count, 0 /* flags */);
48306 +}
48307 +
48308 +/**
48309 + * reiser4_write_extent - write method of extent item plugin
48310 + * @file: file to write to
48311 + * @buf: address of user-space buffer
48312 + * @count: number of bytes to write
48313 + * @pos: position in file to write to
48314 + *
48315 + */
48316 +ssize_t reiser4_write_extent(struct file *file, const char __user *buf,
48317 + size_t count, loff_t *pos)
48318 +{
48319 + int have_to_update_extent;
48320 + int nr_pages, nr_dirty;
48321 + struct page *page;
48322 + jnode *jnodes[WRITE_GRANULARITY + 1];
48323 + struct inode *inode;
48324 + unsigned long index;
48325 + unsigned long end;
48326 + int i;
48327 + int to_page, page_off;
48328 + size_t left, written;
48329 + int result = 0;
48330 +
48331 + inode = file->f_dentry->d_inode;
48332 + if (write_extent_reserve_space(inode))
48333 + return RETERR(-ENOSPC);
48334 +
48335 + if (count == 0) {
48336 + /* truncate case */
48337 + update_extents(file, jnodes, 0, *pos);
48338 + return 0;
48339 + }
48340 +
48341 + BUG_ON(get_current_context()->trans->atom != NULL);
48342 +
48343 + left = count;
48344 + index = *pos >> PAGE_CACHE_SHIFT;
48345 + /* calculate number of pages which are to be written */
48346 + end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
48347 + nr_pages = end - index + 1;
48348 + nr_dirty = 0;
48349 + assert("", nr_pages <= WRITE_GRANULARITY + 1);
48350 +
48351 + /* get pages and jnodes */
48352 + for (i = 0; i < nr_pages; i ++) {
48353 + page = find_or_create_page(inode->i_mapping, index + i,
48354 + reiser4_ctx_gfp_mask_get());
48355 + if (page == NULL) {
48356 + nr_pages = i;
48357 + result = RETERR(-ENOMEM);
48358 + goto out;
48359 + }
48360 +
48361 + jnodes[i] = jnode_of_page(page);
48362 + if (IS_ERR(jnodes[i])) {
48363 + unlock_page(page);
48364 + page_cache_release(page);
48365 + nr_pages = i;
48366 + result = RETERR(-ENOMEM);
48367 + goto out;
48368 + }
48369 + /* prevent jnode and page from disconnecting */
48370 + JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
48371 + unlock_page(page);
48372 + }
48373 +
48374 + BUG_ON(get_current_context()->trans->atom != NULL);
48375 +
48376 + have_to_update_extent = 0;
48377 +
48378 + page_off = (*pos & (PAGE_CACHE_SIZE - 1));
48379 + for (i = 0; i < nr_pages; i ++) {
48380 + to_page = PAGE_CACHE_SIZE - page_off;
48381 + if (to_page > left)
48382 + to_page = left;
48383 + page = jnode_page(jnodes[i]);
48384 + if (page_offset(page) < inode->i_size &&
48385 + !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48386 + /*
48387 + * the above is not optimal for partial write to last
48388 + * page of file when file size is not at boundary of
48389 + * page
48390 + */
48391 + lock_page(page);
48392 + if (!PageUptodate(page)) {
48393 + result = readpage_unix_file(NULL, page);
48394 + BUG_ON(result != 0);
48395 + /* wait for read completion */
48396 + lock_page(page);
48397 + BUG_ON(!PageUptodate(page));
48398 + } else
48399 + result = 0;
48400 + unlock_page(page);
48401 + }
48402 +
48403 + BUG_ON(get_current_context()->trans->atom != NULL);
48404 + fault_in_pages_readable(buf, to_page);
48405 + BUG_ON(get_current_context()->trans->atom != NULL);
48406 +
48407 + lock_page(page);
48408 + if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48409 + void *kaddr;
48410 +
48411 + kaddr = kmap_atomic(page, KM_USER0);
48412 + memset(kaddr, 0, page_off);
48413 + memset(kaddr + page_off + to_page, 0,
48414 + PAGE_CACHE_SIZE - (page_off + to_page));
48415 + flush_dcache_page(page);
48416 + kunmap_atomic(kaddr, KM_USER0);
48417 + }
48418 +
48419 + written = filemap_copy_from_user(page, page_off, buf, to_page);
48420 + if (unlikely(written != to_page)) {
48421 + unlock_page(page);
48422 + result = RETERR(-EFAULT);
48423 + break;
48424 + }
48425 +
48426 + flush_dcache_page(page);
48427 + reiser4_set_page_dirty_internal(page);
48428 + unlock_page(page);
48429 + nr_dirty++;
48430 +
48431 + mark_page_accessed(page);
48432 + SetPageUptodate(page);
48433 +
48434 + if (jnodes[i]->blocknr == 0)
48435 + have_to_update_extent ++;
48436 +
48437 + page_off = 0;
48438 + buf += to_page;
48439 + left -= to_page;
48440 + BUG_ON(get_current_context()->trans->atom != NULL);
48441 + }
48442 +
48443 + if (have_to_update_extent) {
48444 + update_extents(file, jnodes, nr_dirty, *pos);
48445 + } else {
48446 + for (i = 0; i < nr_dirty; i ++) {
48447 + int ret;
48448 + spin_lock_jnode(jnodes[i]);
48449 + ret = reiser4_try_capture(jnodes[i],
48450 + ZNODE_WRITE_LOCK, 0);
48451 + BUG_ON(ret != 0);
48452 + jnode_make_dirty_locked(jnodes[i]);
48453 + spin_unlock_jnode(jnodes[i]);
48454 + }
48455 + }
48456 +out:
48457 + for (i = 0; i < nr_pages; i ++) {
48458 + page_cache_release(jnode_page(jnodes[i]));
48459 + JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
48460 + jput(jnodes[i]);
48461 + }
48462 +
48463 + /* the only errors handled so far is ENOMEM and
48464 + EFAULT on copy_from_user */
48465 +
48466 + return (count - left) ? (count - left) : result;
48467 +}
48468 +
48469 +static inline void zero_page(struct page *page)
48470 +{
48471 + char *kaddr = kmap_atomic(page, KM_USER0);
48472 +
48473 + memset(kaddr, 0, PAGE_CACHE_SIZE);
48474 + flush_dcache_page(page);
48475 + kunmap_atomic(kaddr, KM_USER0);
48476 + SetPageUptodate(page);
48477 + unlock_page(page);
48478 +}
48479 +
48480 +int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
48481 + struct page *page)
48482 +{
48483 + jnode *j;
48484 + struct address_space *mapping;
48485 + unsigned long index;
48486 + oid_t oid;
48487 + reiser4_block_nr block;
48488 +
48489 + mapping = page->mapping;
48490 + oid = get_inode_oid(mapping->host);
48491 + index = page->index;
48492 +
48493 + switch (state_of_extent(ext)) {
48494 + case HOLE_EXTENT:
48495 + /*
48496 + * it is possible to have hole page with jnode, if page was
48497 + * eflushed previously.
48498 + */
48499 + j = jfind(mapping, index);
48500 + if (j == NULL) {
48501 + zero_page(page);
48502 + return 0;
48503 + }
48504 + spin_lock_jnode(j);
48505 + if (!jnode_page(j)) {
48506 + jnode_attach_page(j, page);
48507 + } else {
48508 + BUG_ON(jnode_page(j) != page);
48509 + assert("vs-1504", jnode_page(j) == page);
48510 + }
48511 + block = *jnode_get_io_block(j);
48512 + spin_unlock_jnode(j);
48513 + if (block == 0) {
48514 + zero_page(page);
48515 + jput(j);
48516 + return 0;
48517 + }
48518 + break;
48519 +
48520 + case ALLOCATED_EXTENT:
48521 + j = jnode_of_page(page);
48522 + if (IS_ERR(j))
48523 + return PTR_ERR(j);
48524 + if (*jnode_get_block(j) == 0) {
48525 + reiser4_block_nr blocknr;
48526 +
48527 + blocknr = extent_get_start(ext) + pos;
48528 + jnode_set_block(j, &blocknr);
48529 + } else
48530 + assert("vs-1403",
48531 + j->blocknr == extent_get_start(ext) + pos);
48532 + break;
48533 +
48534 + case UNALLOCATED_EXTENT:
48535 + j = jfind(mapping, index);
48536 + assert("nikita-2688", j);
48537 + assert("vs-1426", jnode_page(j) == NULL);
48538 +
48539 + spin_lock_jnode(j);
48540 + jnode_attach_page(j, page);
48541 + spin_unlock_jnode(j);
48542 + break;
48543 +
48544 + default:
48545 + warning("vs-957", "wrong extent\n");
48546 + return RETERR(-EIO);
48547 + }
48548 +
48549 + BUG_ON(j == 0);
48550 + reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
48551 + jput(j);
48552 + return 0;
48553 +}
48554 +
48555 +/* Implements plugin->u.item.s.file.read operation for extent items. */
48556 +int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
48557 +{
48558 + int result;
48559 + struct page *page;
48560 + unsigned long cur_page, next_page;
48561 + unsigned long page_off, count;
48562 + struct address_space *mapping;
48563 + loff_t file_off;
48564 + uf_coord_t *uf_coord;
48565 + coord_t *coord;
48566 + extent_coord_extension_t *ext_coord;
48567 + unsigned long nr_pages;
48568 + char *kaddr;
48569 +
48570 + assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
48571 + assert("vs-572", flow->user == 1);
48572 + assert("vs-1351", flow->length > 0);
48573 +
48574 + uf_coord = &hint->ext_coord;
48575 +
48576 + check_uf_coord(uf_coord, NULL);
48577 + assert("vs-33", uf_coord->lh == &hint->lh);
48578 +
48579 + coord = &uf_coord->coord;
48580 + assert("vs-1119", znode_is_rlocked(coord->node));
48581 + assert("vs-1120", znode_is_loaded(coord->node));
48582 + assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
48583 +
48584 + mapping = file->f_dentry->d_inode->i_mapping;
48585 + ext_coord = &uf_coord->extension.extent;
48586 +
48587 + /* offset in a file to start read from */
48588 + file_off = get_key_offset(&flow->key);
48589 + /* offset within the page to start read from */
48590 + page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
48591 + /* bytes which can be read from the page which contains file_off */
48592 + count = PAGE_CACHE_SIZE - page_off;
48593 +
48594 + /* index of page containing offset read is to start from */
48595 + cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
48596 + next_page = cur_page;
48597 + /* number of pages flow spans over */
48598 + nr_pages =
48599 + ((file_off + flow->length + PAGE_CACHE_SIZE -
48600 + 1) >> PAGE_CACHE_SHIFT) - cur_page;
48601 +
48602 + /* we start having twig node read locked. However, we do not want to
48603 + keep that lock all the time readahead works. So, set a sel and
48604 + release twig node. */
48605 + reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
48606 + /* &hint->lh is done-ed */
48607 +
48608 + do {
48609 + reiser4_txn_restart_current();
48610 + page = read_mapping_page(mapping, cur_page, file);
48611 + if (IS_ERR(page))
48612 + return PTR_ERR(page);
48613 + lock_page(page);
48614 + if (!PageUptodate(page)) {
48615 + unlock_page(page);
48616 + page_cache_release(page);
48617 + warning("jmacd-97178", "extent_read: page is not up to date");
48618 + return RETERR(-EIO);
48619 + }
48620 + mark_page_accessed(page);
48621 + unlock_page(page);
48622 +
48623 + /* If users can be writing to this page using arbitrary virtual
48624 + addresses, take care about potential aliasing before reading
48625 + the page on the kernel side.
48626 + */
48627 + if (mapping_writably_mapped(mapping))
48628 + flush_dcache_page(page);
48629 +
48630 + assert("nikita-3034", reiser4_schedulable());
48631 +
48632 + /* number of bytes which are to be read from the page */
48633 + if (count > flow->length)
48634 + count = flow->length;
48635 +
48636 + result = fault_in_pages_writeable(flow->data, count);
48637 + if (result) {
48638 + page_cache_release(page);
48639 + return RETERR(-EFAULT);
48640 + }
48641 +
48642 + kaddr = kmap_atomic(page, KM_USER0);
48643 + result = __copy_to_user_inatomic(flow->data,
48644 + kaddr + page_off, count);
48645 + kunmap_atomic(kaddr, KM_USER0);
48646 + if (result != 0) {
48647 + kaddr = kmap(page);
48648 + result = __copy_to_user(flow->data, kaddr + page_off, count);
48649 + kunmap(page);
48650 + if (unlikely(result))
48651 + return RETERR(-EFAULT);
48652 + }
48653 +
48654 + page_cache_release(page);
48655 +
48656 + /* increase key (flow->key), update user area pointer (flow->data) */
48657 + move_flow_forward(flow, count);
48658 +
48659 + page_off = 0;
48660 + cur_page ++;
48661 + count = PAGE_CACHE_SIZE;
48662 + nr_pages--;
48663 + } while (flow->length);
48664 +
48665 + return 0;
48666 +}
48667 +
48668 +/*
48669 + plugin->s.file.readpage
48670 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
48671 + or
48672 + filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_extent
48673 +
48674 + At the beginning: coord->node is read locked, zloaded, page is
48675 + locked, coord is set to existing unit inside of extent item (it is not necessary that coord matches to page->index)
48676 +*/
48677 +int reiser4_readpage_extent(void *vp, struct page *page)
48678 +{
48679 + uf_coord_t *uf_coord = vp;
48680 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
48681 + ON_DEBUG(reiser4_key key);
48682 +
48683 + assert("vs-1040", PageLocked(page));
48684 + assert("vs-1050", !PageUptodate(page));
48685 + assert("vs-1039", page->mapping && page->mapping->host);
48686 +
48687 + assert("vs-1044", znode_is_loaded(coord->node));
48688 + assert("vs-758", item_is_extent(coord));
48689 + assert("vs-1046", coord_is_existing_unit(coord));
48690 + assert("vs-1045", znode_is_rlocked(coord->node));
48691 + assert("vs-1047",
48692 + page->mapping->host->i_ino ==
48693 + get_key_objectid(item_key_by_coord(coord, &key)));
48694 + check_uf_coord(uf_coord, NULL);
48695 +
48696 + return reiser4_do_readpage_extent(
48697 + ext_by_ext_coord(uf_coord),
48698 + uf_coord->extension.extent.pos_in_unit, page);
48699 +}
48700 +
48701 +/**
48702 + * get_block_address_extent
48703 + * @coord:
48704 + * @block:
48705 + * @result:
48706 + *
48707 + *
48708 + */
48709 +int get_block_address_extent(const coord_t *coord, sector_t block,
48710 + sector_t *result)
48711 +{
48712 + reiser4_extent *ext;
48713 +
48714 + if (!coord_is_existing_unit(coord))
48715 + return RETERR(-EINVAL);
48716 +
48717 + ext = extent_by_coord(coord);
48718 +
48719 + if (state_of_extent(ext) != ALLOCATED_EXTENT)
48720 + /* FIXME: bad things may happen if it is unallocated extent */
48721 + *result = 0;
48722 + else {
48723 + reiser4_key key;
48724 +
48725 + unit_key_by_coord(coord, &key);
48726 + assert("vs-1645",
48727 + block >= get_key_offset(&key) >> current_blocksize_bits);
48728 + assert("vs-1646",
48729 + block <
48730 + (get_key_offset(&key) >> current_blocksize_bits) +
48731 + extent_get_width(ext));
48732 + *result =
48733 + extent_get_start(ext) + (block -
48734 + (get_key_offset(&key) >>
48735 + current_blocksize_bits));
48736 + }
48737 + return 0;
48738 +}
48739 +
48740 +/*
48741 + plugin->u.item.s.file.append_key
48742 + key of first byte which is the next to last byte by addressed by this extent
48743 +*/
48744 +reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
48745 +{
48746 + item_key_by_coord(coord, key);
48747 + set_key_offset(key,
48748 + get_key_offset(key) + reiser4_extent_size(coord,
48749 + nr_units_extent
48750 + (coord)));
48751 +
48752 + assert("vs-610", get_key_offset(key)
48753 + && (get_key_offset(key) & (current_blocksize - 1)) == 0);
48754 + return key;
48755 +}
48756 +
48757 +/* plugin->u.item.s.file.init_coord_extension */
48758 +void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
48759 +{
48760 + coord_t *coord;
48761 + extent_coord_extension_t *ext_coord;
48762 + reiser4_key key;
48763 + loff_t offset;
48764 +
48765 + assert("vs-1295", uf_coord->valid == 0);
48766 +
48767 + coord = &uf_coord->coord;
48768 + assert("vs-1288", coord_is_iplug_set(coord));
48769 + assert("vs-1327", znode_is_loaded(coord->node));
48770 +
48771 + if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
48772 + return;
48773 +
48774 + ext_coord = &uf_coord->extension.extent;
48775 + ext_coord->nr_units = nr_units_extent(coord);
48776 + ext_coord->ext_offset =
48777 + (char *)extent_by_coord(coord) - zdata(coord->node);
48778 + ext_coord->width = extent_get_width(extent_by_coord(coord));
48779 + ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
48780 + uf_coord->valid = 1;
48781 +
48782 + /* pos_in_unit is the only uninitialized field in extended coord */
48783 + if (coord->between == AFTER_UNIT) {
48784 + assert("vs-1330",
48785 + coord->unit_pos == nr_units_extent(coord) - 1);
48786 +
48787 + ext_coord->pos_in_unit = ext_coord->width - 1;
48788 + } else {
48789 + /* AT_UNIT */
48790 + unit_key_by_coord(coord, &key);
48791 + offset = get_key_offset(&key);
48792 +
48793 + assert("vs-1328", offset <= lookuped);
48794 + assert("vs-1329",
48795 + lookuped <
48796 + offset + ext_coord->width * current_blocksize);
48797 + ext_coord->pos_in_unit =
48798 + ((lookuped - offset) >> current_blocksize_bits);
48799 + }
48800 +}
48801 +
48802 +/*
48803 + * Local variables:
48804 + * c-indentation-style: "K&R"
48805 + * mode-name: "LC"
48806 + * c-basic-offset: 8
48807 + * tab-width: 8
48808 + * fill-column: 79
48809 + * scroll-step: 1
48810 + * End:
48811 + */
48812 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.20/fs/reiser4/plugin/item/extent_flush_ops.c
48813 --- linux-2.6.20.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 03:00:00.000000000 +0300
48814 +++ linux-2.6.20/fs/reiser4/plugin/item/extent_flush_ops.c 2007-05-06 14:50:43.811010720 +0400
48815 @@ -0,0 +1,1028 @@
48816 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48817 +
48818 +#include "item.h"
48819 +#include "../../tree.h"
48820 +#include "../../jnode.h"
48821 +#include "../../super.h"
48822 +#include "../../flush.h"
48823 +#include "../../carry.h"
48824 +#include "../object.h"
48825 +
48826 +#include <linux/pagemap.h>
48827 +
48828 +static reiser4_block_nr extent_unit_start(const coord_t * item);
48829 +
48830 +/* Return either first or last extent (depending on @side) of the item
48831 + @coord is set to. Set @pos_in_unit either to first or to last block
48832 + of extent. */
48833 +static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
48834 + reiser4_block_nr * pos_in_unit)
48835 +{
48836 + reiser4_extent *ext;
48837 +
48838 + if (side == LEFT_SIDE) {
48839 + /* get first extent of item */
48840 + ext = extent_item(coord);
48841 + *pos_in_unit = 0;
48842 + } else {
48843 + /* get last extent of item and last position within it */
48844 + assert("vs-363", side == RIGHT_SIDE);
48845 + ext = extent_item(coord) + coord_last_unit_pos(coord);
48846 + *pos_in_unit = extent_get_width(ext) - 1;
48847 + }
48848 +
48849 + return ext;
48850 +}
48851 +
48852 +/* item_plugin->f.utmost_child */
48853 +/* Return the child. Coord is set to extent item. Find jnode corresponding
48854 + either to first or to last unformatted node pointed by the item */
48855 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
48856 +{
48857 + reiser4_extent *ext;
48858 + reiser4_block_nr pos_in_unit;
48859 +
48860 + ext = extent_utmost_ext(coord, side, &pos_in_unit);
48861 +
48862 + switch (state_of_extent(ext)) {
48863 + case HOLE_EXTENT:
48864 + *childp = NULL;
48865 + return 0;
48866 + case ALLOCATED_EXTENT:
48867 + case UNALLOCATED_EXTENT:
48868 + break;
48869 + default:
48870 + /* this should never happen */
48871 + assert("vs-1417", 0);
48872 + }
48873 +
48874 + {
48875 + reiser4_key key;
48876 + reiser4_tree *tree;
48877 + unsigned long index;
48878 +
48879 + if (side == LEFT_SIDE) {
48880 + /* get key of first byte addressed by the extent */
48881 + item_key_by_coord(coord, &key);
48882 + } else {
48883 + /* get key of byte which next after last byte addressed by the extent */
48884 + append_key_extent(coord, &key);
48885 + }
48886 +
48887 + assert("vs-544",
48888 + (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
48889 + /* index of first or last (depending on @side) page addressed
48890 + by the extent */
48891 + index =
48892 + (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
48893 + if (side == RIGHT_SIDE)
48894 + index--;
48895 +
48896 + tree = coord->node->zjnode.tree;
48897 + *childp = jlookup(tree, get_key_objectid(&key), index);
48898 + }
48899 +
48900 + return 0;
48901 +}
48902 +
48903 +/* item_plugin->f.utmost_child_real_block */
48904 +/* Return the child's block, if allocated. */
48905 +int
48906 +utmost_child_real_block_extent(const coord_t * coord, sideof side,
48907 + reiser4_block_nr * block)
48908 +{
48909 + reiser4_extent *ext;
48910 +
48911 + ext = extent_by_coord(coord);
48912 +
48913 + switch (state_of_extent(ext)) {
48914 + case ALLOCATED_EXTENT:
48915 + *block = extent_get_start(ext);
48916 + if (side == RIGHT_SIDE)
48917 + *block += extent_get_width(ext) - 1;
48918 + break;
48919 + case HOLE_EXTENT:
48920 + case UNALLOCATED_EXTENT:
48921 + *block = 0;
48922 + break;
48923 + default:
48924 + /* this should never happen */
48925 + assert("vs-1418", 0);
48926 + }
48927 +
48928 + return 0;
48929 +}
48930 +
48931 +/* item_plugin->f.scan */
48932 +/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
48933 + This scan continues, advancing the parent coordinate, until either it encounters a
48934 + formatted child or it finishes scanning this node.
48935 +
48936 + If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
48937 + not sure this is last property (same atom) is enforced, but it should be the case since
48938 + one atom must write the parent and the others must read the parent, thus fusing?). In
48939 + any case, the code below asserts this case for unallocated extents. Unallocated
48940 + extents are thus optimized because we can skip to the endpoint when scanning.
48941 +
48942 + It returns control to reiser4_scan_extent, handles these terminating conditions,
48943 + e.g., by loading the next twig.
48944 +*/
48945 +int reiser4_scan_extent(flush_scan * scan)
48946 +{
48947 + coord_t coord;
48948 + jnode *neighbor;
48949 + unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
48950 + reiser4_block_nr unit_start;
48951 + __u64 oid;
48952 + reiser4_key key;
48953 + int ret = 0, allocated, incr;
48954 + reiser4_tree *tree;
48955 +
48956 + if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
48957 + scan->stop = 1;
48958 + return 0; /* Race with truncate, this node is already
48959 + * truncated. */
48960 + }
48961 +
48962 + coord_dup(&coord, &scan->parent_coord);
48963 +
48964 + assert("jmacd-1404", !reiser4_scan_finished(scan));
48965 + assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
48966 + assert("jmacd-1406", jnode_is_unformatted(scan->node));
48967 +
48968 + /* The scan_index variable corresponds to the current page index of the
48969 + unformatted block scan position. */
48970 + scan_index = index_jnode(scan->node);
48971 +
48972 + assert("jmacd-7889", item_is_extent(&coord));
48973 +
48974 + repeat:
48975 + /* objectid of file */
48976 + oid = get_key_objectid(item_key_by_coord(&coord, &key));
48977 +
48978 + allocated = !extent_is_unallocated(&coord);
48979 + /* Get the values of this extent unit: */
48980 + unit_index = extent_unit_index(&coord);
48981 + unit_width = extent_unit_width(&coord);
48982 + unit_start = extent_unit_start(&coord);
48983 +
48984 + assert("jmacd-7187", unit_width > 0);
48985 + assert("jmacd-7188", scan_index >= unit_index);
48986 + assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
48987 +
48988 + /* Depending on the scan direction, we set different maximum values for scan_index
48989 + (scan_max) and the number of nodes that would be passed if the scan goes the
48990 + entire way (scan_dist). Incr is an integer reflecting the incremental
48991 + direction of scan_index. */
48992 + if (reiser4_scanning_left(scan)) {
48993 + scan_max = unit_index;
48994 + scan_dist = scan_index - unit_index;
48995 + incr = -1;
48996 + } else {
48997 + scan_max = unit_index + unit_width - 1;
48998 + scan_dist = scan_max - unit_index;
48999 + incr = +1;
49000 + }
49001 +
49002 + tree = coord.node->zjnode.tree;
49003 +
49004 + /* If the extent is allocated we have to check each of its blocks. If the extent
49005 + is unallocated we can skip to the scan_max. */
49006 + if (allocated) {
49007 + do {
49008 + neighbor = jlookup(tree, oid, scan_index);
49009 + if (neighbor == NULL)
49010 + goto stop_same_parent;
49011 +
49012 + if (scan->node != neighbor
49013 + && !reiser4_scan_goto(scan, neighbor)) {
49014 + /* @neighbor was jput() by reiser4_scan_goto */
49015 + goto stop_same_parent;
49016 + }
49017 +
49018 + ret = scan_set_current(scan, neighbor, 1, &coord);
49019 + if (ret != 0) {
49020 + goto exit;
49021 + }
49022 +
49023 + /* reference to @neighbor is stored in @scan, no need
49024 + to jput(). */
49025 + scan_index += incr;
49026 +
49027 + } while (incr + scan_max != scan_index);
49028 +
49029 + } else {
49030 + /* Optimized case for unallocated extents, skip to the end. */
49031 + neighbor = jlookup(tree, oid, scan_max /*index */ );
49032 + if (neighbor == NULL) {
49033 + /* Race with truncate */
49034 + scan->stop = 1;
49035 + ret = 0;
49036 + goto exit;
49037 + }
49038 +
49039 + assert("zam-1043",
49040 + reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
49041 +
49042 + ret = scan_set_current(scan, neighbor, scan_dist, &coord);
49043 + if (ret != 0) {
49044 + goto exit;
49045 + }
49046 + }
49047 +
49048 + if (coord_sideof_unit(&coord, scan->direction) == 0
49049 + && item_is_extent(&coord)) {
49050 + /* Continue as long as there are more extent units. */
49051 +
49052 + scan_index =
49053 + extent_unit_index(&coord) +
49054 + (reiser4_scanning_left(scan) ?
49055 + extent_unit_width(&coord) - 1 : 0);
49056 + goto repeat;
49057 + }
49058 +
49059 + if (0) {
49060 + stop_same_parent:
49061 +
49062 + /* If we are scanning left and we stop in the middle of an allocated
49063 + extent, we know the preceder immediately.. */
49064 + /* middle of extent is (scan_index - unit_index) != 0. */
49065 + if (reiser4_scanning_left(scan) &&
49066 + (scan_index - unit_index) != 0) {
49067 + /* FIXME(B): Someone should step-through and verify that this preceder
49068 + calculation is indeed correct. */
49069 + /* @unit_start is starting block (number) of extent
49070 + unit. Flush stopped at the @scan_index block from
49071 + the beginning of the file, which is (scan_index -
49072 + unit_index) block within extent.
49073 + */
49074 + if (unit_start) {
49075 + /* skip preceder update when we are at hole */
49076 + scan->preceder_blk =
49077 + unit_start + scan_index - unit_index;
49078 + check_preceder(scan->preceder_blk);
49079 + }
49080 + }
49081 +
49082 + /* In this case, we leave coord set to the parent of scan->node. */
49083 + scan->stop = 1;
49084 +
49085 + } else {
49086 + /* In this case, we are still scanning, coord is set to the next item which is
49087 + either off-the-end of the node or not an extent. */
49088 + assert("jmacd-8912", scan->stop == 0);
49089 + assert("jmacd-7812",
49090 + (coord_is_after_sideof_unit(&coord, scan->direction)
49091 + || !item_is_extent(&coord)));
49092 + }
49093 +
49094 + ret = 0;
49095 + exit:
49096 + return ret;
49097 +}
49098 +
49099 +/* ask block allocator for some blocks */
49100 +static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
49101 + reiser4_block_nr wanted_count,
49102 + reiser4_block_nr *first_allocated,
49103 + reiser4_block_nr *allocated,
49104 + block_stage_t block_stage)
49105 +{
49106 + *allocated = wanted_count;
49107 + preceder->max_dist = 0; /* scan whole disk, if needed */
49108 +
49109 + /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
49110 + preceder->block_stage = block_stage;
49111 +
49112 + /* FIXME: we do not handle errors here now */
49113 + check_me("vs-420",
49114 + reiser4_alloc_blocks(preceder, first_allocated, allocated,
49115 + BA_PERMANENT) == 0);
49116 + /* update flush_pos's preceder to last allocated block number */
49117 + preceder->blk = *first_allocated + *allocated - 1;
49118 +}
49119 +
49120 +/* when on flush time unallocated extent is to be replaced with allocated one it may happen that one unallocated extent
49121 + will have to be replaced with set of allocated extents. In this case insert_into_item will be called which may have
49122 + to add new nodes into tree. Space for that is taken from inviolable reserve (5%). */
49123 +static reiser4_block_nr reserve_replace(void)
49124 +{
49125 + reiser4_block_nr grabbed, needed;
49126 +
49127 + grabbed = get_current_context()->grabbed_blocks;
49128 + needed = estimate_one_insert_into_item(current_tree);
49129 + check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
49130 + return grabbed;
49131 +}
49132 +
49133 +static void free_replace_reserved(reiser4_block_nr grabbed)
49134 +{
49135 + reiser4_context *ctx;
49136 +
49137 + ctx = get_current_context();
49138 + grabbed2free(ctx, get_super_private(ctx->super),
49139 + ctx->grabbed_blocks - grabbed);
49140 +}
49141 +
49142 +/* Block offset of first block addressed by unit */
49143 +__u64 extent_unit_index(const coord_t * item)
49144 +{
49145 + reiser4_key key;
49146 +
49147 + assert("vs-648", coord_is_existing_unit(item));
49148 + unit_key_by_coord(item, &key);
49149 + return get_key_offset(&key) >> current_blocksize_bits;
49150 +}
49151 +
49152 +/* AUDIT shouldn't return value be of reiser4_block_nr type?
49153 + Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
49154 +__u64 extent_unit_width(const coord_t * item)
49155 +{
49156 + assert("vs-649", coord_is_existing_unit(item));
49157 + return width_by_coord(item);
49158 +}
49159 +
49160 +/* Starting block location of this unit */
49161 +static reiser4_block_nr extent_unit_start(const coord_t * item)
49162 +{
49163 + return extent_get_start(extent_by_coord(item));
49164 +}
49165 +
49166 +/**
49167 + * split_allocated_extent -
49168 + * @coord:
49169 + * @pos_in_unit:
49170 + *
49171 + * replace allocated extent with two allocated extents
49172 + */
49173 +static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
49174 +{
49175 + int result;
49176 + struct replace_handle *h;
49177 + reiser4_extent *ext;
49178 + reiser4_block_nr grabbed;
49179 +
49180 + ext = extent_by_coord(coord);
49181 + assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
49182 + assert("vs-1411", extent_get_width(ext) > pos_in_unit);
49183 +
49184 + h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
49185 + if (h == NULL)
49186 + return RETERR(-ENOMEM);
49187 + h->coord = coord;
49188 + h->lh = znode_lh(coord->node);
49189 + h->pkey = &h->key;
49190 + unit_key_by_coord(coord, h->pkey);
49191 + set_key_offset(h->pkey,
49192 + (get_key_offset(h->pkey) +
49193 + pos_in_unit * current_blocksize));
49194 + reiser4_set_extent(&h->overwrite, extent_get_start(ext),
49195 + pos_in_unit);
49196 + reiser4_set_extent(&h->new_extents[0],
49197 + extent_get_start(ext) + pos_in_unit,
49198 + extent_get_width(ext) - pos_in_unit);
49199 + h->nr_new_extents = 1;
49200 + h->flags = COPI_DONT_SHIFT_LEFT;
49201 + h->paste_key = h->key;
49202 +
49203 + /* reserve space for extent unit paste, @grabbed is reserved before */
49204 + grabbed = reserve_replace();
49205 + result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
49206 + extent */);
49207 + /* restore reserved */
49208 + free_replace_reserved(grabbed);
49209 + kfree(h);
49210 + return result;
49211 +}
49212 +
49213 +/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is
49214 + one). Return 1 if it succeeded, 0 - otherwise */
49215 +static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
49216 + reiser4_extent *replace)
49217 +{
49218 + assert("vs-1415", extent_by_coord(coord) == ext);
49219 +
49220 + if (coord->unit_pos == 0
49221 + || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
49222 + /* @ext either does not exist or is not allocated extent */
49223 + return 0;
49224 + if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
49225 + extent_get_start(replace))
49226 + return 0;
49227 +
49228 + /* we can glue, widen previous unit */
49229 + extent_set_width(ext - 1,
49230 + extent_get_width(ext - 1) + extent_get_width(replace));
49231 +
49232 + if (extent_get_width(ext) != extent_get_width(replace)) {
49233 + /* make current extent narrower */
49234 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
49235 + extent_set_start(ext,
49236 + extent_get_start(ext) +
49237 + extent_get_width(replace));
49238 + extent_set_width(ext,
49239 + extent_get_width(ext) -
49240 + extent_get_width(replace));
49241 + } else {
49242 + /* current extent completely glued with its left neighbor, remove it */
49243 + coord_t from, to;
49244 +
49245 + coord_dup(&from, coord);
49246 + from.unit_pos = nr_units_extent(coord) - 1;
49247 + coord_dup(&to, &from);
49248 +
49249 + /* currently cut from extent can cut either from the beginning or from the end. Move place which got
49250 + freed after unit removal to end of item */
49251 + memmove(ext, ext + 1,
49252 + (from.unit_pos -
49253 + coord->unit_pos) * sizeof(reiser4_extent));
49254 + /* wipe part of item which is going to be cut, so that node_check will not be confused */
49255 + cut_node_content(&from, &to, NULL, NULL, NULL);
49256 + }
49257 + znode_make_dirty(coord->node);
49258 + /* move coord back */
49259 + coord->unit_pos--;
49260 + return 1;
49261 +}
49262 +
49263 +/**
49264 + * conv_extent - replace extent with 2 ones
49265 + * @coord: coordinate of extent to be replaced
49266 + * @replace: extent to overwrite the one @coord is set to
49267 + *
49268 + * Overwrites extent @coord is set to and paste one extent unit after
49269 + * overwritten one if @replace is shorter than initial extent
49270 + */
49271 +static int conv_extent(coord_t *coord, reiser4_extent *replace)
49272 +{
49273 + int result;
49274 + struct replace_handle *h;
49275 + reiser4_extent *ext;
49276 + reiser4_block_nr start, width, new_width;
49277 + reiser4_block_nr grabbed;
49278 + extent_state state;
49279 +
49280 + ext = extent_by_coord(coord);
49281 + state = state_of_extent(ext);
49282 + start = extent_get_start(ext);
49283 + width = extent_get_width(ext);
49284 + new_width = extent_get_width(replace);
49285 +
49286 + assert("vs-1458", (state == UNALLOCATED_EXTENT ||
49287 + state == ALLOCATED_EXTENT));
49288 + assert("vs-1459", width >= new_width);
49289 +
49290 + if (try_to_merge_with_left(coord, ext, replace)) {
49291 + /* merged @replace with left neighbor. Current unit is either
49292 + removed or narrowed */
49293 + return 0;
49294 + }
49295 +
49296 + if (width == new_width) {
49297 + /* replace current extent with @replace */
49298 + *ext = *replace;
49299 + znode_make_dirty(coord->node);
49300 + return 0;
49301 + }
49302 +
49303 + h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
49304 + if (h == NULL)
49305 + return RETERR(-ENOMEM);
49306 + h->coord = coord;
49307 + h->lh = znode_lh(coord->node);
49308 + h->pkey = &h->key;
49309 + unit_key_by_coord(coord, h->pkey);
49310 + set_key_offset(h->pkey,
49311 + (get_key_offset(h->pkey) + new_width * current_blocksize));
49312 + h->overwrite = *replace;
49313 +
49314 + /* replace @ext with @replace and padding extent */
49315 + reiser4_set_extent(&h->new_extents[0],
49316 + (state == ALLOCATED_EXTENT) ?
49317 + (start + new_width) :
49318 + UNALLOCATED_EXTENT_START,
49319 + width - new_width);
49320 + h->nr_new_extents = 1;
49321 + h->flags = COPI_DONT_SHIFT_LEFT;
49322 + h->paste_key = h->key;
49323 +
49324 + /* reserve space for extent unit paste, @grabbed is reserved before */
49325 + grabbed = reserve_replace();
49326 + result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
49327 + extent */);
49328 +
49329 + /* restore reserved */
49330 + free_replace_reserved(grabbed);
49331 + kfree(h);
49332 + return result;
49333 +}
49334 +
49335 +/**
49336 + * assign_real_blocknrs
49337 + * @flush_pos:
49338 + * @oid: objectid of file jnodes to assign block number to belongs to
49339 + * @index: first jnode on the range
49340 + * @count: number of jnodes to assign block numbers to
49341 + * @first: start of allocated block range
49342 + *
49343 + * Assigns block numbers to each of @count jnodes. Index of first jnode is
49344 + * @index. Jnodes get lookuped with jlookup.
49345 + */
49346 +static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
49347 + unsigned long index, reiser4_block_nr count,
49348 + reiser4_block_nr first)
49349 +{
49350 + unsigned long i;
49351 + reiser4_tree *tree;
49352 + txn_atom *atom;
49353 + int nr;
49354 +
49355 + atom = atom_locked_by_fq(flush_pos->fq);
49356 + assert("vs-1468", atom);
49357 + BUG_ON(atom == NULL);
49358 +
49359 + nr = 0;
49360 + tree = current_tree;
49361 + for (i = 0; i < count; ++i, ++index) {
49362 + jnode *node;
49363 +
49364 + node = jlookup(tree, oid, index);
49365 + assert("", node != NULL);
49366 + BUG_ON(node == NULL);
49367 +
49368 + spin_lock_jnode(node);
49369 + assert("", !jnode_is_flushprepped(node));
49370 + assert("vs-1475", node->atom == atom);
49371 + assert("vs-1476", atomic_read(&node->x_count) > 0);
49372 +
49373 + JF_CLR(node, JNODE_FLUSH_RESERVED);
49374 + jnode_set_block(node, &first);
49375 + unformatted_make_reloc(node, flush_pos->fq);
49376 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
49377 + FQ_LIST, 0));
49378 + spin_unlock_jnode(node);
49379 + first++;
49380 +
49381 + atomic_dec(&node->x_count);
49382 + nr ++;
49383 + }
49384 +
49385 + spin_unlock_atom(atom);
49386 + return;
49387 +}
49388 +
49389 +/**
49390 + * make_node_ovrwr - assign node to overwrite set
49391 + * @jnodes: overwrite set list head
49392 + * @node: jnode to belong to overwrite set
49393 + *
49394 + * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes
49395 + * which is an accumulator for nodes before they get to overwrite set list of
49396 + * atom.
49397 + */
49398 +static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
49399 +{
49400 + spin_lock_jnode(node);
49401 +
49402 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
49403 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
49404 +
49405 + JF_SET(node, JNODE_OVRWR);
49406 + list_move_tail(&node->capture_link, jnodes);
49407 + ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
49408 +
49409 + spin_unlock_jnode(node);
49410 +}
49411 +
49412 +/**
49413 + * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
49414 + * @flush_pos: flush position
49415 + * @oid: objectid of file jnodes belong to
49416 + * @index: starting index
49417 + * @width: extent width
49418 + *
49419 + * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
49420 + * overwrite set. Starting from the one with index @index. If end of slum is
49421 + * detected (node is not found or flushprepped) - stop iterating and set flush
49422 + * position's state to POS_INVALID.
49423 + */
49424 +static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
49425 + unsigned long index, reiser4_block_nr width)
49426 +{
49427 + unsigned long i;
49428 + reiser4_tree *tree;
49429 + jnode *node;
49430 + txn_atom *atom;
49431 + LIST_HEAD(jnodes);
49432 +
49433 + tree = current_tree;
49434 +
49435 + atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
49436 + assert("vs-1478", atom);
49437 +
49438 + for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
49439 + node = jlookup(tree, oid, index);
49440 + if (!node) {
49441 + flush_pos->state = POS_INVALID;
49442 + break;
49443 + }
49444 + if (jnode_check_flushprepped(node)) {
49445 + flush_pos->state = POS_INVALID;
49446 + atomic_dec(&node->x_count);
49447 + break;
49448 + }
49449 + if (node->atom != atom) {
49450 + flush_pos->state = POS_INVALID;
49451 + atomic_dec(&node->x_count);
49452 + break;
49453 + }
49454 + make_node_ovrwr(&jnodes, node);
49455 + atomic_dec(&node->x_count);
49456 + }
49457 +
49458 + list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
49459 + spin_unlock_atom(atom);
49460 +}
49461 +
49462 +/**
49463 + * allocated_extent_slum_size
49464 + * @flush_pos:
49465 + * @oid:
49466 + * @index:
49467 + * @count:
49468 + *
49469 + *
49470 + */
49471 +static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
49472 + unsigned long index, unsigned long count)
49473 +{
49474 + unsigned long i;
49475 + reiser4_tree *tree;
49476 + txn_atom *atom;
49477 + int nr;
49478 +
49479 + atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
49480 + assert("vs-1468", atom);
49481 +
49482 + nr = 0;
49483 + tree = current_tree;
49484 + for (i = 0; i < count; ++i, ++index) {
49485 + jnode *node;
49486 +
49487 + node = jlookup(tree, oid, index);
49488 + if (!node)
49489 + break;
49490 +
49491 + if (jnode_check_flushprepped(node)) {
49492 + atomic_dec(&node->x_count);
49493 + break;
49494 + }
49495 +
49496 + if (node->atom != atom) {
49497 + /*
49498 + * this is possible on overwrite: extent_write may
49499 + * capture several unformatted nodes without capturing
49500 + * any formatted nodes.
49501 + */
49502 + atomic_dec(&node->x_count);
49503 + break;
49504 + }
49505 +
49506 + assert("vs-1476", atomic_read(&node->x_count) > 1);
49507 + atomic_dec(&node->x_count);
49508 + nr ++;
49509 + }
49510 +
49511 + spin_unlock_atom(atom);
49512 + return nr;
49513 +}
49514 +
49515 +/**
49516 + * alloc_extent
49517 + * @flush_pos:
49518 + *
49519 + *
49520 + * this is called by handle_pos_on_twig to proceed extent unit flush_pos->coord
49521 + * is set to. It is to prepare for flushing sequence of not flushprepped nodes
49522 + * (slum). It supposes that slum starts at flush_pos->pos_in_unit position
49523 + * within the extent. Slum gets to relocate set if flush_pos->leaf_relocate is
49524 + * set to 1 and to overwrite set otherwise
49525 + */
49526 +int reiser4_alloc_extent(flush_pos_t *flush_pos)
49527 +{
49528 + coord_t *coord;
49529 + reiser4_extent *ext;
49530 + reiser4_extent replace_ext;
49531 + oid_t oid;
49532 + reiser4_block_nr protected;
49533 + reiser4_block_nr start;
49534 + __u64 index;
49535 + __u64 width;
49536 + extent_state state;
49537 + int result;
49538 + reiser4_block_nr first_allocated;
49539 + __u64 allocated;
49540 + reiser4_key key;
49541 + block_stage_t block_stage;
49542 +
49543 + assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
49544 + assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
49545 + && item_is_extent(&flush_pos->coord));
49546 +
49547 + coord = &flush_pos->coord;
49548 +
49549 + ext = extent_by_coord(coord);
49550 + state = state_of_extent(ext);
49551 + if (state == HOLE_EXTENT) {
49552 + flush_pos->state = POS_INVALID;
49553 + return 0;
49554 + }
49555 +
49556 + item_key_by_coord(coord, &key);
49557 + oid = get_key_objectid(&key);
49558 + index = extent_unit_index(coord) + flush_pos->pos_in_unit;
49559 + start = extent_get_start(ext);
49560 + width = extent_get_width(ext);
49561 +
49562 + assert("vs-1457", width > flush_pos->pos_in_unit);
49563 +
49564 + if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
49565 + /* relocate */
49566 + if (flush_pos->pos_in_unit) {
49567 + /* split extent unit into two */
49568 + result =
49569 + split_allocated_extent(coord,
49570 + flush_pos->pos_in_unit);
49571 + flush_pos->pos_in_unit = 0;
49572 + return result;
49573 + }
49574 +
49575 + /* limit number of nodes to allocate */
49576 + if (flush_pos->nr_to_write < width)
49577 + width = flush_pos->nr_to_write;
49578 +
49579 + if (state == ALLOCATED_EXTENT) {
49580 + /*
49581 + * all protected nodes are not flushprepped, therefore
49582 + * they are counted as flush_reserved
49583 + */
49584 + block_stage = BLOCK_FLUSH_RESERVED;
49585 + protected = allocated_extent_slum_size(flush_pos, oid,
49586 + index, width);
49587 + if (protected == 0) {
49588 + flush_pos->state = POS_INVALID;
49589 + flush_pos->pos_in_unit = 0;
49590 + return 0;
49591 + }
49592 + } else {
49593 + block_stage = BLOCK_UNALLOCATED;
49594 + protected = width;
49595 + }
49596 +
49597 + /*
49598 + * look at previous unit if possible. If it is allocated, make
49599 + * preceder more precise
49600 + */
49601 + if (coord->unit_pos &&
49602 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
49603 + reiser4_pos_hint(flush_pos)->blk =
49604 + extent_get_start(ext - 1) +
49605 + extent_get_width(ext - 1);
49606 +
49607 + /* allocate new block numbers for protected nodes */
49608 + extent_allocate_blocks(reiser4_pos_hint(flush_pos),
49609 + protected,
49610 + &first_allocated, &allocated,
49611 + block_stage);
49612 +
49613 + if (state == ALLOCATED_EXTENT)
49614 + /*
49615 + * on relocating - free nodes which are going to be
49616 + * relocated
49617 + */
49618 + reiser4_dealloc_blocks(&start, &allocated,
49619 + BLOCK_ALLOCATED, BA_DEFER);
49620 +
49621 + /* assign new block numbers to protected nodes */
49622 + assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
49623 +
49624 + /* prepare extent which will replace current one */
49625 + reiser4_set_extent(&replace_ext, first_allocated, allocated);
49626 +
49627 + /* adjust extent item */
49628 + result = conv_extent(coord, &replace_ext);
49629 + if (result != 0 && result != -ENOMEM) {
49630 + warning("vs-1461",
49631 + "Failed to allocate extent. Should not happen\n");
49632 + return result;
49633 + }
49634 +
49635 + /*
49636 + * break flush: we prepared for flushing as many blocks as we
49637 + * were asked for
49638 + */
49639 + if (flush_pos->nr_to_write == allocated)
49640 + flush_pos->state = POS_INVALID;
49641 + } else {
49642 + /* overwrite */
49643 + mark_jnodes_overwrite(flush_pos, oid, index, width);
49644 + }
49645 + flush_pos->pos_in_unit = 0;
49646 + return 0;
49647 +}
49648 +
49649 +/* if @key is glueable to the item @coord is set to */
49650 +static int must_insert(const coord_t *coord, const reiser4_key *key)
49651 +{
49652 + reiser4_key last;
49653 +
49654 + if (item_id_by_coord(coord) == EXTENT_POINTER_ID
49655 + && keyeq(append_key_extent(coord, &last), key))
49656 + return 0;
49657 + return 1;
49658 +}
49659 +
49660 +/* copy extent @copy to the end of @node. It may have to either insert new item after the last one, or append last item,
49661 + or modify last unit of last item to have greater width */
49662 +static int put_unit_to_end(znode *node, const reiser4_key *key,
49663 + reiser4_extent *copy_ext)
49664 +{
49665 + int result;
49666 + coord_t coord;
49667 + cop_insert_flag flags;
49668 + reiser4_extent *last_ext;
49669 + reiser4_item_data data;
49670 +
49671 + /* set coord after last unit in an item */
49672 + coord_init_last_unit(&coord, node);
49673 + coord.between = AFTER_UNIT;
49674 +
49675 + flags =
49676 + COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
49677 + if (must_insert(&coord, key)) {
49678 + result =
49679 + insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
49680 + key, NULL /*lh */ , flags);
49681 +
49682 + } else {
49683 + /* try to glue with last unit */
49684 + last_ext = extent_by_coord(&coord);
49685 + if (state_of_extent(last_ext) &&
49686 + extent_get_start(last_ext) + extent_get_width(last_ext) ==
49687 + extent_get_start(copy_ext)) {
49688 + /* widen last unit of node */
49689 + extent_set_width(last_ext,
49690 + extent_get_width(last_ext) +
49691 + extent_get_width(copy_ext));
49692 + znode_make_dirty(node);
49693 + return 0;
49694 + }
49695 +
49696 + /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
49697 + result =
49698 + insert_into_item(&coord, NULL /*lh */ , key,
49699 + init_new_extent(&data, copy_ext, 1),
49700 + flags);
49701 + }
49702 +
49703 + assert("vs-438", result == 0 || result == -E_NODE_FULL);
49704 + return result;
49705 +}
49706 +
49707 +/* @coord is set to extent unit */
49708 +squeeze_result squalloc_extent(znode *left, const coord_t *coord,
49709 + flush_pos_t *flush_pos,
49710 + reiser4_key *stop_key)
49711 +{
49712 + reiser4_extent *ext;
49713 + __u64 index;
49714 + __u64 width;
49715 + reiser4_block_nr start;
49716 + extent_state state;
49717 + oid_t oid;
49718 + reiser4_block_nr first_allocated;
49719 + __u64 allocated;
49720 + __u64 protected;
49721 + reiser4_extent copy_extent;
49722 + reiser4_key key;
49723 + int result;
49724 + block_stage_t block_stage;
49725 +
49726 + assert("vs-1457", flush_pos->pos_in_unit == 0);
49727 + assert("vs-1467", coord_is_leftmost_unit(coord));
49728 + assert("vs-1467", item_is_extent(coord));
49729 +
49730 + ext = extent_by_coord(coord);
49731 + index = extent_unit_index(coord);
49732 + start = extent_get_start(ext);
49733 + width = extent_get_width(ext);
49734 + state = state_of_extent(ext);
49735 + unit_key_by_coord(coord, &key);
49736 + oid = get_key_objectid(&key);
49737 +
49738 + if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
49739 + (state == UNALLOCATED_EXTENT)) {
49740 + /* relocate */
49741 + if (state == ALLOCATED_EXTENT) {
49742 + /* all protected nodes are not flushprepped, therefore
49743 + * they are counted as flush_reserved */
49744 + block_stage = BLOCK_FLUSH_RESERVED;
49745 + protected = allocated_extent_slum_size(flush_pos, oid,
49746 + index, width);
49747 + if (protected == 0) {
49748 + flush_pos->state = POS_INVALID;
49749 + flush_pos->pos_in_unit = 0;
49750 + return 0;
49751 + }
49752 + } else {
49753 + block_stage = BLOCK_UNALLOCATED;
49754 + protected = width;
49755 + }
49756 +
49757 + /*
49758 + * look at previous unit if possible. If it is allocated, make
49759 + * preceder more precise
49760 + */
49761 + if (coord->unit_pos &&
49762 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
49763 + reiser4_pos_hint(flush_pos)->blk =
49764 + extent_get_start(ext - 1) +
49765 + extent_get_width(ext - 1);
49766 +
49767 + /* allocate new block numbers for protected nodes */
49768 + extent_allocate_blocks(reiser4_pos_hint(flush_pos),
49769 + protected,
49770 + &first_allocated, &allocated,
49771 + block_stage);
49772 +
49773 + /* prepare extent which will be copied to left */
49774 + reiser4_set_extent(&copy_extent, first_allocated, allocated);
49775 +
49776 + result = put_unit_to_end(left, &key, &copy_extent);
49777 + if (result == -E_NODE_FULL) {
49778 + int target_block_stage;
49779 +
49780 + /* free blocks which were just allocated */
49781 + target_block_stage =
49782 + (state ==
49783 + ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
49784 + BLOCK_UNALLOCATED;
49785 + reiser4_dealloc_blocks(&first_allocated, &allocated,
49786 + target_block_stage,
49787 + BA_PERMANENT);
49788 +
49789 + /* rewind the preceder. */
49790 + flush_pos->preceder.blk = first_allocated;
49791 + check_preceder(flush_pos->preceder.blk);
49792 +
49793 + return SQUEEZE_TARGET_FULL;
49794 + }
49795 +
49796 + if (state == ALLOCATED_EXTENT) {
49797 + /* free nodes which were relocated */
49798 + reiser4_dealloc_blocks(&start, &allocated,
49799 + BLOCK_ALLOCATED, BA_DEFER);
49800 + }
49801 +
49802 + /* assign new block numbers to protected nodes */
49803 + assign_real_blocknrs(flush_pos, oid, index, allocated,
49804 + first_allocated);
49805 +
49806 + set_key_offset(&key,
49807 + get_key_offset(&key) +
49808 + (allocated << current_blocksize_bits));
49809 + } else {
49810 + /*
49811 + * overwrite: try to copy unit as it is to left neighbor and
49812 + * make all first not flushprepped nodes overwrite nodes
49813 + */
49814 + reiser4_set_extent(&copy_extent, start, width);
49815 + result = put_unit_to_end(left, &key, &copy_extent);
49816 + if (result == -E_NODE_FULL)
49817 + return SQUEEZE_TARGET_FULL;
49818 +
49819 + if (state != HOLE_EXTENT)
49820 + mark_jnodes_overwrite(flush_pos, oid, index, width);
49821 + set_key_offset(&key,
49822 + get_key_offset(&key) +
49823 + (width << current_blocksize_bits));
49824 + }
49825 + *stop_key = key;
49826 + return SQUEEZE_CONTINUE;
49827 +}
49828 +
49829 +int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
49830 +{
49831 + return key_by_inode_and_offset_common(inode, off, key);
49832 +}
49833 +
49834 +/*
49835 + * Local variables:
49836 + * c-indentation-style: "K&R"
49837 + * mode-name: "LC"
49838 + * c-basic-offset: 8
49839 + * tab-width: 8
49840 + * fill-column: 79
49841 + * scroll-step: 1
49842 + * End:
49843 + */
49844 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent.h linux-2.6.20/fs/reiser4/plugin/item/extent.h
49845 --- linux-2.6.20.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 03:00:00.000000000 +0300
49846 +++ linux-2.6.20/fs/reiser4/plugin/item/extent.h 2007-05-06 14:50:43.811010720 +0400
49847 @@ -0,0 +1,231 @@
49848 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49849 +
49850 +#ifndef __REISER4_EXTENT_H__
49851 +#define __REISER4_EXTENT_H__
49852 +
49853 +/* on disk extent */
49854 +typedef struct {
49855 + reiser4_dblock_nr start;
49856 + reiser4_dblock_nr width;
49857 +} reiser4_extent;
49858 +
49859 +typedef struct extent_stat {
49860 + int unallocated_units;
49861 + int unallocated_blocks;
49862 + int allocated_units;
49863 + int allocated_blocks;
49864 + int hole_units;
49865 + int hole_blocks;
49866 +} extent_stat;
49867 +
49868 +/* extents in an extent item can be either holes, or unallocated or allocated
49869 + extents */
49870 +typedef enum {
49871 + HOLE_EXTENT,
49872 + UNALLOCATED_EXTENT,
49873 + ALLOCATED_EXTENT
49874 +} extent_state;
49875 +
49876 +#define HOLE_EXTENT_START 0
49877 +#define UNALLOCATED_EXTENT_START 1
49878 +#define UNALLOCATED_EXTENT_START2 2
49879 +
49880 +typedef struct {
49881 + reiser4_block_nr pos_in_unit;
49882 + reiser4_block_nr width; /* width of current unit */
49883 + pos_in_node_t nr_units; /* number of units */
49884 + int ext_offset; /* offset from the beginning of zdata() */
49885 + unsigned long expected_page;
49886 +#if REISER4_DEBUG
49887 + reiser4_extent extent;
49888 +#endif
49889 +} extent_coord_extension_t;
49890 +
49891 +/* macros to set/get fields of on-disk extent */
49892 +static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
49893 +{
49894 + return le64_to_cpu(ext->start);
49895 +}
49896 +
49897 +static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
49898 +{
49899 + return le64_to_cpu(ext->width);
49900 +}
49901 +
49902 +extern __u64 reiser4_current_block_count(void);
49903 +
49904 +static inline void
49905 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
49906 +{
49907 + cassert(sizeof(ext->start) == 8);
49908 + assert("nikita-2510",
49909 + ergo(start > 1, start < reiser4_current_block_count()));
49910 + put_unaligned(cpu_to_le64(start), &ext->start);
49911 +}
49912 +
49913 +static inline void
49914 +extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
49915 +{
49916 + cassert(sizeof(ext->width) == 8);
49917 + assert("", width > 0);
49918 + put_unaligned(cpu_to_le64(width), &ext->width);
49919 + assert("nikita-2511",
49920 + ergo(extent_get_start(ext) > 1,
49921 + extent_get_start(ext) + width <=
49922 + reiser4_current_block_count()));
49923 +}
49924 +
49925 +#define extent_item(coord) \
49926 +({ \
49927 + assert("nikita-3143", item_is_extent(coord)); \
49928 + ((reiser4_extent *)item_body_by_coord (coord)); \
49929 +})
49930 +
49931 +#define extent_by_coord(coord) \
49932 +({ \
49933 + assert("nikita-3144", item_is_extent(coord)); \
49934 + (extent_item (coord) + (coord)->unit_pos); \
49935 +})
49936 +
49937 +#define width_by_coord(coord) \
49938 +({ \
49939 + assert("nikita-3145", item_is_extent(coord)); \
49940 + extent_get_width (extent_by_coord(coord)); \
49941 +})
49942 +
49943 +struct carry_cut_data;
49944 +struct carry_kill_data;
49945 +
49946 +/* plugin->u.item.b.* */
49947 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
49948 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
49949 + const reiser4_item_data *);
49950 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
49951 +pos_in_node_t nr_units_extent(const coord_t *);
49952 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
49953 +void init_coord_extent(coord_t *);
49954 +int init_extent(coord_t *, reiser4_item_data *);
49955 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
49956 +int can_shift_extent(unsigned free_space,
49957 + coord_t * source, znode * target, shift_direction,
49958 + unsigned *size, unsigned want);
49959 +void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
49960 + unsigned count, shift_direction where_is_free_space,
49961 + unsigned free_space);
49962 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
49963 + struct carry_kill_data *);
49964 +int create_hook_extent(const coord_t * coord, void *arg);
49965 +int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
49966 + struct carry_cut_data *, reiser4_key * smallest_removed,
49967 + reiser4_key * new_first);
49968 +int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
49969 + struct carry_kill_data *, reiser4_key * smallest_removed,
49970 + reiser4_key * new_first);
49971 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
49972 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
49973 +void print_extent(const char *, coord_t *);
49974 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
49975 +int utmost_child_real_block_extent(const coord_t * coord, sideof side,
49976 + reiser4_block_nr * block);
49977 +void item_stat_extent(const coord_t * coord, void *vp);
49978 +int reiser4_check_extent(const coord_t * coord, const char **error);
49979 +
49980 +/* plugin->u.item.s.file.* */
49981 +ssize_t reiser4_write_extent(struct file *, const char __user *,
49982 + size_t, loff_t *);
49983 +int reiser4_read_extent(struct file *, flow_t *, hint_t *);
49984 +int reiser4_readpage_extent(void *, struct page *);
49985 +int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
49986 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
49987 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
49988 +int get_block_address_extent(const coord_t *, sector_t block,
49989 + sector_t * result);
49990 +
49991 +/* these are used in flush.c
49992 + FIXME-VS: should they be somewhere in item_plugin? */
49993 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
49994 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
49995 + reiser4_key * stop_key);
49996 +
49997 +int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
49998 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
49999 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
50000 +
50001 +/* plugin->u.item.f. */
50002 +int reiser4_scan_extent(flush_scan * scan);
50003 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
50004 +
50005 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
50006 + int nr_extents);
50007 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
50008 +extent_state state_of_extent(reiser4_extent * ext);
50009 +void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
50010 + reiser4_block_nr width);
50011 +int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
50012 + int *plugged_hole);
50013 +
50014 +#include "../../coord.h"
50015 +#include "../../lock.h"
50016 +#include "../../tap.h"
50017 +
50018 +struct replace_handle {
50019 + /* these are to be set before calling reiser4_replace_extent */
50020 + coord_t *coord;
50021 + lock_handle *lh;
50022 + reiser4_key key;
50023 + reiser4_key *pkey;
50024 + reiser4_extent overwrite;
50025 + reiser4_extent new_extents[2];
50026 + int nr_new_extents;
50027 + unsigned flags;
50028 +
50029 + /* these are used by reiser4_replace_extent */
50030 + reiser4_item_data item;
50031 + coord_t coord_after;
50032 + lock_handle lh_after;
50033 + tap_t watch;
50034 + reiser4_key paste_key;
50035 +#if REISER4_DEBUG
50036 + reiser4_extent orig_ext;
50037 + reiser4_key tmp;
50038 +#endif
50039 +};
50040 +
50041 +/* this structure is kmalloced before calling make_extent to avoid excessive
50042 + stack consumption on plug_hole->reiser4_replace_extent */
50043 +struct make_extent_handle {
50044 + uf_coord_t *uf_coord;
50045 + reiser4_block_nr blocknr;
50046 + int created;
50047 + struct inode *inode;
50048 + union {
50049 + struct {
50050 + } append;
50051 + struct replace_handle replace;
50052 + } u;
50053 +};
50054 +
50055 +int reiser4_replace_extent(struct replace_handle *,
50056 + int return_inserted_position);
50057 +lock_handle *znode_lh(znode *);
50058 +
50059 +/* the reiser4 repacker support */
50060 +struct repacker_cursor;
50061 +extern int process_extent_backward_for_repacking(tap_t *,
50062 + struct repacker_cursor *);
50063 +extern int mark_extent_for_repacking(tap_t *, int);
50064 +
50065 +#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
50066 +#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
50067 +
50068 +/* __REISER4_EXTENT_H__ */
50069 +#endif
50070 +/*
50071 + Local variables:
50072 + c-indentation-style: "K&R"
50073 + mode-name: "LC"
50074 + c-basic-offset: 8
50075 + tab-width: 8
50076 + fill-column: 120
50077 + End:
50078 +*/
50079 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.20/fs/reiser4/plugin/item/extent_item_ops.c
50080 --- linux-2.6.20.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 03:00:00.000000000 +0300
50081 +++ linux-2.6.20/fs/reiser4/plugin/item/extent_item_ops.c 2007-05-06 14:50:43.815011970 +0400
50082 @@ -0,0 +1,889 @@
50083 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50084 +
50085 +#include "item.h"
50086 +#include "../../inode.h"
50087 +#include "../../tree_walk.h" /* check_sibling_list() */
50088 +#include "../../page_cache.h"
50089 +#include "../../carry.h"
50090 +
50091 +#include <linux/quotaops.h>
50092 +
50093 +/* item_plugin->b.max_key_inside */
50094 +reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
50095 +{
50096 + item_key_by_coord(coord, key);
50097 + set_key_offset(key, get_key_offset(reiser4_max_key()));
50098 + return key;
50099 +}
50100 +
50101 +/* item_plugin->b.can_contain_key
50102 + this checks whether @key of @data is matching to position set by @coord */
50103 +int
50104 +can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
50105 + const reiser4_item_data * data)
50106 +{
50107 + reiser4_key item_key;
50108 +
50109 + if (item_plugin_by_coord(coord) != data->iplug)
50110 + return 0;
50111 +
50112 + item_key_by_coord(coord, &item_key);
50113 + if (get_key_locality(key) != get_key_locality(&item_key) ||
50114 + get_key_objectid(key) != get_key_objectid(&item_key) ||
50115 + get_key_ordering(key) != get_key_ordering(&item_key))
50116 + return 0;
50117 +
50118 + return 1;
50119 +}
50120 +
50121 +/* item_plugin->b.mergeable
50122 + first item is of extent type */
50123 +/* Audited by: green(2002.06.13) */
50124 +int mergeable_extent(const coord_t * p1, const coord_t * p2)
50125 +{
50126 + reiser4_key key1, key2;
50127 +
50128 + assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
50129 + /* FIXME-VS: Which is it? Assert or return 0 */
50130 + if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
50131 + return 0;
50132 + }
50133 +
50134 + item_key_by_coord(p1, &key1);
50135 + item_key_by_coord(p2, &key2);
50136 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
50137 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
50138 + get_key_ordering(&key1) != get_key_ordering(&key2) ||
50139 + get_key_type(&key1) != get_key_type(&key2))
50140 + return 0;
50141 + if (get_key_offset(&key1) +
50142 + reiser4_extent_size(p1, nr_units_extent(p1)) !=
50143 + get_key_offset(&key2))
50144 + return 0;
50145 + return 1;
50146 +}
50147 +
50148 +/* item_plugin->b.nr_units */
50149 +pos_in_node_t nr_units_extent(const coord_t * coord)
50150 +{
50151 + /* length of extent item has to be multiple of extent size */
50152 + assert("vs-1424",
50153 + (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
50154 + return item_length_by_coord(coord) / sizeof(reiser4_extent);
50155 +}
50156 +
50157 +/* item_plugin->b.lookup */
50158 +lookup_result
50159 +lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
50160 + coord_t * coord)
50161 +{ /* znode and item_pos are
50162 + set to an extent item to
50163 + look through */
50164 + reiser4_key item_key;
50165 + reiser4_block_nr lookuped, offset;
50166 + unsigned i, nr_units;
50167 + reiser4_extent *ext;
50168 + unsigned blocksize;
50169 + unsigned char blocksize_bits;
50170 +
50171 + item_key_by_coord(coord, &item_key);
50172 + offset = get_key_offset(&item_key);
50173 +
50174 + /* key we are looking for must be greater than key of item @coord */
50175 + assert("vs-414", keygt(key, &item_key));
50176 +
50177 + assert("umka-99945",
50178 + !keygt(key, max_key_inside_extent(coord, &item_key)));
50179 +
50180 + ext = extent_item(coord);
50181 + assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
50182 +
50183 + blocksize = current_blocksize;
50184 + blocksize_bits = current_blocksize_bits;
50185 +
50186 + /* offset we are looking for */
50187 + lookuped = get_key_offset(key);
50188 +
50189 + nr_units = nr_units_extent(coord);
50190 + /* go through all extents until the one which address given offset */
50191 + for (i = 0; i < nr_units; i++, ext++) {
50192 + offset += (extent_get_width(ext) << blocksize_bits);
50193 + if (offset > lookuped) {
50194 + /* desired byte is somewhere in this extent */
50195 + coord->unit_pos = i;
50196 + coord->between = AT_UNIT;
50197 + return CBK_COORD_FOUND;
50198 + }
50199 + }
50200 +
50201 + /* set coord after last unit */
50202 + coord->unit_pos = nr_units - 1;
50203 + coord->between = AFTER_UNIT;
50204 + return CBK_COORD_FOUND;
50205 +}
50206 +
50207 +/* item_plugin->b.paste
50208 + item @coord is set to has been appended with @data->length of free
50209 + space. data->data contains data to be pasted into the item in position
50210 + @coord->in_item.unit_pos. It must fit into that free space.
50211 + @coord must be set between units.
50212 +*/
50213 +int
50214 +paste_extent(coord_t * coord, reiser4_item_data * data,
50215 + carry_plugin_info * info UNUSED_ARG)
50216 +{
50217 + unsigned old_nr_units;
50218 + reiser4_extent *ext;
50219 + int item_length;
50220 +
50221 + ext = extent_item(coord);
50222 + item_length = item_length_by_coord(coord);
50223 + old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
50224 +
50225 + /* this is also used to copy extent into newly created item, so
50226 + old_nr_units could be 0 */
50227 + assert("vs-260", item_length >= data->length);
50228 +
50229 + /* make sure that coord is set properly */
50230 + assert("vs-35",
50231 + ((!coord_is_existing_unit(coord))
50232 + || (!old_nr_units && !coord->unit_pos)));
50233 +
50234 + /* first unit to be moved */
50235 + switch (coord->between) {
50236 + case AFTER_UNIT:
50237 + coord->unit_pos++;
50238 + case BEFORE_UNIT:
50239 + coord->between = AT_UNIT;
50240 + break;
50241 + case AT_UNIT:
50242 + assert("vs-331", !old_nr_units && !coord->unit_pos);
50243 + break;
50244 + default:
50245 + impossible("vs-330", "coord is set improperly");
50246 + }
50247 +
50248 + /* prepare space for new units */
50249 + memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
50250 + ext + coord->unit_pos,
50251 + (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
50252 +
50253 + /* copy new data from kernel space */
50254 + assert("vs-556", data->user == 0);
50255 + memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
50256 +
50257 + /* after paste @coord is set to first of pasted units */
50258 + assert("vs-332", coord_is_existing_unit(coord));
50259 + assert("vs-333",
50260 + !memcmp(data->data, extent_by_coord(coord),
50261 + (unsigned)data->length));
50262 + return 0;
50263 +}
50264 +
50265 +/* item_plugin->b.can_shift */
50266 +int
50267 +can_shift_extent(unsigned free_space, coord_t * source,
50268 + znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
50269 + unsigned *size, unsigned want)
50270 +{
50271 + *size = item_length_by_coord(source);
50272 + if (*size > free_space)
50273 + /* never split a unit of extent item */
50274 + *size = free_space - free_space % sizeof(reiser4_extent);
50275 +
50276 + /* we can shift *size bytes, calculate how many do we want to shift */
50277 + if (*size > want * sizeof(reiser4_extent))
50278 + *size = want * sizeof(reiser4_extent);
50279 +
50280 + if (*size % sizeof(reiser4_extent) != 0)
50281 + impossible("vs-119", "Wrong extent size: %i %zd", *size,
50282 + sizeof(reiser4_extent));
50283 + return *size / sizeof(reiser4_extent);
50284 +
50285 +}
50286 +
50287 +/* item_plugin->b.copy_units */
50288 +void
50289 +copy_units_extent(coord_t * target, coord_t * source,
50290 + unsigned from, unsigned count,
50291 + shift_direction where_is_free_space, unsigned free_space)
50292 +{
50293 + char *from_ext, *to_ext;
50294 +
50295 + assert("vs-217", free_space == count * sizeof(reiser4_extent));
50296 +
50297 + from_ext = item_body_by_coord(source);
50298 + to_ext = item_body_by_coord(target);
50299 +
50300 + if (where_is_free_space == SHIFT_LEFT) {
50301 + assert("vs-215", from == 0);
50302 +
50303 + /* At this moment, item length was already updated in the item
50304 + header by shifting code, hence nr_units_extent() will
50305 + return "new" number of units---one we obtain after copying
50306 + units.
50307 + */
50308 + to_ext +=
50309 + (nr_units_extent(target) - count) * sizeof(reiser4_extent);
50310 + } else {
50311 + reiser4_key key;
50312 + coord_t coord;
50313 +
50314 + assert("vs-216",
50315 + from + count == coord_last_unit_pos(source) + 1);
50316 +
50317 + from_ext += item_length_by_coord(source) - free_space;
50318 +
50319 + /* new units are inserted before first unit in an item,
50320 + therefore, we have to update item key */
50321 + coord = *source;
50322 + coord.unit_pos = from;
50323 + unit_key_extent(&coord, &key);
50324 +
50325 + node_plugin_by_node(target->node)->update_item_key(target, &key,
50326 + NULL /*info */);
50327 + }
50328 +
50329 + memcpy(to_ext, from_ext, free_space);
50330 +}
50331 +
50332 +/* item_plugin->b.create_hook
50333 + @arg is znode of leaf node for which we need to update right delimiting key */
50334 +int create_hook_extent(const coord_t * coord, void *arg)
50335 +{
50336 + coord_t *child_coord;
50337 + znode *node;
50338 + reiser4_key key;
50339 + reiser4_tree *tree;
50340 +
50341 + if (!arg)
50342 + return 0;
50343 +
50344 + child_coord = arg;
50345 + tree = znode_get_tree(coord->node);
50346 +
50347 + assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
50348 +
50349 + write_lock_tree(tree);
50350 + write_lock_dk(tree);
50351 + /* find a node on the left level for which right delimiting key has to
50352 + be updated */
50353 + if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
50354 + assert("vs-411", znode_is_left_connected(child_coord->node));
50355 + node = child_coord->node->left;
50356 + } else {
50357 + assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
50358 + node = child_coord->node;
50359 + assert("nikita-3314", node != NULL);
50360 + }
50361 +
50362 + if (node != NULL) {
50363 + znode_set_rd_key(node, item_key_by_coord(coord, &key));
50364 +
50365 + assert("nikita-3282", check_sibling_list(node));
50366 + /* break sibling links */
50367 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
50368 + ON_DEBUG(node->right->left_version =
50369 + atomic_inc_return(&delim_key_version);
50370 + node->right_version =
50371 + atomic_inc_return(&delim_key_version););
50372 +
50373 + node->right->left = NULL;
50374 + node->right = NULL;
50375 + }
50376 + }
50377 + write_unlock_dk(tree);
50378 + write_unlock_tree(tree);
50379 + return 0;
50380 +}
50381 +
50382 +#define ITEM_TAIL_KILLED 0
50383 +#define ITEM_HEAD_KILLED 1
50384 +#define ITEM_KILLED 2
50385 +
50386 +/* item_plugin->b.kill_hook
50387 + this is called when @count units starting from @from-th one are going to be removed
50388 + */
50389 +int
50390 +kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
50391 + struct carry_kill_data *kdata)
50392 +{
50393 + reiser4_extent *ext;
50394 + reiser4_block_nr start, length;
50395 + const reiser4_key *pfrom_key, *pto_key;
50396 + struct inode *inode;
50397 + reiser4_tree *tree;
50398 + pgoff_t from_off, to_off, offset, skip;
50399 + int retval;
50400 +
50401 + /* these are located in memory kmalloc-ed by kill_node_content */
50402 + reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
50403 + coord_t *dup, *next;
50404 +
50405 + assert("zam-811", znode_is_write_locked(coord->node));
50406 + assert("nikita-3315", kdata != NULL);
50407 + assert("vs-34", kdata->buf != NULL);
50408 +
50409 + /* map structures to kdata->buf */
50410 + min_item_key = (reiser4_key *) (kdata->buf);
50411 + max_item_key = min_item_key + 1;
50412 + from_key = max_item_key + 1;
50413 + to_key = from_key + 1;
50414 + key = to_key + 1;
50415 + dup = (coord_t *) (key + 1);
50416 + next = dup + 1;
50417 +
50418 + item_key_by_coord(coord, min_item_key);
50419 + max_item_key_by_coord(coord, max_item_key);
50420 +
50421 + if (kdata->params.from_key) {
50422 + pfrom_key = kdata->params.from_key;
50423 + pto_key = kdata->params.to_key;
50424 + } else {
50425 + assert("vs-1549", from == coord->unit_pos);
50426 + unit_key_by_coord(coord, from_key);
50427 + pfrom_key = from_key;
50428 +
50429 + coord_dup(dup, coord);
50430 + dup->unit_pos = from + count - 1;
50431 + max_unit_key_by_coord(dup, to_key);
50432 + pto_key = to_key;
50433 + }
50434 +
50435 + if (!keylt(pto_key, max_item_key)) {
50436 + if (!keygt(pfrom_key, min_item_key)) {
50437 + znode *left, *right;
50438 +
50439 + /* item is to be removed completely */
50440 + assert("nikita-3316", kdata->left != NULL
50441 + && kdata->right != NULL);
50442 +
50443 + left = kdata->left->node;
50444 + right = kdata->right->node;
50445 +
50446 + tree = current_tree;
50447 + /* we have to do two things:
50448 + *
50449 + * 1. link left and right formatted neighbors of
50450 + * extent being removed, and
50451 + *
50452 + * 2. update their delimiting keys.
50453 + *
50454 + * atomicity of these operations is protected by
50455 + * taking dk-lock and tree-lock.
50456 + */
50457 + /* if neighbors of item being removed are znodes -
50458 + * link them */
50459 + write_lock_tree(tree);
50460 + write_lock_dk(tree);
50461 + link_left_and_right(left, right);
50462 + if (left) {
50463 + /* update right delimiting key of left
50464 + * neighbor of extent item */
50465 + /*coord_t next;
50466 + reiser4_key key; */
50467 +
50468 + coord_dup(next, coord);
50469 +
50470 + if (coord_next_item(next))
50471 + *key = *znode_get_rd_key(coord->node);
50472 + else
50473 + item_key_by_coord(next, key);
50474 + znode_set_rd_key(left, key);
50475 + }
50476 + write_unlock_dk(tree);
50477 + write_unlock_tree(tree);
50478 +
50479 + from_off =
50480 + get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
50481 + to_off =
50482 + (get_key_offset(max_item_key) +
50483 + 1) >> PAGE_CACHE_SHIFT;
50484 + retval = ITEM_KILLED;
50485 + } else {
50486 + /* tail of item is to be removed */
50487 + from_off =
50488 + (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
50489 + 1) >> PAGE_CACHE_SHIFT;
50490 + to_off =
50491 + (get_key_offset(max_item_key) +
50492 + 1) >> PAGE_CACHE_SHIFT;
50493 + retval = ITEM_TAIL_KILLED;
50494 + }
50495 + } else {
50496 + /* head of item is to be removed */
50497 + assert("vs-1571", keyeq(pfrom_key, min_item_key));
50498 + assert("vs-1572",
50499 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
50500 + 0);
50501 + assert("vs-1573",
50502 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50503 + 1)) == 0);
50504 +
50505 + if (kdata->left->node) {
50506 + /* update right delimiting key of left neighbor of extent item */
50507 + /*reiser4_key key; */
50508 +
50509 + *key = *pto_key;
50510 + set_key_offset(key, get_key_offset(pto_key) + 1);
50511 +
50512 + write_lock_dk(current_tree);
50513 + znode_set_rd_key(kdata->left->node, key);
50514 + write_unlock_dk(current_tree);
50515 + }
50516 +
50517 + from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
50518 + to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
50519 + retval = ITEM_HEAD_KILLED;
50520 + }
50521 +
50522 + inode = kdata->inode;
50523 + assert("vs-1545", inode != NULL);
50524 + if (inode != NULL)
50525 + /* take care of pages and jnodes corresponding to part of item being killed */
50526 + reiser4_invalidate_pages(inode->i_mapping, from_off,
50527 + to_off - from_off,
50528 + kdata->params.truncate);
50529 +
50530 + ext = extent_item(coord) + from;
50531 + offset =
50532 + (get_key_offset(min_item_key) +
50533 + reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
50534 +
50535 + assert("vs-1551", from_off >= offset);
50536 + assert("vs-1552", from_off - offset <= extent_get_width(ext));
50537 + skip = from_off - offset;
50538 + offset = from_off;
50539 +
50540 + while (offset < to_off) {
50541 + length = extent_get_width(ext) - skip;
50542 + if (state_of_extent(ext) == HOLE_EXTENT) {
50543 + skip = 0;
50544 + offset += length;
50545 + ext++;
50546 + continue;
50547 + }
50548 +
50549 + if (offset + length > to_off) {
50550 + length = to_off - offset;
50551 + }
50552 +
50553 + DQUOT_FREE_BLOCK_NODIRTY(inode, length);
50554 +
50555 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
50556 + /* some jnodes corresponding to this unallocated extent */
50557 + fake_allocated2free(length, 0 /* unformatted */ );
50558 +
50559 + skip = 0;
50560 + offset += length;
50561 + ext++;
50562 + continue;
50563 + }
50564 +
50565 + assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
50566 +
50567 + if (length != 0) {
50568 + start = extent_get_start(ext) + skip;
50569 +
50570 + /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
50571 + immediately */
50572 + reiser4_dealloc_blocks(&start, &length,
50573 + 0 /* not used */ ,
50574 + BA_DEFER
50575 + /* unformatted with defer */ );
50576 + }
50577 + skip = 0;
50578 + offset += length;
50579 + ext++;
50580 + }
50581 + return retval;
50582 +}
50583 +
50584 +/* item_plugin->b.kill_units */
50585 +int
50586 +kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50587 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
50588 + reiser4_key * new_first)
50589 +{
50590 + reiser4_extent *ext;
50591 + reiser4_key item_key;
50592 + pos_in_node_t count;
50593 + reiser4_key from_key, to_key;
50594 + const reiser4_key *pfrom_key, *pto_key;
50595 + loff_t off;
50596 + int result;
50597 +
50598 + assert("vs-1541",
50599 + ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
50600 + || (kdata->params.from_key != NULL
50601 + && kdata->params.to_key != NULL)));
50602 +
50603 + if (kdata->params.from_key) {
50604 + pfrom_key = kdata->params.from_key;
50605 + pto_key = kdata->params.to_key;
50606 + } else {
50607 + coord_t dup;
50608 +
50609 + /* calculate key range of kill */
50610 + assert("vs-1549", from == coord->unit_pos);
50611 + unit_key_by_coord(coord, &from_key);
50612 + pfrom_key = &from_key;
50613 +
50614 + coord_dup(&dup, coord);
50615 + dup.unit_pos = to;
50616 + max_unit_key_by_coord(&dup, &to_key);
50617 + pto_key = &to_key;
50618 + }
50619 +
50620 + item_key_by_coord(coord, &item_key);
50621 +
50622 +#if REISER4_DEBUG
50623 + {
50624 + reiser4_key max_item_key;
50625 +
50626 + max_item_key_by_coord(coord, &max_item_key);
50627 +
50628 + if (new_first) {
50629 + /* head of item is to be cut */
50630 + assert("vs-1542", keyeq(pfrom_key, &item_key));
50631 + assert("vs-1538", keylt(pto_key, &max_item_key));
50632 + } else {
50633 + /* tail of item is to be cut */
50634 + assert("vs-1540", keygt(pfrom_key, &item_key));
50635 + assert("vs-1543", !keylt(pto_key, &max_item_key));
50636 + }
50637 + }
50638 +#endif
50639 +
50640 + if (smallest_removed)
50641 + *smallest_removed = *pfrom_key;
50642 +
50643 + if (new_first) {
50644 + /* item head is cut. Item key will change. This new key is calculated here */
50645 + assert("vs-1556",
50646 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50647 + (PAGE_CACHE_SIZE - 1));
50648 + *new_first = *pto_key;
50649 + set_key_offset(new_first, get_key_offset(new_first) + 1);
50650 + }
50651 +
50652 + count = to - from + 1;
50653 + result = kill_hook_extent(coord, from, count, kdata);
50654 + if (result == ITEM_TAIL_KILLED) {
50655 + assert("vs-1553",
50656 + get_key_offset(pfrom_key) >=
50657 + get_key_offset(&item_key) +
50658 + reiser4_extent_size(coord, from));
50659 + off =
50660 + get_key_offset(pfrom_key) -
50661 + (get_key_offset(&item_key) +
50662 + reiser4_extent_size(coord, from));
50663 + if (off) {
50664 + /* unit @from is to be cut partially. Its width decreases */
50665 + ext = extent_item(coord) + from;
50666 + extent_set_width(ext,
50667 + (off + PAGE_CACHE_SIZE -
50668 + 1) >> PAGE_CACHE_SHIFT);
50669 + count--;
50670 + }
50671 + } else {
50672 + __u64 max_to_offset;
50673 + __u64 rest;
50674 +
50675 + assert("vs-1575", result == ITEM_HEAD_KILLED);
50676 + assert("", from == 0);
50677 + assert("",
50678 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50679 + 1)) == 0);
50680 + assert("",
50681 + get_key_offset(pto_key) + 1 >
50682 + get_key_offset(&item_key) +
50683 + reiser4_extent_size(coord, to));
50684 + max_to_offset =
50685 + get_key_offset(&item_key) +
50686 + reiser4_extent_size(coord, to + 1) - 1;
50687 + assert("", get_key_offset(pto_key) <= max_to_offset);
50688 +
50689 + rest =
50690 + (max_to_offset -
50691 + get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
50692 + if (rest) {
50693 + /* unit @to is to be cut partially */
50694 + ext = extent_item(coord) + to;
50695 +
50696 + assert("", extent_get_width(ext) > rest);
50697 +
50698 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
50699 + extent_set_start(ext,
50700 + extent_get_start(ext) +
50701 + (extent_get_width(ext) -
50702 + rest));
50703 +
50704 + extent_set_width(ext, rest);
50705 + count--;
50706 + }
50707 + }
50708 + return count * sizeof(reiser4_extent);
50709 +}
50710 +
50711 +/* item_plugin->b.cut_units
50712 + this is too similar to kill_units_extent */
50713 +int
50714 +cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50715 + struct carry_cut_data *cdata, reiser4_key * smallest_removed,
50716 + reiser4_key * new_first)
50717 +{
50718 + reiser4_extent *ext;
50719 + reiser4_key item_key;
50720 + pos_in_node_t count;
50721 + reiser4_key from_key, to_key;
50722 + const reiser4_key *pfrom_key, *pto_key;
50723 + loff_t off;
50724 +
50725 + assert("vs-1541",
50726 + ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
50727 + || (cdata->params.from_key != NULL
50728 + && cdata->params.to_key != NULL)));
50729 +
50730 + if (cdata->params.from_key) {
50731 + pfrom_key = cdata->params.from_key;
50732 + pto_key = cdata->params.to_key;
50733 + } else {
50734 + coord_t dup;
50735 +
50736 + /* calculate key range of kill */
50737 + coord_dup(&dup, coord);
50738 + dup.unit_pos = from;
50739 + unit_key_by_coord(&dup, &from_key);
50740 +
50741 + dup.unit_pos = to;
50742 + max_unit_key_by_coord(&dup, &to_key);
50743 +
50744 + pfrom_key = &from_key;
50745 + pto_key = &to_key;
50746 + }
50747 +
50748 + assert("vs-1555",
50749 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
50750 + assert("vs-1556",
50751 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50752 + (PAGE_CACHE_SIZE - 1));
50753 +
50754 + item_key_by_coord(coord, &item_key);
50755 +
50756 +#if REISER4_DEBUG
50757 + {
50758 + reiser4_key max_item_key;
50759 +
50760 + assert("vs-1584",
50761 + get_key_locality(pfrom_key) ==
50762 + get_key_locality(&item_key));
50763 + assert("vs-1585",
50764 + get_key_type(pfrom_key) == get_key_type(&item_key));
50765 + assert("vs-1586",
50766 + get_key_objectid(pfrom_key) ==
50767 + get_key_objectid(&item_key));
50768 + assert("vs-1587",
50769 + get_key_ordering(pfrom_key) ==
50770 + get_key_ordering(&item_key));
50771 +
50772 + max_item_key_by_coord(coord, &max_item_key);
50773 +
50774 + if (new_first != NULL) {
50775 + /* head of item is to be cut */
50776 + assert("vs-1542", keyeq(pfrom_key, &item_key));
50777 + assert("vs-1538", keylt(pto_key, &max_item_key));
50778 + } else {
50779 + /* tail of item is to be cut */
50780 + assert("vs-1540", keygt(pfrom_key, &item_key));
50781 + assert("vs-1543", keyeq(pto_key, &max_item_key));
50782 + }
50783 + }
50784 +#endif
50785 +
50786 + if (smallest_removed)
50787 + *smallest_removed = *pfrom_key;
50788 +
50789 + if (new_first) {
50790 + /* item head is cut. Item key will change. This new key is calculated here */
50791 + *new_first = *pto_key;
50792 + set_key_offset(new_first, get_key_offset(new_first) + 1);
50793 + }
50794 +
50795 + count = to - from + 1;
50796 +
50797 + assert("vs-1553",
50798 + get_key_offset(pfrom_key) >=
50799 + get_key_offset(&item_key) + reiser4_extent_size(coord, from));
50800 + off =
50801 + get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
50802 + reiser4_extent_size(coord, from));
50803 + if (off) {
50804 + /* tail of unit @from is to be cut partially. Its width decreases */
50805 + assert("vs-1582", new_first == NULL);
50806 + ext = extent_item(coord) + from;
50807 + extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
50808 + count--;
50809 + }
50810 +
50811 + assert("vs-1554",
50812 + get_key_offset(pto_key) <=
50813 + get_key_offset(&item_key) +
50814 + reiser4_extent_size(coord, to + 1) - 1);
50815 + off =
50816 + (get_key_offset(&item_key) +
50817 + reiser4_extent_size(coord, to + 1) - 1) -
50818 + get_key_offset(pto_key);
50819 + if (off) {
50820 + /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
50821 + and width decreased. */
50822 + assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
50823 + ext = extent_item(coord) + to;
50824 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
50825 + extent_set_start(ext,
50826 + extent_get_start(ext) +
50827 + (extent_get_width(ext) -
50828 + (off >> PAGE_CACHE_SHIFT)));
50829 +
50830 + extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
50831 + count--;
50832 + }
50833 + return count * sizeof(reiser4_extent);
50834 +}
50835 +
50836 +/* item_plugin->b.unit_key */
50837 +reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
50838 +{
50839 + assert("vs-300", coord_is_existing_unit(coord));
50840 +
50841 + item_key_by_coord(coord, key);
50842 + set_key_offset(key,
50843 + (get_key_offset(key) +
50844 + reiser4_extent_size(coord, coord->unit_pos)));
50845 +
50846 + return key;
50847 +}
50848 +
50849 +/* item_plugin->b.max_unit_key */
50850 +reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
50851 +{
50852 + assert("vs-300", coord_is_existing_unit(coord));
50853 +
50854 + item_key_by_coord(coord, key);
50855 + set_key_offset(key,
50856 + (get_key_offset(key) +
50857 + reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
50858 + return key;
50859 +}
50860 +
50861 +/* item_plugin->b.estimate
50862 + item_plugin->b.item_data_by_flow */
50863 +
50864 +#if REISER4_DEBUG
50865 +
50866 +/* item_plugin->b.check
50867 + used for debugging, every item should have here the most complete
50868 + possible check of the consistency of the item that the inventor can
50869 + construct
50870 +*/
50871 +int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
50872 + const char **error /* where to store error message */)
50873 +{
50874 + reiser4_extent *ext, *first;
50875 + unsigned i, j;
50876 + reiser4_block_nr start, width, blk_cnt;
50877 + unsigned num_units;
50878 + reiser4_tree *tree;
50879 + oid_t oid;
50880 + reiser4_key key;
50881 + coord_t scan;
50882 +
50883 + assert("vs-933", REISER4_DEBUG);
50884 +
50885 + if (znode_get_level(coord->node) != TWIG_LEVEL) {
50886 + *error = "Extent on the wrong level";
50887 + return -1;
50888 + }
50889 + if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
50890 + *error = "Wrong item size";
50891 + return -1;
50892 + }
50893 + ext = first = extent_item(coord);
50894 + blk_cnt = reiser4_block_count(reiser4_get_current_sb());
50895 + num_units = coord_num_units(coord);
50896 + tree = znode_get_tree(coord->node);
50897 + item_key_by_coord(coord, &key);
50898 + oid = get_key_objectid(&key);
50899 + coord_dup(&scan, coord);
50900 +
50901 + for (i = 0; i < num_units; ++i, ++ext) {
50902 + __u64 index;
50903 +
50904 + scan.unit_pos = i;
50905 + index = extent_unit_index(&scan);
50906 +
50907 +#if 0
50908 + /* check that all jnodes are present for the unallocated
50909 + * extent */
50910 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
50911 + for (j = 0; j < extent_get_width(ext); j++) {
50912 + jnode *node;
50913 +
50914 + node = jlookup(tree, oid, index + j);
50915 + if (node == NULL) {
50916 + print_coord("scan", &scan, 0);
50917 + *error = "Jnode missing";
50918 + return -1;
50919 + }
50920 + jput(node);
50921 + }
50922 + }
50923 +#endif
50924 +
50925 + start = extent_get_start(ext);
50926 + if (start < 2)
50927 + continue;
50928 + /* extent is allocated one */
50929 + width = extent_get_width(ext);
50930 + if (start >= blk_cnt) {
50931 + *error = "Start too large";
50932 + return -1;
50933 + }
50934 + if (start + width > blk_cnt) {
50935 + *error = "End too large";
50936 + return -1;
50937 + }
50938 + /* make sure that this extent does not overlap with other
50939 + allocated extents extents */
50940 + for (j = 0; j < i; j++) {
50941 + if (state_of_extent(first + j) != ALLOCATED_EXTENT)
50942 + continue;
50943 + if (!
50944 + ((extent_get_start(ext) >=
50945 + extent_get_start(first + j) +
50946 + extent_get_width(first + j))
50947 + || (extent_get_start(ext) +
50948 + extent_get_width(ext) <=
50949 + extent_get_start(first + j)))) {
50950 + *error = "Extent overlaps with others";
50951 + return -1;
50952 + }
50953 + }
50954 +
50955 + }
50956 +
50957 + return 0;
50958 +}
50959 +
50960 +#endif /* REISER4_DEBUG */
50961 +
50962 +/*
50963 + Local variables:
50964 + c-indentation-style: "K&R"
50965 + mode-name: "LC"
50966 + c-basic-offset: 8
50967 + tab-width: 8
50968 + fill-column: 120
50969 + scroll-step: 1
50970 + End:
50971 +*/
50972 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/internal.c linux-2.6.20/fs/reiser4/plugin/item/internal.c
50973 --- linux-2.6.20.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 03:00:00.000000000 +0300
50974 +++ linux-2.6.20/fs/reiser4/plugin/item/internal.c 2007-05-06 14:50:43.815011970 +0400
50975 @@ -0,0 +1,396 @@
50976 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50977 +
50978 +/* Implementation of internal-item plugin methods. */
50979 +
50980 +#include "../../forward.h"
50981 +#include "../../debug.h"
50982 +#include "../../dformat.h"
50983 +#include "../../key.h"
50984 +#include "../../coord.h"
50985 +#include "internal.h"
50986 +#include "item.h"
50987 +#include "../node/node.h"
50988 +#include "../plugin.h"
50989 +#include "../../jnode.h"
50990 +#include "../../znode.h"
50991 +#include "../../tree_walk.h"
50992 +#include "../../tree_mod.h"
50993 +#include "../../tree.h"
50994 +#include "../../super.h"
50995 +#include "../../block_alloc.h"
50996 +
50997 +/* see internal.h for explanation */
50998 +
50999 +/* plugin->u.item.b.mergeable */
51000 +int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
51001 + const coord_t * p2 UNUSED_ARG /* second item */ )
51002 +{
51003 + /* internal items are not mergeable */
51004 + return 0;
51005 +}
51006 +
51007 +/* ->lookup() method for internal items */
51008 +lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
51009 + lookup_bias bias UNUSED_ARG /* lookup bias */ ,
51010 + coord_t * coord /* coord of item */ )
51011 +{
51012 + reiser4_key ukey;
51013 +
51014 + switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
51015 + default:
51016 + impossible("", "keycmp()?!");
51017 + case LESS_THAN:
51018 + /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
51019 + item plugin can not be taken using coord set this way */
51020 + assert("vs-681", coord->unit_pos == 0);
51021 + coord->between = AFTER_UNIT;
51022 + case EQUAL_TO:
51023 + return CBK_COORD_FOUND;
51024 + case GREATER_THAN:
51025 + return CBK_COORD_NOTFOUND;
51026 + }
51027 +}
51028 +
51029 +/* return body of internal item at @coord */
51030 +static internal_item_layout *internal_at(const coord_t * coord /* coord of
51031 + * item */ )
51032 +{
51033 + assert("nikita-607", coord != NULL);
51034 + assert("nikita-1650",
51035 + item_plugin_by_coord(coord) ==
51036 + item_plugin_by_id(NODE_POINTER_ID));
51037 + return (internal_item_layout *) item_body_by_coord(coord);
51038 +}
51039 +
51040 +void reiser4_update_internal(const coord_t * coord,
51041 + const reiser4_block_nr * blocknr)
51042 +{
51043 + internal_item_layout *item = internal_at(coord);
51044 + assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
51045 +
51046 + put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
51047 +}
51048 +
51049 +/* return child block number stored in the internal item at @coord */
51050 +static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
51051 +{
51052 + assert("nikita-608", coord != NULL);
51053 + return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
51054 +}
51055 +
51056 +/* get znode pointed to by internal @item */
51057 +static znode *znode_at(const coord_t * item /* coord of item */ ,
51058 + znode * parent /* parent node */ )
51059 +{
51060 + return child_znode(item, parent, 1, 0);
51061 +}
51062 +
51063 +/* store pointer from internal item into "block". Implementation of
51064 + ->down_link() method */
51065 +void down_link_internal(const coord_t * coord /* coord of item */ ,
51066 + const reiser4_key * key UNUSED_ARG /* key to get
51067 + * pointer for */ ,
51068 + reiser4_block_nr * block /* resulting block number */ )
51069 +{
51070 + ON_DEBUG(reiser4_key item_key);
51071 +
51072 + assert("nikita-609", coord != NULL);
51073 + assert("nikita-611", block != NULL);
51074 + assert("nikita-612", (key == NULL) ||
51075 + /* twig horrors */
51076 + (znode_get_level(coord->node) == TWIG_LEVEL)
51077 + || keyle(item_key_by_coord(coord, &item_key), key));
51078 +
51079 + *block = pointer_at(coord);
51080 + assert("nikita-2960", reiser4_blocknr_is_sane(block));
51081 +}
51082 +
51083 +/* Get the child's block number, or 0 if the block is unallocated. */
51084 +int
51085 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
51086 + reiser4_block_nr * block)
51087 +{
51088 + assert("jmacd-2059", coord != NULL);
51089 +
51090 + *block = pointer_at(coord);
51091 + assert("nikita-2961", reiser4_blocknr_is_sane(block));
51092 +
51093 + if (reiser4_blocknr_is_fake(block)) {
51094 + *block = 0;
51095 + }
51096 +
51097 + return 0;
51098 +}
51099 +
51100 +/* Return the child. */
51101 +int
51102 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
51103 + jnode ** childp)
51104 +{
51105 + reiser4_block_nr block = pointer_at(coord);
51106 + znode *child;
51107 +
51108 + assert("jmacd-2059", childp != NULL);
51109 + assert("nikita-2962", reiser4_blocknr_is_sane(&block));
51110 +
51111 + child = zlook(znode_get_tree(coord->node), &block);
51112 +
51113 + if (IS_ERR(child)) {
51114 + return PTR_ERR(child);
51115 + }
51116 +
51117 + *childp = ZJNODE(child);
51118 +
51119 + return 0;
51120 +}
51121 +
51122 +#if REISER4_DEBUG
51123 +
51124 +static void check_link(znode * left, znode * right)
51125 +{
51126 + znode *scan;
51127 +
51128 + for (scan = left; scan != right; scan = scan->right) {
51129 + if (ZF_ISSET(scan, JNODE_RIP))
51130 + break;
51131 + if (znode_is_right_connected(scan) && scan->right != NULL) {
51132 + if (ZF_ISSET(scan->right, JNODE_RIP))
51133 + break;
51134 + assert("nikita-3285",
51135 + znode_is_left_connected(scan->right));
51136 + assert("nikita-3265",
51137 + ergo(scan != left,
51138 + ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
51139 + assert("nikita-3284", scan->right->left == scan);
51140 + } else
51141 + break;
51142 + }
51143 +}
51144 +
51145 +int check__internal(const coord_t * coord, const char **error)
51146 +{
51147 + reiser4_block_nr blk;
51148 + znode *child;
51149 + coord_t cpy;
51150 +
51151 + blk = pointer_at(coord);
51152 + if (!reiser4_blocknr_is_sane(&blk)) {
51153 + *error = "Invalid pointer";
51154 + return -1;
51155 + }
51156 + coord_dup(&cpy, coord);
51157 + child = znode_at(&cpy, cpy.node);
51158 + if (child != NULL) {
51159 + znode *left_child;
51160 + znode *right_child;
51161 +
51162 + left_child = right_child = NULL;
51163 +
51164 + assert("nikita-3256", znode_invariant(child));
51165 + if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
51166 + left_child = znode_at(&cpy, cpy.node);
51167 + if (left_child != NULL) {
51168 + read_lock_tree(znode_get_tree(child));
51169 + check_link(left_child, child);
51170 + read_unlock_tree(znode_get_tree(child));
51171 + zput(left_child);
51172 + }
51173 + }
51174 + coord_dup(&cpy, coord);
51175 + if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
51176 + right_child = znode_at(&cpy, cpy.node);
51177 + if (right_child != NULL) {
51178 + read_lock_tree(znode_get_tree(child));
51179 + check_link(child, right_child);
51180 + read_unlock_tree(znode_get_tree(child));
51181 + zput(right_child);
51182 + }
51183 + }
51184 + zput(child);
51185 + }
51186 + return 0;
51187 +}
51188 +
51189 +#endif /* REISER4_DEBUG */
51190 +
51191 +/* return true only if this item really points to "block" */
51192 +/* Audited by: green(2002.06.14) */
51193 +int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
51194 + const reiser4_block_nr * block /* block number to
51195 + * check */ )
51196 +{
51197 + assert("nikita-613", coord != NULL);
51198 + assert("nikita-614", block != NULL);
51199 +
51200 + return pointer_at(coord) == *block;
51201 +}
51202 +
51203 +/* hook called by ->create_item() method of node plugin after new internal
51204 + item was just created.
51205 +
51206 + This is point where pointer to new node is inserted into tree. Initialize
51207 + parent pointer in child znode, insert child into sibling list and slum.
51208 +
51209 +*/
51210 +int create_hook_internal(const coord_t * item /* coord of item */ ,
51211 + void *arg /* child's left neighbor, if any */ )
51212 +{
51213 + znode *child;
51214 + __u64 child_ptr;
51215 +
51216 + assert("nikita-1252", item != NULL);
51217 + assert("nikita-1253", item->node != NULL);
51218 + assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
51219 + assert("nikita-1450", item->unit_pos == 0);
51220 +
51221 + /*
51222 + * preparing to item insertion build_child_ptr_data sets pointer to
51223 + * data to be inserted to jnode's blocknr which is in cpu byte
51224 + * order. Node's create_item simply copied those data. As result we
51225 + * have child pointer in cpu's byte order. Convert content of internal
51226 + * item to little endian byte order.
51227 + */
51228 + child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
51229 + reiser4_update_internal(item, &child_ptr);
51230 +
51231 + child = znode_at(item, item->node);
51232 + if (child != NULL && !IS_ERR(child)) {
51233 + znode *left;
51234 + int result = 0;
51235 + reiser4_tree *tree;
51236 +
51237 + left = arg;
51238 + tree = znode_get_tree(item->node);
51239 + write_lock_tree(tree);
51240 + write_lock_dk(tree);
51241 + assert("nikita-1400", (child->in_parent.node == NULL)
51242 + || (znode_above_root(child->in_parent.node)));
51243 + ++item->node->c_count;
51244 + coord_to_parent_coord(item, &child->in_parent);
51245 + sibling_list_insert_nolock(child, left);
51246 +
51247 + assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
51248 + ZF_CLR(child, JNODE_ORPHAN);
51249 +
51250 + if ((left != NULL) && !keyeq(znode_get_rd_key(left),
51251 + znode_get_rd_key(child))) {
51252 + znode_set_rd_key(child, znode_get_rd_key(left));
51253 + }
51254 + write_unlock_dk(tree);
51255 + write_unlock_tree(tree);
51256 + zput(child);
51257 + return result;
51258 + } else {
51259 + if (child == NULL)
51260 + child = ERR_PTR(-EIO);
51261 + return PTR_ERR(child);
51262 + }
51263 +}
51264 +
51265 +/* hook called by ->cut_and_kill() method of node plugin just before internal
51266 + item is removed.
51267 +
51268 + This is point where empty node is removed from the tree. Clear parent
51269 + pointer in child, and mark node for pending deletion.
51270 +
51271 + Node will be actually deleted later and in several installations:
51272 +
51273 + . when last lock on this node will be released, node will be removed from
51274 + the sibling list and its lock will be invalidated
51275 +
51276 + . when last reference to this node will be dropped, bitmap will be updated
51277 + and node will be actually removed from the memory.
51278 +
51279 +*/
51280 +int kill_hook_internal(const coord_t * item /* coord of item */ ,
51281 + pos_in_node_t from UNUSED_ARG /* start unit */ ,
51282 + pos_in_node_t count UNUSED_ARG /* stop unit */ ,
51283 + struct carry_kill_data *p UNUSED_ARG)
51284 +{
51285 + znode *child;
51286 +
51287 + assert("nikita-1222", item != NULL);
51288 + assert("nikita-1224", from == 0);
51289 + assert("nikita-1225", count == 1);
51290 +
51291 + child = znode_at(item, item->node);
51292 + if (IS_ERR(child))
51293 + return PTR_ERR(child);
51294 + else if (node_is_empty(child)) {
51295 + reiser4_tree *tree;
51296 +
51297 + assert("nikita-1397", znode_is_write_locked(child));
51298 + assert("nikita-1398", child->c_count == 0);
51299 + assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
51300 +
51301 + tree = znode_get_tree(item->node);
51302 + write_lock_tree(tree);
51303 + init_parent_coord(&child->in_parent, NULL);
51304 + --item->node->c_count;
51305 + write_unlock_tree(tree);
51306 + zput(child);
51307 + return 0;
51308 + } else {
51309 + warning("nikita-1223",
51310 + "Cowardly refuse to remove link to non-empty node");
51311 + zput(child);
51312 + return RETERR(-EIO);
51313 + }
51314 +}
51315 +
51316 +/* hook called by ->shift() node plugin method when iternal item was just
51317 + moved from one node to another.
51318 +
51319 + Update parent pointer in child and c_counts in old and new parent
51320 +
51321 +*/
51322 +int shift_hook_internal(const coord_t * item /* coord of item */ ,
51323 + unsigned from UNUSED_ARG /* start unit */ ,
51324 + unsigned count UNUSED_ARG /* stop unit */ ,
51325 + znode * old_node /* old parent */ )
51326 +{
51327 + znode *child;
51328 + znode *new_node;
51329 + reiser4_tree *tree;
51330 +
51331 + assert("nikita-1276", item != NULL);
51332 + assert("nikita-1277", from == 0);
51333 + assert("nikita-1278", count == 1);
51334 + assert("nikita-1451", item->unit_pos == 0);
51335 +
51336 + new_node = item->node;
51337 + assert("nikita-2132", new_node != old_node);
51338 + tree = znode_get_tree(item->node);
51339 + child = child_znode(item, old_node, 1, 0);
51340 + if (child == NULL)
51341 + return 0;
51342 + if (!IS_ERR(child)) {
51343 + write_lock_tree(tree);
51344 + ++new_node->c_count;
51345 + assert("nikita-1395", znode_parent(child) == old_node);
51346 + assert("nikita-1396", old_node->c_count > 0);
51347 + coord_to_parent_coord(item, &child->in_parent);
51348 + assert("nikita-1781", znode_parent(child) == new_node);
51349 + assert("nikita-1782",
51350 + check_tree_pointer(item, child) == NS_FOUND);
51351 + --old_node->c_count;
51352 + write_unlock_tree(tree);
51353 + zput(child);
51354 + return 0;
51355 + } else
51356 + return PTR_ERR(child);
51357 +}
51358 +
51359 +/* plugin->u.item.b.max_key_inside - not defined */
51360 +
51361 +/* plugin->u.item.b.nr_units - item.c:single_unit */
51362 +
51363 +/* Make Linus happy.
51364 + Local variables:
51365 + c-indentation-style: "K&R"
51366 + mode-name: "LC"
51367 + c-basic-offset: 8
51368 + tab-width: 8
51369 + fill-column: 120
51370 + End:
51371 +*/
51372 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/internal.h linux-2.6.20/fs/reiser4/plugin/item/internal.h
51373 --- linux-2.6.20.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 03:00:00.000000000 +0300
51374 +++ linux-2.6.20/fs/reiser4/plugin/item/internal.h 2007-05-06 14:50:43.815011970 +0400
51375 @@ -0,0 +1,57 @@
51376 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51377 +/* Internal item contains down-link to the child of the internal/twig
51378 + node in a tree. It is internal items that are actually used during
51379 + tree traversal. */
51380 +
51381 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
51382 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
51383 +
51384 +#include "../../forward.h"
51385 +#include "../../dformat.h"
51386 +
51387 +/* on-disk layout of internal item */
51388 +typedef struct internal_item_layout {
51389 + /* 0 */ reiser4_dblock_nr pointer;
51390 + /* 4 */
51391 +} internal_item_layout;
51392 +
51393 +struct cut_list;
51394 +
51395 +int mergeable_internal(const coord_t * p1, const coord_t * p2);
51396 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
51397 + coord_t * coord);
51398 +/* store pointer from internal item into "block". Implementation of
51399 + ->down_link() method */
51400 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
51401 + reiser4_block_nr * block);
51402 +extern int has_pointer_to_internal(const coord_t * coord,
51403 + const reiser4_block_nr * block);
51404 +extern int create_hook_internal(const coord_t * item, void *arg);
51405 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
51406 + pos_in_node_t count, struct carry_kill_data *);
51407 +extern int shift_hook_internal(const coord_t * item, unsigned from,
51408 + unsigned count, znode * old_node);
51409 +extern void reiser4_print_internal(const char *prefix, coord_t * coord);
51410 +
51411 +extern int utmost_child_internal(const coord_t * coord, sideof side,
51412 + jnode ** child);
51413 +int utmost_child_real_block_internal(const coord_t * coord, sideof side,
51414 + reiser4_block_nr * block);
51415 +
51416 +extern void reiser4_update_internal(const coord_t * coord,
51417 + const reiser4_block_nr * blocknr);
51418 +/* FIXME: reiserfs has check_internal */
51419 +extern int check__internal(const coord_t * coord, const char **error);
51420 +
51421 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
51422 +#endif
51423 +
51424 +/* Make Linus happy.
51425 + Local variables:
51426 + c-indentation-style: "K&R"
51427 + mode-name: "LC"
51428 + c-basic-offset: 8
51429 + tab-width: 8
51430 + fill-column: 120
51431 + End:
51432 +*/
51433 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/item.c linux-2.6.20/fs/reiser4/plugin/item/item.c
51434 --- linux-2.6.20.orig/fs/reiser4/plugin/item/item.c 1970-01-01 03:00:00.000000000 +0300
51435 +++ linux-2.6.20/fs/reiser4/plugin/item/item.c 2007-05-06 14:50:43.815011970 +0400
51436 @@ -0,0 +1,719 @@
51437 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51438 +
51439 +/* definition of item plugins. */
51440 +
51441 +#include "../../forward.h"
51442 +#include "../../debug.h"
51443 +#include "../../key.h"
51444 +#include "../../coord.h"
51445 +#include "../plugin_header.h"
51446 +#include "sde.h"
51447 +#include "internal.h"
51448 +#include "item.h"
51449 +#include "static_stat.h"
51450 +#include "../plugin.h"
51451 +#include "../../znode.h"
51452 +#include "../../tree.h"
51453 +#include "../../context.h"
51454 +#include "ctail.h"
51455 +
51456 +/* return pointer to item body */
51457 +void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
51458 +{
51459 + assert("nikita-324", coord != NULL);
51460 + assert("nikita-325", coord->node != NULL);
51461 + assert("nikita-326", znode_is_loaded(coord->node));
51462 + assert("nikita-3200", coord->offset == INVALID_OFFSET);
51463 +
51464 + coord->offset =
51465 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
51466 + zdata(coord->node);
51467 + ON_DEBUG(coord->body_v = coord->node->times_locked);
51468 +}
51469 +
51470 +void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
51471 +{
51472 + return zdata(coord->node) + coord->offset;
51473 +}
51474 +
51475 +#if REISER4_DEBUG
51476 +
51477 +int item_body_is_valid(const coord_t * coord)
51478 +{
51479 + return
51480 + coord->offset ==
51481 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
51482 + zdata(coord->node);
51483 +}
51484 +
51485 +#endif
51486 +
51487 +/* return length of item at @coord */
51488 +pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
51489 +{
51490 + int len;
51491 +
51492 + assert("nikita-327", coord != NULL);
51493 + assert("nikita-328", coord->node != NULL);
51494 + assert("nikita-329", znode_is_loaded(coord->node));
51495 +
51496 + len = node_plugin_by_node(coord->node)->length_by_coord(coord);
51497 + return len;
51498 +}
51499 +
51500 +void obtain_item_plugin(const coord_t * coord)
51501 +{
51502 + assert("nikita-330", coord != NULL);
51503 + assert("nikita-331", coord->node != NULL);
51504 + assert("nikita-332", znode_is_loaded(coord->node));
51505 +
51506 + coord_set_iplug((coord_t *) coord,
51507 + node_plugin_by_node(coord->node)->
51508 + plugin_by_coord(coord));
51509 + assert("nikita-2479",
51510 + coord_iplug(coord) ==
51511 + node_plugin_by_node(coord->node)->plugin_by_coord(coord));
51512 +}
51513 +
51514 +/* return id of item */
51515 +/* Audited by: green(2002.06.15) */
51516 +item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
51517 +{
51518 + assert("vs-539", coord != NULL);
51519 + assert("vs-538", coord->node != NULL);
51520 + assert("vs-537", znode_is_loaded(coord->node));
51521 + assert("vs-536", item_plugin_by_coord(coord) != NULL);
51522 + assert("vs-540",
51523 + item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
51524 +
51525 + return item_id_by_plugin(item_plugin_by_coord(coord));
51526 +}
51527 +
51528 +/* return key of item at @coord */
51529 +/* Audited by: green(2002.06.15) */
51530 +reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
51531 + reiser4_key * key /* result */ )
51532 +{
51533 + assert("nikita-338", coord != NULL);
51534 + assert("nikita-339", coord->node != NULL);
51535 + assert("nikita-340", znode_is_loaded(coord->node));
51536 +
51537 + return node_plugin_by_node(coord->node)->key_at(coord, key);
51538 +}
51539 +
51540 +/* this returns max key in the item */
51541 +reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
51542 + reiser4_key * key /* result */ )
51543 +{
51544 + coord_t last;
51545 +
51546 + assert("nikita-338", coord != NULL);
51547 + assert("nikita-339", coord->node != NULL);
51548 + assert("nikita-340", znode_is_loaded(coord->node));
51549 +
51550 + /* make coord pointing to last item's unit */
51551 + coord_dup(&last, coord);
51552 + last.unit_pos = coord_num_units(&last) - 1;
51553 + assert("vs-1560", coord_is_existing_unit(&last));
51554 +
51555 + max_unit_key_by_coord(&last, key);
51556 + return key;
51557 +}
51558 +
51559 +/* return key of unit at @coord */
51560 +reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51561 + reiser4_key * key /* result */ )
51562 +{
51563 + assert("nikita-772", coord != NULL);
51564 + assert("nikita-774", coord->node != NULL);
51565 + assert("nikita-775", znode_is_loaded(coord->node));
51566 +
51567 + if (item_plugin_by_coord(coord)->b.unit_key != NULL)
51568 + return item_plugin_by_coord(coord)->b.unit_key(coord, key);
51569 + else
51570 + return item_key_by_coord(coord, key);
51571 +}
51572 +
51573 +/* return the biggest key contained the unit @coord */
51574 +reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51575 + reiser4_key * key /* result */ )
51576 +{
51577 + assert("nikita-772", coord != NULL);
51578 + assert("nikita-774", coord->node != NULL);
51579 + assert("nikita-775", znode_is_loaded(coord->node));
51580 +
51581 + if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
51582 + return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
51583 + else
51584 + return unit_key_by_coord(coord, key);
51585 +}
51586 +
51587 +/* ->max_key_inside() method for items consisting of exactly one key (like
51588 + stat-data) */
51589 +static reiser4_key *max_key_inside_single_key(const coord_t *
51590 + coord /* coord of item */ ,
51591 + reiser4_key *
51592 + result /* resulting key */ )
51593 +{
51594 + assert("nikita-604", coord != NULL);
51595 +
51596 + /* coord -> key is starting key of this item and it has to be already
51597 + filled in */
51598 + return unit_key_by_coord(coord, result);
51599 +}
51600 +
51601 +/* ->nr_units() method for items consisting of exactly one unit always */
51602 +pos_in_node_t
51603 +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
51604 +{
51605 + return 1;
51606 +}
51607 +
51608 +static int
51609 +paste_no_paste(coord_t * coord UNUSED_ARG,
51610 + reiser4_item_data * data UNUSED_ARG,
51611 + carry_plugin_info * info UNUSED_ARG)
51612 +{
51613 + return 0;
51614 +}
51615 +
51616 +/* default ->fast_paste() method */
51617 +static int
51618 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
51619 +{
51620 + return 1;
51621 +}
51622 +
51623 +int item_can_contain_key(const coord_t * item /* coord of item */ ,
51624 + const reiser4_key * key /* key to check */ ,
51625 + const reiser4_item_data * data /* parameters of item
51626 + * being created */ )
51627 +{
51628 + item_plugin *iplug;
51629 + reiser4_key min_key_in_item;
51630 + reiser4_key max_key_in_item;
51631 +
51632 + assert("nikita-1658", item != NULL);
51633 + assert("nikita-1659", key != NULL);
51634 +
51635 + iplug = item_plugin_by_coord(item);
51636 + if (iplug->b.can_contain_key != NULL)
51637 + return iplug->b.can_contain_key(item, key, data);
51638 + else {
51639 + assert("nikita-1681", iplug->b.max_key_inside != NULL);
51640 + item_key_by_coord(item, &min_key_in_item);
51641 + iplug->b.max_key_inside(item, &max_key_in_item);
51642 +
51643 + /* can contain key if
51644 + min_key_in_item <= key &&
51645 + key <= max_key_in_item
51646 + */
51647 + return keyle(&min_key_in_item, key)
51648 + && keyle(key, &max_key_in_item);
51649 + }
51650 +}
51651 +
51652 +/* mergeable method for non mergeable items */
51653 +static int
51654 +not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
51655 +{
51656 + return 0;
51657 +}
51658 +
51659 +/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */
51660 +int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
51661 + const coord_t * i2 /* coord of second item */ )
51662 +{
51663 + item_plugin *iplug;
51664 + reiser4_key k1;
51665 + reiser4_key k2;
51666 +
51667 + assert("nikita-1336", i1 != NULL);
51668 + assert("nikita-1337", i2 != NULL);
51669 +
51670 + iplug = item_plugin_by_coord(i1);
51671 + assert("nikita-1338", iplug != NULL);
51672 +
51673 + /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
51674 + shifting code when nodes are in "suspended" state. */
51675 + assert("nikita-1663",
51676 + keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
51677 +
51678 + if (iplug->b.mergeable != NULL) {
51679 + return iplug->b.mergeable(i1, i2);
51680 + } else if (iplug->b.max_key_inside != NULL) {
51681 + iplug->b.max_key_inside(i1, &k1);
51682 + item_key_by_coord(i2, &k2);
51683 +
51684 + /* mergeable if ->max_key_inside() >= key of i2; */
51685 + return keyge(iplug->b.max_key_inside(i1, &k1),
51686 + item_key_by_coord(i2, &k2));
51687 + } else {
51688 + item_key_by_coord(i1, &k1);
51689 + item_key_by_coord(i2, &k2);
51690 +
51691 + return
51692 + (get_key_locality(&k1) == get_key_locality(&k2)) &&
51693 + (get_key_objectid(&k1) == get_key_objectid(&k2))
51694 + && (iplug == item_plugin_by_coord(i2));
51695 + }
51696 +}
51697 +
51698 +int item_is_extent(const coord_t * item)
51699 +{
51700 + assert("vs-482", coord_is_existing_item(item));
51701 + return item_id_by_coord(item) == EXTENT_POINTER_ID;
51702 +}
51703 +
51704 +int item_is_tail(const coord_t * item)
51705 +{
51706 + assert("vs-482", coord_is_existing_item(item));
51707 + return item_id_by_coord(item) == FORMATTING_ID;
51708 +}
51709 +
51710 +#if REISER4_DEBUG
51711 +
51712 +int item_is_statdata(const coord_t * item)
51713 +{
51714 + assert("vs-516", coord_is_existing_item(item));
51715 + return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
51716 +}
51717 +
51718 +int item_is_ctail(const coord_t * item)
51719 +{
51720 + assert("edward-xx", coord_is_existing_item(item));
51721 + return item_id_by_coord(item) == CTAIL_ID;
51722 +}
51723 +
51724 +#endif /* REISER4_DEBUG */
51725 +
51726 +static int change_item(struct inode *inode,
51727 + reiser4_plugin * plugin,
51728 + pset_member memb)
51729 +{
51730 + /* cannot change constituent item (sd, or dir_item) */
51731 + return RETERR(-EINVAL);
51732 +}
51733 +
51734 +static reiser4_plugin_ops item_plugin_ops = {
51735 + .init = NULL,
51736 + .load = NULL,
51737 + .save_len = NULL,
51738 + .save = NULL,
51739 + .change = change_item
51740 +};
51741 +
51742 +item_plugin item_plugins[LAST_ITEM_ID] = {
51743 + [STATIC_STAT_DATA_ID] = {
51744 + .h = {
51745 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51746 + .id = STATIC_STAT_DATA_ID,
51747 + .groups = (1 << STAT_DATA_ITEM_TYPE),
51748 + .pops = &item_plugin_ops,
51749 + .label = "sd",
51750 + .desc = "stat-data",
51751 + .linkage = {NULL, NULL}
51752 + },
51753 + .b = {
51754 + .max_key_inside = max_key_inside_single_key,
51755 + .can_contain_key = NULL,
51756 + .mergeable = not_mergeable,
51757 + .nr_units = nr_units_single_unit,
51758 + .lookup = NULL,
51759 + .init = NULL,
51760 + .paste = paste_no_paste,
51761 + .fast_paste = NULL,
51762 + .can_shift = NULL,
51763 + .copy_units = NULL,
51764 + .create_hook = NULL,
51765 + .kill_hook = NULL,
51766 + .shift_hook = NULL,
51767 + .cut_units = NULL,
51768 + .kill_units = NULL,
51769 + .unit_key = NULL,
51770 + .max_unit_key = NULL,
51771 + .estimate = NULL,
51772 + .item_data_by_flow = NULL,
51773 +#if REISER4_DEBUG
51774 + .check = NULL
51775 +#endif
51776 + },
51777 + .f = {
51778 + .utmost_child = NULL,
51779 + .utmost_child_real_block = NULL,
51780 + .update = NULL,
51781 + .scan = NULL,
51782 + .convert = NULL
51783 + },
51784 + .s = {
51785 + .sd = {
51786 + .init_inode = init_inode_static_sd,
51787 + .save_len = save_len_static_sd,
51788 + .save = save_static_sd
51789 + }
51790 + }
51791 + },
51792 + [SIMPLE_DIR_ENTRY_ID] = {
51793 + .h = {
51794 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51795 + .id = SIMPLE_DIR_ENTRY_ID,
51796 + .groups = (1 << DIR_ENTRY_ITEM_TYPE),
51797 + .pops = &item_plugin_ops,
51798 + .label = "de",
51799 + .desc = "directory entry",
51800 + .linkage = {NULL, NULL}
51801 + },
51802 + .b = {
51803 + .max_key_inside = max_key_inside_single_key,
51804 + .can_contain_key = NULL,
51805 + .mergeable = NULL,
51806 + .nr_units = nr_units_single_unit,
51807 + .lookup = NULL,
51808 + .init = NULL,
51809 + .paste = NULL,
51810 + .fast_paste = NULL,
51811 + .can_shift = NULL,
51812 + .copy_units = NULL,
51813 + .create_hook = NULL,
51814 + .kill_hook = NULL,
51815 + .shift_hook = NULL,
51816 + .cut_units = NULL,
51817 + .kill_units = NULL,
51818 + .unit_key = NULL,
51819 + .max_unit_key = NULL,
51820 + .estimate = NULL,
51821 + .item_data_by_flow = NULL,
51822 +#if REISER4_DEBUG
51823 + .check = NULL
51824 +#endif
51825 + },
51826 + .f = {
51827 + .utmost_child = NULL,
51828 + .utmost_child_real_block = NULL,
51829 + .update = NULL,
51830 + .scan = NULL,
51831 + .convert = NULL
51832 + },
51833 + .s = {
51834 + .dir = {
51835 + .extract_key = extract_key_de,
51836 + .update_key = update_key_de,
51837 + .extract_name = extract_name_de,
51838 + .extract_file_type = extract_file_type_de,
51839 + .add_entry = add_entry_de,
51840 + .rem_entry = rem_entry_de,
51841 + .max_name_len = max_name_len_de
51842 + }
51843 + }
51844 + },
51845 + [COMPOUND_DIR_ID] = {
51846 + .h = {
51847 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51848 + .id = COMPOUND_DIR_ID,
51849 + .groups = (1 << DIR_ENTRY_ITEM_TYPE),
51850 + .pops = &item_plugin_ops,
51851 + .label = "cde",
51852 + .desc = "compressed directory entry",
51853 + .linkage = {NULL, NULL}
51854 + },
51855 + .b = {
51856 + .max_key_inside = max_key_inside_cde,
51857 + .can_contain_key = can_contain_key_cde,
51858 + .mergeable = mergeable_cde,
51859 + .nr_units = nr_units_cde,
51860 + .lookup = lookup_cde,
51861 + .init = init_cde,
51862 + .paste = paste_cde,
51863 + .fast_paste = agree_to_fast_op,
51864 + .can_shift = can_shift_cde,
51865 + .copy_units = copy_units_cde,
51866 + .create_hook = NULL,
51867 + .kill_hook = NULL,
51868 + .shift_hook = NULL,
51869 + .cut_units = cut_units_cde,
51870 + .kill_units = kill_units_cde,
51871 + .unit_key = unit_key_cde,
51872 + .max_unit_key = unit_key_cde,
51873 + .estimate = estimate_cde,
51874 + .item_data_by_flow = NULL,
51875 +#if REISER4_DEBUG
51876 + .check = reiser4_check_cde
51877 +#endif
51878 + },
51879 + .f = {
51880 + .utmost_child = NULL,
51881 + .utmost_child_real_block = NULL,
51882 + .update = NULL,
51883 + .scan = NULL,
51884 + .convert = NULL
51885 + },
51886 + .s = {
51887 + .dir = {
51888 + .extract_key = extract_key_cde,
51889 + .update_key = update_key_cde,
51890 + .extract_name = extract_name_cde,
51891 + .extract_file_type = extract_file_type_de,
51892 + .add_entry = add_entry_cde,
51893 + .rem_entry = rem_entry_cde,
51894 + .max_name_len = max_name_len_cde
51895 + }
51896 + }
51897 + },
51898 + [NODE_POINTER_ID] = {
51899 + .h = {
51900 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51901 + .id = NODE_POINTER_ID,
51902 + .groups = (1 << INTERNAL_ITEM_TYPE),
51903 + .pops = NULL,
51904 + .label = "internal",
51905 + .desc = "internal item",
51906 + .linkage = {NULL, NULL}
51907 + },
51908 + .b = {
51909 + .max_key_inside = NULL,
51910 + .can_contain_key = NULL,
51911 + .mergeable = mergeable_internal,
51912 + .nr_units = nr_units_single_unit,
51913 + .lookup = lookup_internal,
51914 + .init = NULL,
51915 + .paste = NULL,
51916 + .fast_paste = NULL,
51917 + .can_shift = NULL,
51918 + .copy_units = NULL,
51919 + .create_hook = create_hook_internal,
51920 + .kill_hook = kill_hook_internal,
51921 + .shift_hook = shift_hook_internal,
51922 + .cut_units = NULL,
51923 + .kill_units = NULL,
51924 + .unit_key = NULL,
51925 + .max_unit_key = NULL,
51926 + .estimate = NULL,
51927 + .item_data_by_flow = NULL,
51928 +#if REISER4_DEBUG
51929 + .check = check__internal
51930 +#endif
51931 + },
51932 + .f = {
51933 + .utmost_child = utmost_child_internal,
51934 + .utmost_child_real_block =
51935 + utmost_child_real_block_internal,
51936 + .update = reiser4_update_internal,
51937 + .scan = NULL,
51938 + .convert = NULL
51939 + },
51940 + .s = {
51941 + .internal = {
51942 + .down_link = down_link_internal,
51943 + .has_pointer_to = has_pointer_to_internal
51944 + }
51945 + }
51946 + },
51947 + [EXTENT_POINTER_ID] = {
51948 + .h = {
51949 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51950 + .id = EXTENT_POINTER_ID,
51951 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
51952 + .pops = NULL,
51953 + .label = "extent",
51954 + .desc = "extent item",
51955 + .linkage = {NULL, NULL}
51956 + },
51957 + .b = {
51958 + .max_key_inside = max_key_inside_extent,
51959 + .can_contain_key = can_contain_key_extent,
51960 + .mergeable = mergeable_extent,
51961 + .nr_units = nr_units_extent,
51962 + .lookup = lookup_extent,
51963 + .init = NULL,
51964 + .paste = paste_extent,
51965 + .fast_paste = agree_to_fast_op,
51966 + .can_shift = can_shift_extent,
51967 + .create_hook = create_hook_extent,
51968 + .copy_units = copy_units_extent,
51969 + .kill_hook = kill_hook_extent,
51970 + .shift_hook = NULL,
51971 + .cut_units = cut_units_extent,
51972 + .kill_units = kill_units_extent,
51973 + .unit_key = unit_key_extent,
51974 + .max_unit_key = max_unit_key_extent,
51975 + .estimate = NULL,
51976 + .item_data_by_flow = NULL,
51977 +#if REISER4_DEBUG
51978 + .check = reiser4_check_extent
51979 +#endif
51980 + },
51981 + .f = {
51982 + .utmost_child = utmost_child_extent,
51983 + .utmost_child_real_block =
51984 + utmost_child_real_block_extent,
51985 + .update = NULL,
51986 + .scan = reiser4_scan_extent,
51987 + .convert = NULL,
51988 + .key_by_offset = key_by_offset_extent
51989 + },
51990 + .s = {
51991 + .file = {
51992 + .write = reiser4_write_extent,
51993 + .read = reiser4_read_extent,
51994 + .readpage = reiser4_readpage_extent,
51995 + .get_block = get_block_address_extent,
51996 + .append_key = append_key_extent,
51997 + .init_coord_extension =
51998 + init_coord_extension_extent
51999 + }
52000 + }
52001 + },
52002 + [FORMATTING_ID] = {
52003 + .h = {
52004 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52005 + .id = FORMATTING_ID,
52006 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
52007 + .pops = NULL,
52008 + .label = "body",
52009 + .desc = "body (or tail?) item",
52010 + .linkage = {NULL, NULL}
52011 + },
52012 + .b = {
52013 + .max_key_inside = max_key_inside_tail,
52014 + .can_contain_key = can_contain_key_tail,
52015 + .mergeable = mergeable_tail,
52016 + .nr_units = nr_units_tail,
52017 + .lookup = lookup_tail,
52018 + .init = NULL,
52019 + .paste = paste_tail,
52020 + .fast_paste = agree_to_fast_op,
52021 + .can_shift = can_shift_tail,
52022 + .create_hook = NULL,
52023 + .copy_units = copy_units_tail,
52024 + .kill_hook = kill_hook_tail,
52025 + .shift_hook = NULL,
52026 + .cut_units = cut_units_tail,
52027 + .kill_units = kill_units_tail,
52028 + .unit_key = unit_key_tail,
52029 + .max_unit_key = unit_key_tail,
52030 + .estimate = NULL,
52031 + .item_data_by_flow = NULL,
52032 +#if REISER4_DEBUG
52033 + .check = NULL
52034 +#endif
52035 + },
52036 + .f = {
52037 + .utmost_child = NULL,
52038 + .utmost_child_real_block = NULL,
52039 + .update = NULL,
52040 + .scan = NULL,
52041 + .convert = NULL
52042 + },
52043 + .s = {
52044 + .file = {
52045 + .write = reiser4_write_tail,
52046 + .read = reiser4_read_tail,
52047 + .readpage = readpage_tail,
52048 + .get_block = get_block_address_tail,
52049 + .append_key = append_key_tail,
52050 + .init_coord_extension =
52051 + init_coord_extension_tail
52052 + }
52053 + }
52054 + },
52055 + [CTAIL_ID] = {
52056 + .h = {
52057 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52058 + .id = CTAIL_ID,
52059 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
52060 + .pops = NULL,
52061 + .label = "ctail",
52062 + .desc = "cryptcompress tail item",
52063 + .linkage = {NULL, NULL}
52064 + },
52065 + .b = {
52066 + .max_key_inside = max_key_inside_tail,
52067 + .can_contain_key = can_contain_key_ctail,
52068 + .mergeable = mergeable_ctail,
52069 + .nr_units = nr_units_ctail,
52070 + .lookup = NULL,
52071 + .init = init_ctail,
52072 + .paste = paste_ctail,
52073 + .fast_paste = agree_to_fast_op,
52074 + .can_shift = can_shift_ctail,
52075 + .create_hook = create_hook_ctail,
52076 + .copy_units = copy_units_ctail,
52077 + .kill_hook = kill_hook_ctail,
52078 + .shift_hook = shift_hook_ctail,
52079 + .cut_units = cut_units_ctail,
52080 + .kill_units = kill_units_ctail,
52081 + .unit_key = unit_key_tail,
52082 + .max_unit_key = unit_key_tail,
52083 + .estimate = estimate_ctail,
52084 + .item_data_by_flow = NULL,
52085 +#if REISER4_DEBUG
52086 + .check = check_ctail
52087 +#endif
52088 + },
52089 + .f = {
52090 + .utmost_child = utmost_child_ctail,
52091 + /* FIXME-EDWARD: write this */
52092 + .utmost_child_real_block = NULL,
52093 + .update = NULL,
52094 + .scan = scan_ctail,
52095 + .convert = convert_ctail
52096 + },
52097 + .s = {
52098 + .file = {
52099 + .write = NULL,
52100 + .read = read_ctail,
52101 + .readpage = readpage_ctail,
52102 + .get_block = get_block_address_tail,
52103 + .append_key = append_key_ctail,
52104 + .init_coord_extension =
52105 + init_coord_extension_tail
52106 + }
52107 + }
52108 + },
52109 + [BLACK_BOX_ID] = {
52110 + .h = {
52111 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52112 + .id = BLACK_BOX_ID,
52113 + .groups = (1 << OTHER_ITEM_TYPE),
52114 + .pops = NULL,
52115 + .label = "blackbox",
52116 + .desc = "black box item",
52117 + .linkage = {NULL, NULL}
52118 + },
52119 + .b = {
52120 + .max_key_inside = NULL,
52121 + .can_contain_key = NULL,
52122 + .mergeable = not_mergeable,
52123 + .nr_units = nr_units_single_unit,
52124 + /* no need for ->lookup method */
52125 + .lookup = NULL,
52126 + .init = NULL,
52127 + .paste = NULL,
52128 + .fast_paste = NULL,
52129 + .can_shift = NULL,
52130 + .copy_units = NULL,
52131 + .create_hook = NULL,
52132 + .kill_hook = NULL,
52133 + .shift_hook = NULL,
52134 + .cut_units = NULL,
52135 + .kill_units = NULL,
52136 + .unit_key = NULL,
52137 + .max_unit_key = NULL,
52138 + .estimate = NULL,
52139 + .item_data_by_flow = NULL,
52140 +#if REISER4_DEBUG
52141 + .check = NULL
52142 +#endif
52143 + }
52144 + }
52145 +};
52146 +
52147 +/* Make Linus happy.
52148 + Local variables:
52149 + c-indentation-style: "K&R"
52150 + mode-name: "LC"
52151 + c-basic-offset: 8
52152 + tab-width: 8
52153 + fill-column: 120
52154 + End:
52155 +*/
52156 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/item.h linux-2.6.20/fs/reiser4/plugin/item/item.h
52157 --- linux-2.6.20.orig/fs/reiser4/plugin/item/item.h 1970-01-01 03:00:00.000000000 +0300
52158 +++ linux-2.6.20/fs/reiser4/plugin/item/item.h 2007-05-06 14:50:43.819013220 +0400
52159 @@ -0,0 +1,400 @@
52160 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52161 +
52162 +/* first read balance.c comments before reading this */
52163 +
52164 +/* An item_plugin implements all of the operations required for
52165 + balancing that are item specific. */
52166 +
52167 +/* an item plugin also implements other operations that are specific to that
52168 + item. These go into the item specific operations portion of the item
52169 + handler, and all of the item specific portions of the item handler are put
52170 + into a union. */
52171 +
52172 +#if !defined( __REISER4_ITEM_H__ )
52173 +#define __REISER4_ITEM_H__
52174 +
52175 +#include "../../forward.h"
52176 +#include "../plugin_header.h"
52177 +#include "../../dformat.h"
52178 +#include "../../seal.h"
52179 +#include "../../plugin/file/file.h"
52180 +
52181 +#include <linux/fs.h> /* for struct file, struct inode */
52182 +#include <linux/mm.h> /* for struct page */
52183 +#include <linux/dcache.h> /* for struct dentry */
52184 +
52185 +typedef enum {
52186 + STAT_DATA_ITEM_TYPE,
52187 + DIR_ENTRY_ITEM_TYPE,
52188 + INTERNAL_ITEM_TYPE,
52189 + UNIX_FILE_METADATA_ITEM_TYPE,
52190 + OTHER_ITEM_TYPE
52191 +} item_type_id;
52192 +
52193 +/* this is the part of each item plugin that all items are expected to
52194 + support or at least explicitly fail to support by setting the
52195 + pointer to null. */
52196 +typedef struct {
52197 + /* operations called by balancing
52198 +
52199 + It is interesting to consider that some of these item
52200 + operations could be given sources or targets that are not
52201 + really items in nodes. This could be ok/useful.
52202 +
52203 + */
52204 + /* maximal key that can _possibly_ be occupied by this item
52205 +
52206 + When inserting, and node ->lookup() method (called by
52207 + coord_by_key()) reaches an item after binary search,
52208 + the ->max_key_inside() item plugin method is used to determine
52209 + whether new item should pasted into existing item
52210 + (new_key<=max_key_inside()) or new item has to be created
52211 + (new_key>max_key_inside()).
52212 +
52213 + For items that occupy exactly one key (like stat-data)
52214 + this method should return this key. For items that can
52215 + grow indefinitely (extent, directory item) this should
52216 + return reiser4_max_key().
52217 +
52218 + For example extent with the key
52219 +
52220 + (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52221 +
52222 + ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff), and
52223 + */
52224 + reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
52225 +
52226 + /* true if item @coord can merge data at @key. */
52227 + int (*can_contain_key) (const coord_t *, const reiser4_key *,
52228 + const reiser4_item_data *);
52229 + /* mergeable() - check items for mergeability
52230 +
52231 + Optional method. Returns true if two items can be merged.
52232 +
52233 + */
52234 + int (*mergeable) (const coord_t *, const coord_t *);
52235 +
52236 + /* number of atomic things in an item.
52237 + NOTE FOR CONTRIBUTORS: use a generic method
52238 + nr_units_single_unit() for solid (atomic) items, as
52239 + tree operations use it as a criterion of solidness
52240 + (see is_solid_item macro) */
52241 + pos_in_node_t(*nr_units) (const coord_t *);
52242 +
52243 + /* search within item for a unit within the item, and return a
52244 + pointer to it. This can be used to calculate how many
52245 + bytes to shrink an item if you use pointer arithmetic and
52246 + compare to the start of the item body if the item's data
52247 + are continuous in the node, if the item's data are not
52248 + continuous in the node, all sorts of other things are maybe
52249 + going to break as well. */
52250 + lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
52251 + /* method called by node_plugin->create_item() to initialise new
52252 + item */
52253 + int (*init) (coord_t * target, coord_t * from,
52254 + reiser4_item_data * data);
52255 + /* method called (e.g., by reiser4_resize_item()) to place new data
52256 + into item when it grows */
52257 + int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
52258 + /* return true if paste into @coord is allowed to skip
52259 + carry. That is, if such paste would require any changes
52260 + at the parent level
52261 + */
52262 + int (*fast_paste) (const coord_t *);
52263 + /* how many but not more than @want units of @source can be
52264 + shifted into @target node. If pend == append - we try to
52265 + append last item of @target by first units of @source. If
52266 + pend == prepend - we try to "prepend" first item in @target
52267 + by last units of @source. @target node has @free_space
52268 + bytes of free space. Total size of those units are returned
52269 + via @size.
52270 +
52271 + @target is not NULL if shifting to the mergeable item and
52272 + NULL if new item will be created during shifting.
52273 + */
52274 + int (*can_shift) (unsigned free_space, coord_t *,
52275 + znode *, shift_direction, unsigned *size,
52276 + unsigned want);
52277 +
52278 + /* starting off @from-th unit of item @source append or
52279 + prepend @count units to @target. @target has been already
52280 + expanded by @free_space bytes. That must be exactly what is
52281 + needed for those items in @target. If @where_is_free_space
52282 + == SHIFT_LEFT - free space is at the end of @target item,
52283 + otherwise - it is in the beginning of it. */
52284 + void (*copy_units) (coord_t *, coord_t *,
52285 + unsigned from, unsigned count,
52286 + shift_direction where_is_free_space,
52287 + unsigned free_space);
52288 +
52289 + int (*create_hook) (const coord_t *, void *);
52290 + /* do whatever is necessary to do when @count units starting
52291 + from @from-th one are removed from the tree */
52292 + /* FIXME-VS: this is used to be here for, in particular,
52293 + extents and items of internal type to free blocks they point
52294 + to at the same time with removing items from a
52295 + tree. Problems start, however, when dealloc_block fails due
52296 + to some reason. Item gets removed, but blocks it pointed to
52297 + are not freed. It is not clear how to fix this for items of
52298 + internal type because a need to remove internal item may
52299 + appear in the middle of balancing, and there is no way to
52300 + undo changes made. OTOH, if space allocator involves
52301 + balancing to perform dealloc_block - this will probably
52302 + break balancing due to deadlock issues
52303 + */
52304 + int (*kill_hook) (const coord_t *, pos_in_node_t from,
52305 + pos_in_node_t count, struct carry_kill_data *);
52306 + int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
52307 + znode * _node);
52308 +
52309 + /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
52310 + including boundaries. When units are cut from item beginning - move space which gets freed to head of
52311 + item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
52312 + item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
52313 + @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
52314 + */
52315 + int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52316 + struct carry_cut_data *,
52317 + reiser4_key * smallest_removed,
52318 + reiser4_key * new_first_key);
52319 +
52320 + /* like cut_units, except that these units are removed from the
52321 + tree, not only from a node */
52322 + int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52323 + struct carry_kill_data *,
52324 + reiser4_key * smallest_removed,
52325 + reiser4_key * new_first);
52326 +
52327 + /* if @key_of_coord == 1 - returned key of coord, otherwise -
52328 + key of unit is returned. If @coord is not set to certain
52329 + unit - ERR_PTR(-ENOENT) is returned */
52330 + reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
52331 + reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
52332 + /* estimate how much space is needed for paste @data into item at
52333 + @coord. if @coord==0 - estimate insertion, otherwise - estimate
52334 + pasting
52335 + */
52336 + int (*estimate) (const coord_t *, const reiser4_item_data *);
52337 +
52338 + /* converts flow @f to item data. @coord == 0 on insert */
52339 + int (*item_data_by_flow) (const coord_t *, const flow_t *,
52340 + reiser4_item_data *);
52341 +
52342 + /*void (*show) (struct seq_file *, coord_t *); */
52343 +
52344 +#if REISER4_DEBUG
52345 + /* used for debugging, every item should have here the most
52346 + complete possible check of the consistency of the item that
52347 + the inventor can construct */
52348 + int (*check) (const coord_t *, const char **error);
52349 +#endif
52350 +
52351 +} balance_ops;
52352 +
52353 +typedef struct {
52354 + /* return the right or left child of @coord, only if it is in memory */
52355 + int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
52356 +
52357 + /* return whether the right or left child of @coord has a non-fake
52358 + block number. */
52359 + int (*utmost_child_real_block) (const coord_t *, sideof side,
52360 + reiser4_block_nr *);
52361 + /* relocate child at @coord to the @block */
52362 + void (*update) (const coord_t *, const reiser4_block_nr *);
52363 + /* count unformatted nodes per item for leaf relocation policy, etc.. */
52364 + int (*scan) (flush_scan * scan);
52365 + /* convert item by flush */
52366 + int (*convert) (flush_pos_t * pos);
52367 + /* backward mapping from jnode offset to a key. */
52368 + int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
52369 +} flush_ops;
52370 +
52371 +/* operations specific to the directory item */
52372 +typedef struct {
52373 + /* extract stat-data key from directory entry at @coord and place it
52374 + into @key. */
52375 + int (*extract_key) (const coord_t *, reiser4_key * key);
52376 + /* update object key in item. */
52377 + int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
52378 + /* extract name from directory entry at @coord and return it */
52379 + char *(*extract_name) (const coord_t *, char *buf);
52380 + /* extract file type (DT_* stuff) from directory entry at @coord and
52381 + return it */
52382 + unsigned (*extract_file_type) (const coord_t *);
52383 + int (*add_entry) (struct inode * dir,
52384 + coord_t *, lock_handle *,
52385 + const struct dentry * name,
52386 + reiser4_dir_entry_desc * entry);
52387 + int (*rem_entry) (struct inode * dir, const struct qstr * name,
52388 + coord_t *, lock_handle *,
52389 + reiser4_dir_entry_desc * entry);
52390 + int (*max_name_len) (const struct inode * dir);
52391 +} dir_entry_ops;
52392 +
52393 +/* operations specific to items regular (unix) file metadata are built of */
52394 +typedef struct {
52395 + int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
52396 + int (*read) (struct file *, flow_t *, hint_t *);
52397 + int (*readpage) (void *, struct page *);
52398 + int (*get_block) (const coord_t *, sector_t, sector_t *);
52399 + /*
52400 + * key of first byte which is not addressed by the item @coord is set
52401 + * to.
52402 + * For example, for extent item with the key
52403 + *
52404 + * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52405 + *
52406 + * ->append_key is
52407 + *
52408 + * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
52409 + */
52410 + reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
52411 +
52412 + void (*init_coord_extension) (uf_coord_t *, loff_t);
52413 +} file_ops;
52414 +
52415 +/* operations specific to items of stat data type */
52416 +typedef struct {
52417 + int (*init_inode) (struct inode * inode, char *sd, int len);
52418 + int (*save_len) (struct inode * inode);
52419 + int (*save) (struct inode * inode, char **area);
52420 +} sd_ops;
52421 +
52422 +/* operations specific to internal item */
52423 +typedef struct {
52424 + /* all tree traversal want to know from internal item is where
52425 + to go next. */
52426 + void (*down_link) (const coord_t * coord,
52427 + const reiser4_key * key, reiser4_block_nr * block);
52428 + /* check that given internal item contains given pointer. */
52429 + int (*has_pointer_to) (const coord_t * coord,
52430 + const reiser4_block_nr * block);
52431 +} internal_item_ops;
52432 +
52433 +struct item_plugin {
52434 + /* generic fields */
52435 + plugin_header h;
52436 +
52437 + /* methods common for all item types */
52438 + balance_ops b;
52439 + /* methods used during flush */
52440 + flush_ops f;
52441 +
52442 + /* methods specific to particular type of item */
52443 + union {
52444 + dir_entry_ops dir;
52445 + file_ops file;
52446 + sd_ops sd;
52447 + internal_item_ops internal;
52448 + } s;
52449 +
52450 +};
52451 +
52452 +#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
52453 +
52454 +static inline item_id item_id_by_plugin(item_plugin * plugin)
52455 +{
52456 + return plugin->h.id;
52457 +}
52458 +
52459 +static inline char get_iplugid(item_plugin * iplug)
52460 +{
52461 + assert("nikita-2838", iplug != NULL);
52462 + assert("nikita-2839", iplug->h.id < 0xff);
52463 + return (char)item_id_by_plugin(iplug);
52464 +}
52465 +
52466 +extern unsigned long znode_times_locked(const znode * z);
52467 +
52468 +static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
52469 +{
52470 + assert("nikita-2837", coord != NULL);
52471 + assert("nikita-2838", iplug != NULL);
52472 + coord->iplugid = get_iplugid(iplug);
52473 + ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
52474 +}
52475 +
52476 +static inline item_plugin *coord_iplug(const coord_t * coord)
52477 +{
52478 + assert("nikita-2833", coord != NULL);
52479 + assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
52480 + assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
52481 + return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
52482 + coord->iplugid);
52483 +}
52484 +
52485 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
52486 + const reiser4_item_data *);
52487 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
52488 +extern int item_is_extent(const coord_t *);
52489 +extern int item_is_tail(const coord_t *);
52490 +extern int item_is_statdata(const coord_t * item);
52491 +extern int item_is_ctail(const coord_t *);
52492 +
52493 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
52494 +extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
52495 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
52496 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
52497 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
52498 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
52499 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
52500 + reiser4_key * key);
52501 +extern void obtain_item_plugin(const coord_t * coord);
52502 +
52503 +#if defined(REISER4_DEBUG)
52504 +extern int znode_is_loaded(const znode * node);
52505 +#endif
52506 +
52507 +/* return plugin of item at @coord */
52508 +static inline item_plugin *item_plugin_by_coord(const coord_t *
52509 + coord /* coord to query */ )
52510 +{
52511 + assert("nikita-330", coord != NULL);
52512 + assert("nikita-331", coord->node != NULL);
52513 + assert("nikita-332", znode_is_loaded(coord->node));
52514 +
52515 + if (unlikely(!coord_is_iplug_set(coord)))
52516 + obtain_item_plugin(coord);
52517 + return coord_iplug(coord);
52518 +}
52519 +
52520 +/* this returns true if item is of internal type */
52521 +static inline int item_is_internal(const coord_t * item)
52522 +{
52523 + assert("vs-483", coord_is_existing_item(item));
52524 + return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
52525 +}
52526 +
52527 +extern void item_body_by_coord_hard(coord_t * coord);
52528 +extern void *item_body_by_coord_easy(const coord_t * coord);
52529 +#if REISER4_DEBUG
52530 +extern int item_body_is_valid(const coord_t * coord);
52531 +#endif
52532 +
52533 +/* return pointer to item body */
52534 +static inline void *item_body_by_coord(const coord_t *
52535 + coord /* coord to query */ )
52536 +{
52537 + assert("nikita-324", coord != NULL);
52538 + assert("nikita-325", coord->node != NULL);
52539 + assert("nikita-326", znode_is_loaded(coord->node));
52540 +
52541 + if (coord->offset == INVALID_OFFSET)
52542 + item_body_by_coord_hard((coord_t *) coord);
52543 + assert("nikita-3201", item_body_is_valid(coord));
52544 + assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
52545 + return item_body_by_coord_easy(coord);
52546 +}
52547 +
52548 +/* __REISER4_ITEM_H__ */
52549 +#endif
52550 +/* Make Linus happy.
52551 + Local variables:
52552 + c-indentation-style: "K&R"
52553 + mode-name: "LC"
52554 + c-basic-offset: 8
52555 + tab-width: 8
52556 + fill-column: 120
52557 + scroll-step: 1
52558 + End:
52559 +*/
52560 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/Makefile linux-2.6.20/fs/reiser4/plugin/item/Makefile
52561 --- linux-2.6.20.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 03:00:00.000000000 +0300
52562 +++ linux-2.6.20/fs/reiser4/plugin/item/Makefile 2007-05-06 14:50:43.819013220 +0400
52563 @@ -0,0 +1,18 @@
52564 +obj-$(CONFIG_REISER4_FS) += item_plugins.o
52565 +
52566 +item_plugins-objs := \
52567 + item.o \
52568 + static_stat.o \
52569 + sde.o \
52570 + cde.o \
52571 + blackbox.o \
52572 + internal.o \
52573 + tail.o \
52574 + ctail.o \
52575 + extent.o \
52576 + extent_item_ops.o \
52577 + extent_file_ops.o \
52578 + extent_flush_ops.o
52579 +
52580 +
52581 +
52582 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/sde.c linux-2.6.20/fs/reiser4/plugin/item/sde.c
52583 --- linux-2.6.20.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 03:00:00.000000000 +0300
52584 +++ linux-2.6.20/fs/reiser4/plugin/item/sde.c 2007-05-06 14:50:43.819013220 +0400
52585 @@ -0,0 +1,190 @@
52586 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52587 +
52588 +/* Directory entry implementation */
52589 +#include "../../forward.h"
52590 +#include "../../debug.h"
52591 +#include "../../dformat.h"
52592 +#include "../../kassign.h"
52593 +#include "../../coord.h"
52594 +#include "sde.h"
52595 +#include "item.h"
52596 +#include "../plugin.h"
52597 +#include "../../znode.h"
52598 +#include "../../carry.h"
52599 +#include "../../tree.h"
52600 +#include "../../inode.h"
52601 +
52602 +#include <linux/fs.h> /* for struct inode */
52603 +#include <linux/dcache.h> /* for struct dentry */
52604 +#include <linux/quotaops.h>
52605 +
52606 +/* ->extract_key() method of simple directory item plugin. */
52607 +int extract_key_de(const coord_t * coord /* coord of item */ ,
52608 + reiser4_key * key /* resulting key */ )
52609 +{
52610 + directory_entry_format *dent;
52611 +
52612 + assert("nikita-1458", coord != NULL);
52613 + assert("nikita-1459", key != NULL);
52614 +
52615 + dent = (directory_entry_format *) item_body_by_coord(coord);
52616 + assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
52617 + return extract_key_from_id(&dent->id, key);
52618 +}
52619 +
52620 +int
52621 +update_key_de(const coord_t * coord, const reiser4_key * key,
52622 + lock_handle * lh UNUSED_ARG)
52623 +{
52624 + directory_entry_format *dent;
52625 + obj_key_id obj_id;
52626 + int result;
52627 +
52628 + assert("nikita-2342", coord != NULL);
52629 + assert("nikita-2343", key != NULL);
52630 +
52631 + dent = (directory_entry_format *) item_body_by_coord(coord);
52632 + result = build_obj_key_id(key, &obj_id);
52633 + if (result == 0) {
52634 + dent->id = obj_id;
52635 + znode_make_dirty(coord->node);
52636 + }
52637 + return 0;
52638 +}
52639 +
52640 +char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
52641 + char *buf)
52642 +{
52643 + reiser4_key key;
52644 +
52645 + unit_key_by_coord(coord, &key);
52646 + if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
52647 + reiser4_print_address("oops", znode_get_block(coord->node));
52648 + if (!is_longname_key(&key)) {
52649 + if (is_dot_key(&key))
52650 + return (char *)".";
52651 + else
52652 + return extract_name_from_key(&key, buf);
52653 + } else
52654 + return (char *)dent->name;
52655 +}
52656 +
52657 +/* ->extract_name() method of simple directory item plugin. */
52658 +char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
52659 +{
52660 + directory_entry_format *dent;
52661 +
52662 + assert("nikita-1460", coord != NULL);
52663 +
52664 + dent = (directory_entry_format *) item_body_by_coord(coord);
52665 + return extract_dent_name(coord, dent, buf);
52666 +}
52667 +
52668 +/* ->extract_file_type() method of simple directory item plugin. */
52669 +unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
52670 + * item */ )
52671 +{
52672 + assert("nikita-1764", coord != NULL);
52673 + /* we don't store file type in the directory entry yet.
52674 +
52675 + But see comments at kassign.h:obj_key_id
52676 + */
52677 + return DT_UNKNOWN;
52678 +}
52679 +
52680 +int add_entry_de(struct inode *dir /* directory of item */ ,
52681 + coord_t * coord /* coord of item */ ,
52682 + lock_handle * lh /* insertion lock handle */ ,
52683 + const struct dentry *de /* name to add */ ,
52684 + reiser4_dir_entry_desc * entry /* parameters of new directory
52685 + * entry */ )
52686 +{
52687 + reiser4_item_data data;
52688 + directory_entry_format *dent;
52689 + int result;
52690 + const char *name;
52691 + int len;
52692 + int longname;
52693 +
52694 + name = de->d_name.name;
52695 + len = de->d_name.len;
52696 + assert("nikita-1163", strlen(name) == len);
52697 +
52698 + longname = is_longname(name, len);
52699 +
52700 + data.length = sizeof *dent;
52701 + if (longname)
52702 + data.length += len + 1;
52703 + data.data = NULL;
52704 + data.user = 0;
52705 + data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
52706 +
52707 + /* NOTE-NIKITA quota plugin */
52708 + if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
52709 + return -EDQUOT;
52710 +
52711 + result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
52712 + if (result != 0)
52713 + return result;
52714 +
52715 + dent = (directory_entry_format *) item_body_by_coord(coord);
52716 + build_inode_key_id(entry->obj, &dent->id);
52717 + if (longname) {
52718 + memcpy(dent->name, name, len);
52719 + put_unaligned(0, &dent->name[len]);
52720 + }
52721 + return 0;
52722 +}
52723 +
52724 +int rem_entry_de(struct inode *dir /* directory of item */ ,
52725 + const struct qstr *name UNUSED_ARG,
52726 + coord_t * coord /* coord of item */ ,
52727 + lock_handle * lh UNUSED_ARG /* lock handle for
52728 + * removal */ ,
52729 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
52730 + * directory entry
52731 + * being removed */ )
52732 +{
52733 + coord_t shadow;
52734 + int result;
52735 + int length;
52736 +
52737 + length = item_length_by_coord(coord);
52738 + if (inode_get_bytes(dir) < length) {
52739 + warning("nikita-2627", "Dir is broke: %llu: %llu",
52740 + (unsigned long long)get_inode_oid(dir),
52741 + inode_get_bytes(dir));
52742 +
52743 + return RETERR(-EIO);
52744 + }
52745 +
52746 + /* cut_node() is supposed to take pointers to _different_
52747 + coords, because it will modify them without respect to
52748 + possible aliasing. To work around this, create temporary copy
52749 + of @coord.
52750 + */
52751 + coord_dup(&shadow, coord);
52752 + result =
52753 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
52754 + if (result == 0) {
52755 + /* NOTE-NIKITA quota plugin */
52756 + DQUOT_FREE_SPACE_NODIRTY(dir, length);
52757 + }
52758 + return result;
52759 +}
52760 +
52761 +int max_name_len_de(const struct inode *dir)
52762 +{
52763 + return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
52764 + sizeof(directory_entry_format) - 2;
52765 +}
52766 +
52767 +/* Make Linus happy.
52768 + Local variables:
52769 + c-indentation-style: "K&R"
52770 + mode-name: "LC"
52771 + c-basic-offset: 8
52772 + tab-width: 8
52773 + fill-column: 120
52774 + End:
52775 +*/
52776 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/sde.h linux-2.6.20/fs/reiser4/plugin/item/sde.h
52777 --- linux-2.6.20.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 03:00:00.000000000 +0300
52778 +++ linux-2.6.20/fs/reiser4/plugin/item/sde.h 2007-05-06 14:50:43.819013220 +0400
52779 @@ -0,0 +1,66 @@
52780 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52781 +
52782 +/* Directory entry. */
52783 +
52784 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
52785 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
52786 +
52787 +#include "../../forward.h"
52788 +#include "../../dformat.h"
52789 +#include "../../kassign.h"
52790 +#include "../../key.h"
52791 +
52792 +#include <linux/fs.h>
52793 +#include <linux/dcache.h> /* for struct dentry */
52794 +
52795 +typedef struct directory_entry_format {
52796 + /* key of object stat-data. It's not necessary to store whole
52797 + key here, because it's always key of stat-data, so minor
52798 + packing locality and offset can be omitted here. But this
52799 + relies on particular key allocation scheme for stat-data, so,
52800 + for extensibility sake, whole key can be stored here.
52801 +
52802 + We store key as array of bytes, because we don't want 8-byte
52803 + alignment of dir entries.
52804 + */
52805 + obj_key_id id;
52806 + /* file name. Null terminated string. */
52807 + d8 name[0];
52808 +} directory_entry_format;
52809 +
52810 +void print_de(const char *prefix, coord_t * coord);
52811 +int extract_key_de(const coord_t * coord, reiser4_key * key);
52812 +int update_key_de(const coord_t * coord, const reiser4_key * key,
52813 + lock_handle * lh);
52814 +char *extract_name_de(const coord_t * coord, char *buf);
52815 +unsigned extract_file_type_de(const coord_t * coord);
52816 +int add_entry_de(struct inode *dir, coord_t * coord,
52817 + lock_handle * lh, const struct dentry *name,
52818 + reiser4_dir_entry_desc * entry);
52819 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
52820 + lock_handle * lh, reiser4_dir_entry_desc * entry);
52821 +int max_name_len_de(const struct inode *dir);
52822 +
52823 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
52824 +
52825 +char *extract_dent_name(const coord_t * coord,
52826 + directory_entry_format * dent, char *buf);
52827 +
52828 +#if REISER4_LARGE_KEY
52829 +#define DE_NAME_BUF_LEN (24)
52830 +#else
52831 +#define DE_NAME_BUF_LEN (16)
52832 +#endif
52833 +
52834 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
52835 +#endif
52836 +
52837 +/* Make Linus happy.
52838 + Local variables:
52839 + c-indentation-style: "K&R"
52840 + mode-name: "LC"
52841 + c-basic-offset: 8
52842 + tab-width: 8
52843 + fill-column: 120
52844 + End:
52845 +*/
52846 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.20/fs/reiser4/plugin/item/static_stat.c
52847 --- linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 03:00:00.000000000 +0300
52848 +++ linux-2.6.20/fs/reiser4/plugin/item/static_stat.c 2007-05-06 14:50:43.823014469 +0400
52849 @@ -0,0 +1,1107 @@
52850 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52851 +
52852 +/* stat data manipulation. */
52853 +
52854 +#include "../../forward.h"
52855 +#include "../../super.h"
52856 +#include "../../vfs_ops.h"
52857 +#include "../../inode.h"
52858 +#include "../../debug.h"
52859 +#include "../../dformat.h"
52860 +#include "../object.h"
52861 +#include "../plugin.h"
52862 +#include "../plugin_header.h"
52863 +#include "static_stat.h"
52864 +#include "item.h"
52865 +
52866 +#include <linux/types.h>
52867 +#include <linux/fs.h>
52868 +
52869 +/* see static_stat.h for explanation */
52870 +
52871 +/* helper function used while we are dumping/loading inode/plugin state
52872 + to/from the stat-data. */
52873 +
52874 +static void move_on(int *length /* space remaining in stat-data */ ,
52875 + char **area /* current coord in stat data */ ,
52876 + int size_of /* how many bytes to move forward */ )
52877 +{
52878 + assert("nikita-615", length != NULL);
52879 + assert("nikita-616", area != NULL);
52880 +
52881 + *length -= size_of;
52882 + *area += size_of;
52883 +
52884 + assert("nikita-617", *length >= 0);
52885 +}
52886 +
52887 +/* helper function used while loading inode/plugin state from stat-data.
52888 + Complain if there is less space in stat-data than was expected.
52889 + Can only happen on disk corruption. */
52890 +static int not_enough_space(struct inode *inode /* object being processed */ ,
52891 + const char *where /* error message */ )
52892 +{
52893 + assert("nikita-618", inode != NULL);
52894 +
52895 + warning("nikita-619", "Not enough space in %llu while loading %s",
52896 + (unsigned long long)get_inode_oid(inode), where);
52897 +
52898 + return RETERR(-EINVAL);
52899 +}
52900 +
52901 +/* helper function used while loading inode/plugin state from
52902 + stat-data. Call it if invalid plugin id was found. */
52903 +static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
52904 + struct inode *inode /* object being processed */ )
52905 +{
52906 + warning("nikita-620", "Unknown plugin %i in %llu",
52907 + id, (unsigned long long)get_inode_oid(inode));
52908 +
52909 + return RETERR(-EINVAL);
52910 +}
52911 +
52912 +/* this is installed as ->init_inode() method of
52913 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
52914 + Copies data from on-disk stat-data format into inode.
52915 + Handles stat-data extensions. */
52916 +/* was sd_load */
52917 +int init_inode_static_sd(struct inode *inode /* object being processed */ ,
52918 + char *sd /* stat-data body */ ,
52919 + int len /* length of stat-data */ )
52920 +{
52921 + int result;
52922 + int bit;
52923 + int chunk;
52924 + __u16 mask;
52925 + __u64 bigmask;
52926 + reiser4_stat_data_base *sd_base;
52927 + reiser4_inode *state;
52928 +
52929 + assert("nikita-625", inode != NULL);
52930 + assert("nikita-626", sd != NULL);
52931 +
52932 + result = 0;
52933 + sd_base = (reiser4_stat_data_base *) sd;
52934 + state = reiser4_inode_data(inode);
52935 + mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
52936 + bigmask = mask;
52937 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
52938 +
52939 + move_on(&len, &sd, sizeof *sd_base);
52940 + for (bit = 0, chunk = 0;
52941 + mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
52942 + ++bit, mask >>= 1) {
52943 + if (((bit + 1) % 16) != 0) {
52944 + /* handle extension */
52945 + sd_ext_plugin *sdplug;
52946 +
52947 + if (bit >= LAST_SD_EXTENSION) {
52948 + warning("vpf-1904",
52949 + "No such extension %i in inode %llu",
52950 + bit,
52951 + (unsigned long long)
52952 + get_inode_oid(inode));
52953 +
52954 + result = RETERR(-EINVAL);
52955 + break;
52956 + }
52957 +
52958 + sdplug = sd_ext_plugin_by_id(bit);
52959 + if (sdplug == NULL) {
52960 + warning("nikita-627",
52961 + "No such extension %i in inode %llu",
52962 + bit,
52963 + (unsigned long long)
52964 + get_inode_oid(inode));
52965 +
52966 + result = RETERR(-EINVAL);
52967 + break;
52968 + }
52969 + if (mask & 1) {
52970 + assert("nikita-628", sdplug->present);
52971 + /* alignment is not supported in node layout
52972 + plugin yet.
52973 + result = align( inode, &len, &sd,
52974 + sdplug -> alignment );
52975 + if( result != 0 )
52976 + return result; */
52977 + result = sdplug->present(inode, &sd, &len);
52978 + } else if (sdplug->absent != NULL)
52979 + result = sdplug->absent(inode);
52980 + if (result)
52981 + break;
52982 + /* else, we are looking at the last bit in 16-bit
52983 + portion of bitmask */
52984 + } else if (mask & 1) {
52985 + /* next portion of bitmask */
52986 + if (len < (int)sizeof(d16)) {
52987 + warning("nikita-629",
52988 + "No space for bitmap in inode %llu",
52989 + (unsigned long long)
52990 + get_inode_oid(inode));
52991 +
52992 + result = RETERR(-EINVAL);
52993 + break;
52994 + }
52995 + mask = le16_to_cpu(get_unaligned((d16 *)sd));
52996 + bigmask <<= 16;
52997 + bigmask |= mask;
52998 + move_on(&len, &sd, sizeof(d16));
52999 + ++chunk;
53000 + if (chunk == 3) {
53001 + if (!(mask & 0x8000)) {
53002 + /* clear last bit */
53003 + mask &= ~0x8000;
53004 + continue;
53005 + }
53006 + /* too much */
53007 + warning("nikita-630",
53008 + "Too many extensions in %llu",
53009 + (unsigned long long)
53010 + get_inode_oid(inode));
53011 +
53012 + result = RETERR(-EINVAL);
53013 + break;
53014 + }
53015 + } else
53016 + /* bitmask exhausted */
53017 + break;
53018 + }
53019 + state->extmask = bigmask;
53020 + /* common initialisations */
53021 + if (len - (bit / 16 * sizeof(d16)) > 0) {
53022 + /* alignment in save_len_static_sd() is taken into account
53023 + -edward */
53024 + warning("nikita-631", "unused space in inode %llu",
53025 + (unsigned long long)get_inode_oid(inode));
53026 + }
53027 +
53028 + return result;
53029 +}
53030 +
53031 +/* estimates size of stat-data required to store inode.
53032 + Installed as ->save_len() method of
53033 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53034 +/* was sd_len */
53035 +int save_len_static_sd(struct inode *inode /* object being processed */ )
53036 +{
53037 + unsigned int result;
53038 + __u64 mask;
53039 + int bit;
53040 +
53041 + assert("nikita-632", inode != NULL);
53042 +
53043 + result = sizeof(reiser4_stat_data_base);
53044 + mask = reiser4_inode_data(inode)->extmask;
53045 + for (bit = 0; mask != 0; ++bit, mask >>= 1) {
53046 + if (mask & 1) {
53047 + sd_ext_plugin *sdplug;
53048 +
53049 + sdplug = sd_ext_plugin_by_id(bit);
53050 + assert("nikita-633", sdplug != NULL);
53051 + /* no aligment support
53052 + result +=
53053 + round_up( result, sdplug -> alignment ) - result; */
53054 + result += sdplug->save_len(inode);
53055 + }
53056 + }
53057 + result += bit / 16 * sizeof(d16);
53058 + return result;
53059 +}
53060 +
53061 +/* saves inode into stat-data.
53062 + Installed as ->save() method of
53063 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53064 +/* was sd_save */
53065 +int save_static_sd(struct inode *inode /* object being processed */ ,
53066 + char **area /* where to save stat-data */ )
53067 +{
53068 + int result;
53069 + __u64 emask;
53070 + int bit;
53071 + unsigned int len;
53072 + reiser4_stat_data_base *sd_base;
53073 +
53074 + assert("nikita-634", inode != NULL);
53075 + assert("nikita-635", area != NULL);
53076 +
53077 + result = 0;
53078 + emask = reiser4_inode_data(inode)->extmask;
53079 + sd_base = (reiser4_stat_data_base *) * area;
53080 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
53081 + /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
53082 +
53083 + *area += sizeof *sd_base;
53084 + len = 0xffffffffu;
53085 + for (bit = 0; emask != 0; ++bit, emask >>= 1) {
53086 + if (emask & 1) {
53087 + if ((bit + 1) % 16 != 0) {
53088 + sd_ext_plugin *sdplug;
53089 + sdplug = sd_ext_plugin_by_id(bit);
53090 + assert("nikita-636", sdplug != NULL);
53091 + /* no alignment support yet
53092 + align( inode, &len, area,
53093 + sdplug -> alignment ); */
53094 + result = sdplug->save(inode, area);
53095 + if (result)
53096 + break;
53097 + } else {
53098 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
53099 + (d16 *)(*area));
53100 + /*cputod16((unsigned)(emask & 0xffff),
53101 + (d16 *) * area);*/
53102 + *area += sizeof(d16);
53103 + }
53104 + }
53105 + }
53106 + return result;
53107 +}
53108 +
53109 +/* stat-data extension handling functions. */
53110 +
53111 +static int present_lw_sd(struct inode *inode /* object being processed */ ,
53112 + char **area /* position in stat-data */ ,
53113 + int *len /* remaining length */ )
53114 +{
53115 + if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
53116 + reiser4_light_weight_stat *sd_lw;
53117 +
53118 + sd_lw = (reiser4_light_weight_stat *) * area;
53119 +
53120 + inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
53121 + inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
53122 + inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
53123 + if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
53124 + inode->i_mode &= ~S_IFIFO;
53125 + warning("", "partially converted file is encountered");
53126 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
53127 + }
53128 + move_on(len, area, sizeof *sd_lw);
53129 + return 0;
53130 + } else
53131 + return not_enough_space(inode, "lw sd");
53132 +}
53133 +
53134 +static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
53135 + * processed */ )
53136 +{
53137 + return sizeof(reiser4_light_weight_stat);
53138 +}
53139 +
53140 +static int save_lw_sd(struct inode *inode /* object being processed */ ,
53141 + char **area /* position in stat-data */ )
53142 +{
53143 + reiser4_light_weight_stat *sd;
53144 + mode_t delta;
53145 +
53146 + assert("nikita-2705", inode != NULL);
53147 + assert("nikita-2706", area != NULL);
53148 + assert("nikita-2707", *area != NULL);
53149 +
53150 + sd = (reiser4_light_weight_stat *) * area;
53151 +
53152 + delta = (reiser4_inode_get_flag(inode,
53153 + REISER4_PART_MIXED) ? S_IFIFO : 0);
53154 + put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
53155 + put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
53156 + put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
53157 + *area += sizeof *sd;
53158 + return 0;
53159 +}
53160 +
53161 +static int present_unix_sd(struct inode *inode /* object being processed */ ,
53162 + char **area /* position in stat-data */ ,
53163 + int *len /* remaining length */ )
53164 +{
53165 + assert("nikita-637", inode != NULL);
53166 + assert("nikita-638", area != NULL);
53167 + assert("nikita-639", *area != NULL);
53168 + assert("nikita-640", len != NULL);
53169 + assert("nikita-641", *len > 0);
53170 +
53171 + if (*len >= (int)sizeof(reiser4_unix_stat)) {
53172 + reiser4_unix_stat *sd;
53173 +
53174 + sd = (reiser4_unix_stat *) * area;
53175 +
53176 + inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
53177 + inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
53178 + inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
53179 + inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
53180 + inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
53181 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53182 + inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
53183 + else
53184 + inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
53185 + move_on(len, area, sizeof *sd);
53186 + return 0;
53187 + } else
53188 + return not_enough_space(inode, "unix sd");
53189 +}
53190 +
53191 +static int absent_unix_sd(struct inode *inode /* object being processed */ )
53192 +{
53193 + inode->i_uid = get_super_private(inode->i_sb)->default_uid;
53194 + inode->i_gid = get_super_private(inode->i_sb)->default_gid;
53195 + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
53196 + inode_set_bytes(inode, inode->i_size);
53197 + /* mark inode as lightweight, so that caller (lookup_common) will
53198 + complete initialisation by copying [ug]id from a parent. */
53199 + reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
53200 + return 0;
53201 +}
53202 +
53203 +/* Audited by: green(2002.06.14) */
53204 +static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
53205 + * processed */ )
53206 +{
53207 + return sizeof(reiser4_unix_stat);
53208 +}
53209 +
53210 +static int save_unix_sd(struct inode *inode /* object being processed */ ,
53211 + char **area /* position in stat-data */ )
53212 +{
53213 + reiser4_unix_stat *sd;
53214 +
53215 + assert("nikita-642", inode != NULL);
53216 + assert("nikita-643", area != NULL);
53217 + assert("nikita-644", *area != NULL);
53218 +
53219 + sd = (reiser4_unix_stat *) * area;
53220 + put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
53221 + put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
53222 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
53223 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
53224 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
53225 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53226 + put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
53227 + else
53228 + put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
53229 + *area += sizeof *sd;
53230 + return 0;
53231 +}
53232 +
53233 +static int
53234 +present_large_times_sd(struct inode *inode /* object being processed */ ,
53235 + char **area /* position in stat-data */ ,
53236 + int *len /* remaining length */ )
53237 +{
53238 + if (*len >= (int)sizeof(reiser4_large_times_stat)) {
53239 + reiser4_large_times_stat *sd_lt;
53240 +
53241 + sd_lt = (reiser4_large_times_stat *) * area;
53242 +
53243 + inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
53244 + inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
53245 + inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
53246 +
53247 + move_on(len, area, sizeof *sd_lt);
53248 + return 0;
53249 + } else
53250 + return not_enough_space(inode, "large times sd");
53251 +}
53252 +
53253 +static int
53254 +save_len_large_times_sd(struct inode *inode UNUSED_ARG
53255 + /* object being processed */ )
53256 +{
53257 + return sizeof(reiser4_large_times_stat);
53258 +}
53259 +
53260 +static int
53261 +save_large_times_sd(struct inode *inode /* object being processed */ ,
53262 + char **area /* position in stat-data */ )
53263 +{
53264 + reiser4_large_times_stat *sd;
53265 +
53266 + assert("nikita-2817", inode != NULL);
53267 + assert("nikita-2818", area != NULL);
53268 + assert("nikita-2819", *area != NULL);
53269 +
53270 + sd = (reiser4_large_times_stat *) * area;
53271 +
53272 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
53273 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
53274 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
53275 +
53276 + *area += sizeof *sd;
53277 + return 0;
53278 +}
53279 +
53280 +/* symlink stat data extension */
53281 +
53282 +/* allocate memory for symlink target and attach it to inode->i_private */
53283 +static int
53284 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
53285 +{
53286 + assert("vs-845", inode->i_private == NULL);
53287 + assert("vs-846", !reiser4_inode_get_flag(inode,
53288 + REISER4_GENERIC_PTR_USED));
53289 + /* FIXME-VS: this is prone to deadlock. Not more than other similar
53290 + places, though */
53291 + inode->i_private = kmalloc((size_t) len + 1,
53292 + reiser4_ctx_gfp_mask_get());
53293 + if (!inode->i_private)
53294 + return RETERR(-ENOMEM);
53295 +
53296 + memcpy((char *)(inode->i_private), target, (size_t) len);
53297 + ((char *)(inode->i_private))[len] = 0;
53298 + reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
53299 + return 0;
53300 +}
53301 +
53302 +/* this is called on read_inode. There is nothing to do actually, but some
53303 + sanity checks */
53304 +static int present_symlink_sd(struct inode *inode, char **area, int *len)
53305 +{
53306 + int result;
53307 + int length;
53308 + reiser4_symlink_stat *sd;
53309 +
53310 + length = (int)inode->i_size;
53311 + /*
53312 + * *len is number of bytes in stat data item from *area to the end of
53313 + * item. It must be not less than size of symlink + 1 for ending 0
53314 + */
53315 + if (length > *len)
53316 + return not_enough_space(inode, "symlink");
53317 +
53318 + if (*(*area + length) != 0) {
53319 + warning("vs-840", "Symlink is not zero terminated");
53320 + return RETERR(-EIO);
53321 + }
53322 +
53323 + sd = (reiser4_symlink_stat *) * area;
53324 + result = symlink_target_to_inode(inode, sd->body, length);
53325 +
53326 + move_on(len, area, length + 1);
53327 + return result;
53328 +}
53329 +
53330 +static int save_len_symlink_sd(struct inode *inode)
53331 +{
53332 + return inode->i_size + 1;
53333 +}
53334 +
53335 +/* this is called on create and update stat data. Do nothing on update but
53336 + update @area */
53337 +static int save_symlink_sd(struct inode *inode, char **area)
53338 +{
53339 + int result;
53340 + int length;
53341 + reiser4_symlink_stat *sd;
53342 +
53343 + length = (int)inode->i_size;
53344 + /* inode->i_size must be set already */
53345 + assert("vs-841", length);
53346 +
53347 + result = 0;
53348 + sd = (reiser4_symlink_stat *) * area;
53349 + if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
53350 + const char *target;
53351 +
53352 + target = (const char *)(inode->i_private);
53353 + inode->i_private = NULL;
53354 +
53355 + result = symlink_target_to_inode(inode, target, length);
53356 +
53357 + /* copy symlink to stat data */
53358 + memcpy(sd->body, target, (size_t) length);
53359 + (*area)[length] = 0;
53360 + } else {
53361 + /* there is nothing to do in update but move area */
53362 + assert("vs-844",
53363 + !memcmp(inode->i_private, sd->body,
53364 + (size_t) length + 1));
53365 + }
53366 +
53367 + *area += (length + 1);
53368 + return result;
53369 +}
53370 +
53371 +static int present_flags_sd(struct inode *inode /* object being processed */ ,
53372 + char **area /* position in stat-data */ ,
53373 + int *len /* remaining length */ )
53374 +{
53375 + assert("nikita-645", inode != NULL);
53376 + assert("nikita-646", area != NULL);
53377 + assert("nikita-647", *area != NULL);
53378 + assert("nikita-648", len != NULL);
53379 + assert("nikita-649", *len > 0);
53380 +
53381 + if (*len >= (int)sizeof(reiser4_flags_stat)) {
53382 + reiser4_flags_stat *sd;
53383 +
53384 + sd = (reiser4_flags_stat *) * area;
53385 + inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
53386 + move_on(len, area, sizeof *sd);
53387 + return 0;
53388 + } else
53389 + return not_enough_space(inode, "generation and attrs");
53390 +}
53391 +
53392 +/* Audited by: green(2002.06.14) */
53393 +static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
53394 + * processed */ )
53395 +{
53396 + return sizeof(reiser4_flags_stat);
53397 +}
53398 +
53399 +static int save_flags_sd(struct inode *inode /* object being processed */ ,
53400 + char **area /* position in stat-data */ )
53401 +{
53402 + reiser4_flags_stat *sd;
53403 +
53404 + assert("nikita-650", inode != NULL);
53405 + assert("nikita-651", area != NULL);
53406 + assert("nikita-652", *area != NULL);
53407 +
53408 + sd = (reiser4_flags_stat *) * area;
53409 + put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
53410 + *area += sizeof *sd;
53411 + return 0;
53412 +}
53413 +
53414 +static int absent_plugin_sd(struct inode *inode);
53415 +static int present_plugin_sd(struct inode *inode /* object being processed */ ,
53416 + char **area /* position in stat-data */ ,
53417 + int *len /* remaining length */,
53418 + int is_pset /* 1 if plugin set, 0 if heir set. */)
53419 +{
53420 + reiser4_plugin_stat *sd;
53421 + reiser4_plugin *plugin;
53422 + reiser4_inode *info;
53423 + int i;
53424 + __u16 mask;
53425 + int result;
53426 + int num_of_plugins;
53427 +
53428 + assert("nikita-653", inode != NULL);
53429 + assert("nikita-654", area != NULL);
53430 + assert("nikita-655", *area != NULL);
53431 + assert("nikita-656", len != NULL);
53432 + assert("nikita-657", *len > 0);
53433 +
53434 + if (*len < (int)sizeof(reiser4_plugin_stat))
53435 + return not_enough_space(inode, "plugin");
53436 +
53437 + sd = (reiser4_plugin_stat *) * area;
53438 + info = reiser4_inode_data(inode);
53439 +
53440 + mask = 0;
53441 + num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
53442 + move_on(len, area, sizeof *sd);
53443 + result = 0;
53444 + for (i = 0; i < num_of_plugins; ++i) {
53445 + reiser4_plugin_slot *slot;
53446 + reiser4_plugin_type type;
53447 + pset_member memb;
53448 +
53449 + slot = (reiser4_plugin_slot *) * area;
53450 + if (*len < (int)sizeof *slot)
53451 + return not_enough_space(inode, "additional plugin");
53452 +
53453 + memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
53454 + type = aset_member_to_type_unsafe(memb);
53455 +
53456 + if (type == REISER4_PLUGIN_TYPES) {
53457 + warning("nikita-3502",
53458 + "wrong %s member (%i) for %llu", is_pset ?
53459 + "pset" : "hset", memb,
53460 + (unsigned long long)get_inode_oid(inode));
53461 + return RETERR(-EINVAL);
53462 + }
53463 + plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
53464 + type, &slot->id);
53465 + if (plugin == NULL)
53466 + return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
53467 +
53468 + /* plugin is loaded into inode, mark this into inode's
53469 + bitmask of loaded non-standard plugins */
53470 + if (!(mask & (1 << memb))) {
53471 + mask |= (1 << memb);
53472 + } else {
53473 + warning("nikita-658", "duplicate plugin for %llu",
53474 + (unsigned long long)get_inode_oid(inode));
53475 + return RETERR(-EINVAL);
53476 + }
53477 + move_on(len, area, sizeof *slot);
53478 + /* load plugin data, if any */
53479 + if (plugin->h.pops != NULL && plugin->h.pops->load)
53480 + result = plugin->h.pops->load(inode, plugin, area, len);
53481 + else
53482 + result = aset_set_unsafe(is_pset ? &info->pset :
53483 + &info->hset, memb, plugin);
53484 + if (result)
53485 + return result;
53486 + }
53487 + if (is_pset) {
53488 + /* if object plugin wasn't loaded from stat-data, guess it by
53489 + mode bits */
53490 + plugin = file_plugin_to_plugin(inode_file_plugin(inode));
53491 + if (plugin == NULL)
53492 + result = absent_plugin_sd(inode);
53493 + info->plugin_mask = mask;
53494 + } else
53495 + info->heir_mask = mask;
53496 +
53497 + return result;
53498 +}
53499 +
53500 +static int present_pset_sd(struct inode *inode, char **area, int *len) {
53501 + return present_plugin_sd(inode, area, len, 1 /* pset */);
53502 +}
53503 +
53504 +/* Determine object plugin for @inode based on i_mode.
53505 +
53506 + Many objects in reiser4 file system are controlled by standard object
53507 + plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
53508 +
53509 + For such files we don't explicitly store plugin id in object stat
53510 + data. Rather required plugin is guessed from mode bits, where file "type"
53511 + is encoded (see stat(2)).
53512 +*/
53513 +static int
53514 +guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
53515 +{
53516 + int fplug_id;
53517 + int dplug_id;
53518 + reiser4_inode *info;
53519 +
53520 + assert("nikita-736", inode != NULL);
53521 +
53522 + dplug_id = fplug_id = -1;
53523 +
53524 + switch (inode->i_mode & S_IFMT) {
53525 + case S_IFSOCK:
53526 + case S_IFBLK:
53527 + case S_IFCHR:
53528 + case S_IFIFO:
53529 + fplug_id = SPECIAL_FILE_PLUGIN_ID;
53530 + break;
53531 + case S_IFLNK:
53532 + fplug_id = SYMLINK_FILE_PLUGIN_ID;
53533 + break;
53534 + case S_IFDIR:
53535 + fplug_id = DIRECTORY_FILE_PLUGIN_ID;
53536 + dplug_id = HASHED_DIR_PLUGIN_ID;
53537 + break;
53538 + default:
53539 + warning("nikita-737", "wrong file mode: %o", inode->i_mode);
53540 + return RETERR(-EIO);
53541 + case S_IFREG:
53542 + fplug_id = UNIX_FILE_PLUGIN_ID;
53543 + break;
53544 + }
53545 + info = reiser4_inode_data(inode);
53546 + set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
53547 + plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
53548 + set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
53549 + plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
53550 + return 0;
53551 +}
53552 +
53553 +/* Audited by: green(2002.06.14) */
53554 +static int absent_plugin_sd(struct inode *inode /* object being processed */ )
53555 +{
53556 + int result;
53557 +
53558 + assert("nikita-659", inode != NULL);
53559 +
53560 + result = guess_plugin_by_mode(inode);
53561 + /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
53562 + but setup_inode_ops() will call make_bad_inode().
53563 + Another, more logical but bit more complex solution is to add
53564 + "bad-file plugin". */
53565 + /* FIXME-VS: activate was called here */
53566 + return result;
53567 +}
53568 +
53569 +/* helper function for plugin_sd_save_len(): calculate how much space
53570 + required to save state of given plugin */
53571 +/* Audited by: green(2002.06.14) */
53572 +static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
53573 + struct inode *inode /* object being processed */ ,
53574 + pset_member memb,
53575 + int len, int is_pset)
53576 +{
53577 + reiser4_inode *info;
53578 + assert("nikita-661", inode != NULL);
53579 +
53580 + if (plugin == NULL)
53581 + return len;
53582 +
53583 + info = reiser4_inode_data(inode);
53584 + if (is_pset ?
53585 + info->plugin_mask & (1 << memb) :
53586 + info->heir_mask & (1 << memb)) {
53587 + len += sizeof(reiser4_plugin_slot);
53588 + if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
53589 + /* non-standard plugin, call method */
53590 + /* commented as it is incompatible with alignment
53591 + * policy in save_plug() -edward */
53592 + /* len = round_up(len, plugin->h.pops->alignment); */
53593 + len += plugin->h.pops->save_len(inode, plugin);
53594 + }
53595 + }
53596 + return len;
53597 +}
53598 +
53599 +/* calculate how much space is required to save state of all plugins,
53600 + associated with inode */
53601 +static int save_len_plugin_sd(struct inode *inode /* object being processed */,
53602 + int is_pset)
53603 +{
53604 + int len;
53605 + int last;
53606 + reiser4_inode *state;
53607 + pset_member memb;
53608 +
53609 + assert("nikita-663", inode != NULL);
53610 +
53611 + state = reiser4_inode_data(inode);
53612 +
53613 + /* common case: no non-standard plugins */
53614 + if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
53615 + return 0;
53616 + len = sizeof(reiser4_plugin_stat);
53617 + last = PSET_LAST;
53618 +
53619 + for (memb = 0; memb < last; ++memb) {
53620 + len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
53621 + inode, memb, len, is_pset);
53622 + }
53623 + assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
53624 + return len;
53625 +}
53626 +
53627 +static int save_len_pset_sd(struct inode *inode) {
53628 + return save_len_plugin_sd(inode, 1 /* pset */);
53629 +}
53630 +
53631 +/* helper function for plugin_sd_save(): save plugin, associated with
53632 + inode. */
53633 +static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
53634 + struct inode *inode /* object being processed */ ,
53635 + int memb /* what element of pset is saved */ ,
53636 + char **area /* position in stat-data */ ,
53637 + int *count /* incremented if plugin were actually saved. */,
53638 + int is_pset /* 1 for plugin set, 0 for heir set */)
53639 +{
53640 + reiser4_plugin_slot *slot;
53641 + int fake_len;
53642 + int result;
53643 +
53644 + assert("nikita-665", inode != NULL);
53645 + assert("nikita-666", area != NULL);
53646 + assert("nikita-667", *area != NULL);
53647 +
53648 + if (plugin == NULL)
53649 + return 0;
53650 +
53651 + if (is_pset ?
53652 + !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
53653 + !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
53654 + return 0;
53655 + slot = (reiser4_plugin_slot *) * area;
53656 + put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
53657 + put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
53658 + fake_len = (int)0xffff;
53659 + move_on(&fake_len, area, sizeof *slot);
53660 + ++*count;
53661 + result = 0;
53662 + if (plugin->h.pops != NULL) {
53663 + if (plugin->h.pops->save != NULL)
53664 + result = plugin->h.pops->save(inode, plugin, area);
53665 + }
53666 + return result;
53667 +}
53668 +
53669 +/* save state of all non-standard plugins associated with inode */
53670 +static int save_plugin_sd(struct inode *inode /* object being processed */ ,
53671 + char **area /* position in stat-data */,
53672 + int is_pset /* 1 for pset, 0 for hset */)
53673 +{
53674 + int fake_len;
53675 + int result = 0;
53676 + int num_of_plugins;
53677 + reiser4_plugin_stat *sd;
53678 + reiser4_inode *state;
53679 + pset_member memb;
53680 +
53681 + assert("nikita-669", inode != NULL);
53682 + assert("nikita-670", area != NULL);
53683 + assert("nikita-671", *area != NULL);
53684 +
53685 + state = reiser4_inode_data(inode);
53686 + if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
53687 + return 0;
53688 + sd = (reiser4_plugin_stat *) * area;
53689 + fake_len = (int)0xffff;
53690 + move_on(&fake_len, area, sizeof *sd);
53691 +
53692 + num_of_plugins = 0;
53693 + for (memb = 0; memb < PSET_LAST; ++memb) {
53694 + result = save_plug(aset_get(is_pset ? state->pset : state->hset,
53695 + memb),
53696 + inode, memb, area, &num_of_plugins, is_pset);
53697 + if (result != 0)
53698 + break;
53699 + }
53700 +
53701 + put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
53702 + return result;
53703 +}
53704 +
53705 +static int save_pset_sd(struct inode *inode, char **area) {
53706 + return save_plugin_sd(inode, area, 1 /* pset */);
53707 +}
53708 +
53709 +static int present_hset_sd(struct inode *inode, char **area, int *len) {
53710 + return present_plugin_sd(inode, area, len, 0 /* hset */);
53711 +}
53712 +
53713 +static int save_len_hset_sd(struct inode *inode) {
53714 + return save_len_plugin_sd(inode, 0 /* pset */);
53715 +}
53716 +
53717 +static int save_hset_sd(struct inode *inode, char **area) {
53718 + return save_plugin_sd(inode, area, 0 /* hset */);
53719 +}
53720 +
53721 +/* helper function for crypto_sd_present(), crypto_sd_save.
53722 + Allocates memory for crypto stat, keyid and attaches it to the inode */
53723 +static int extract_crypto_stat (struct inode * inode,
53724 + reiser4_crypto_stat * sd)
53725 +{
53726 + crypto_stat_t * info;
53727 + assert("edward-11", !inode_crypto_stat(inode));
53728 + assert("edward-1413",
53729 + !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
53730 + /* create and attach a crypto-stat without secret key loaded */
53731 + info = reiser4_alloc_crypto_stat(inode);
53732 + if (IS_ERR(info))
53733 + return PTR_ERR(info);
53734 + info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
53735 + memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
53736 + reiser4_attach_crypto_stat(inode, info);
53737 + reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53738 + return 0;
53739 +}
53740 +
53741 +/* crypto stat-data extension */
53742 +
53743 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
53744 +{
53745 + int result;
53746 + reiser4_crypto_stat *sd;
53747 + digest_plugin *dplug = inode_digest_plugin(inode);
53748 +
53749 + assert("edward-06", dplug != NULL);
53750 + assert("edward-684", dplug->fipsize);
53751 + assert("edward-07", area != NULL);
53752 + assert("edward-08", *area != NULL);
53753 + assert("edward-09", len != NULL);
53754 + assert("edward-10", *len > 0);
53755 +
53756 + if (*len < (int)sizeof(reiser4_crypto_stat)) {
53757 + return not_enough_space(inode, "crypto-sd");
53758 + }
53759 + /* *len is number of bytes in stat data item from *area to the end of
53760 + item. It must be not less than size of this extension */
53761 + assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
53762 +
53763 + sd = (reiser4_crypto_stat *) * area;
53764 + result = extract_crypto_stat(inode, sd);
53765 + move_on(len, area, sizeof(*sd) + dplug->fipsize);
53766 +
53767 + return result;
53768 +}
53769 +
53770 +static int save_len_crypto_sd(struct inode *inode)
53771 +{
53772 + return sizeof(reiser4_crypto_stat) +
53773 + inode_digest_plugin(inode)->fipsize;
53774 +}
53775 +
53776 +static int save_crypto_sd(struct inode *inode, char **area)
53777 +{
53778 + int result = 0;
53779 + reiser4_crypto_stat *sd;
53780 + crypto_stat_t * info = inode_crypto_stat(inode);
53781 + digest_plugin *dplug = inode_digest_plugin(inode);
53782 +
53783 + assert("edward-12", dplug != NULL);
53784 + assert("edward-13", area != NULL);
53785 + assert("edward-14", *area != NULL);
53786 + assert("edward-15", info != NULL);
53787 + assert("edward-1414", info->keyid != NULL);
53788 + assert("edward-1415", info->keysize != 0);
53789 + assert("edward-76", reiser4_inode_data(inode) != NULL);
53790 +
53791 + if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
53792 + /* file is just created */
53793 + sd = (reiser4_crypto_stat *) *area;
53794 + /* copy everything but private key to the disk stat-data */
53795 + put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
53796 + memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
53797 + reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53798 + }
53799 + *area += (sizeof(*sd) + dplug->fipsize);
53800 + return result;
53801 +}
53802 +
53803 +static int eio(struct inode *inode, char **area, int *len)
53804 +{
53805 + return RETERR(-EIO);
53806 +}
53807 +
53808 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
53809 + [LIGHT_WEIGHT_STAT] = {
53810 + .h = {
53811 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53812 + .id = LIGHT_WEIGHT_STAT,
53813 + .pops = NULL,
53814 + .label = "light-weight sd",
53815 + .desc = "sd for light-weight files",
53816 + .linkage = {NULL,NULL}
53817 + },
53818 + .present = present_lw_sd,
53819 + .absent = NULL,
53820 + .save_len = save_len_lw_sd,
53821 + .save = save_lw_sd,
53822 + .alignment = 8
53823 + },
53824 + [UNIX_STAT] = {
53825 + .h = {
53826 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53827 + .id = UNIX_STAT,
53828 + .pops = NULL,
53829 + .label = "unix-sd",
53830 + .desc = "unix stat-data fields",
53831 + .linkage = {NULL,NULL}
53832 + },
53833 + .present = present_unix_sd,
53834 + .absent = absent_unix_sd,
53835 + .save_len = save_len_unix_sd,
53836 + .save = save_unix_sd,
53837 + .alignment = 8
53838 + },
53839 + [LARGE_TIMES_STAT] = {
53840 + .h = {
53841 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53842 + .id = LARGE_TIMES_STAT,
53843 + .pops = NULL,
53844 + .label = "64time-sd",
53845 + .desc = "nanosecond resolution for times",
53846 + .linkage = {NULL,NULL}
53847 + },
53848 + .present = present_large_times_sd,
53849 + .absent = NULL,
53850 + .save_len = save_len_large_times_sd,
53851 + .save = save_large_times_sd,
53852 + .alignment = 8
53853 + },
53854 + [SYMLINK_STAT] = {
53855 + /* stat data of symlink has this extension */
53856 + .h = {
53857 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53858 + .id = SYMLINK_STAT,
53859 + .pops = NULL,
53860 + .label = "symlink-sd",
53861 + .desc =
53862 + "stat data is appended with symlink name",
53863 + .linkage = {NULL,NULL}
53864 + },
53865 + .present = present_symlink_sd,
53866 + .absent = NULL,
53867 + .save_len = save_len_symlink_sd,
53868 + .save = save_symlink_sd,
53869 + .alignment = 8
53870 + },
53871 + [PLUGIN_STAT] = {
53872 + .h = {
53873 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53874 + .id = PLUGIN_STAT,
53875 + .pops = NULL,
53876 + .label = "plugin-sd",
53877 + .desc = "plugin stat-data fields",
53878 + .linkage = {NULL,NULL}
53879 + },
53880 + .present = present_pset_sd,
53881 + .absent = absent_plugin_sd,
53882 + .save_len = save_len_pset_sd,
53883 + .save = save_pset_sd,
53884 + .alignment = 8
53885 + },
53886 + [HEIR_STAT] = {
53887 + .h = {
53888 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53889 + .id = HEIR_STAT,
53890 + .pops = NULL,
53891 + .label = "heir-plugin-sd",
53892 + .desc = "heir plugin stat-data fields",
53893 + .linkage = {NULL,NULL}
53894 + },
53895 + .present = present_hset_sd,
53896 + .absent = NULL,
53897 + .save_len = save_len_hset_sd,
53898 + .save = save_hset_sd,
53899 + .alignment = 8
53900 + },
53901 + [FLAGS_STAT] = {
53902 + .h = {
53903 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53904 + .id = FLAGS_STAT,
53905 + .pops = NULL,
53906 + .label = "flags-sd",
53907 + .desc = "inode bit flags",
53908 + .linkage = {NULL, NULL}
53909 + },
53910 + .present = present_flags_sd,
53911 + .absent = NULL,
53912 + .save_len = save_len_flags_sd,
53913 + .save = save_flags_sd,
53914 + .alignment = 8
53915 + },
53916 + [CAPABILITIES_STAT] = {
53917 + .h = {
53918 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53919 + .id = CAPABILITIES_STAT,
53920 + .pops = NULL,
53921 + .label = "capabilities-sd",
53922 + .desc = "capabilities",
53923 + .linkage = {NULL, NULL}
53924 + },
53925 + .present = eio,
53926 + .absent = NULL,
53927 + .save_len = save_len_flags_sd,
53928 + .save = save_flags_sd,
53929 + .alignment = 8
53930 + },
53931 + [CRYPTO_STAT] = {
53932 + .h = {
53933 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53934 + .id = CRYPTO_STAT,
53935 + .pops = NULL,
53936 + .label = "crypto-sd",
53937 + .desc = "secret key size and id",
53938 + .linkage = {NULL, NULL}
53939 + },
53940 + .present = present_crypto_sd,
53941 + .absent = NULL,
53942 + .save_len = save_len_crypto_sd,
53943 + .save = save_crypto_sd,
53944 + .alignment = 8
53945 + }
53946 +};
53947 +
53948 +/* Make Linus happy.
53949 + Local variables:
53950 + c-indentation-style: "K&R"
53951 + mode-name: "LC"
53952 + c-basic-offset: 8
53953 + tab-width: 8
53954 + fill-column: 120
53955 + End:
53956 +*/
53957 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.20/fs/reiser4/plugin/item/static_stat.h
53958 --- linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 03:00:00.000000000 +0300
53959 +++ linux-2.6.20/fs/reiser4/plugin/item/static_stat.h 2007-05-06 14:50:43.823014469 +0400
53960 @@ -0,0 +1,224 @@
53961 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53962 +
53963 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
53964 +
53965 +In the case where each file has not less than the fields needed by the
53966 +stat() syscall, it is more compact to store those fields in this
53967 +struct.
53968 +
53969 +If this item does not exist, then all stats are dynamically resolved.
53970 +At the moment, we either resolve all stats dynamically or all of them
53971 +statically. If you think this is not fully optimal, and the rest of
53972 +reiser4 is working, then fix it...:-)
53973 +
53974 +*/
53975 +
53976 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
53977 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
53978 +
53979 +#include "../../forward.h"
53980 +#include "../../dformat.h"
53981 +
53982 +#include <linux/fs.h> /* for struct inode */
53983 +
53984 +/* Stat data layout: goals and implementation.
53985 +
53986 + We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
53987 + them, including not having semantic metadata attached to them.
53988 +
53989 + There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
53990 + want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
53991 + sized structure because the statically sized structure knows without recording it what the names and lengths of the
53992 + attributes are.
53993 +
53994 + This leads to a natural compromise, which is to special case those files which have simply the standard unix file
53995 + attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
53996 + file in their use of file attributes.
53997 +
53998 + Yet this compromise deserves to be compromised a little.
53999 +
54000 + We accommodate the case where you have no more than the standard unix file attributes by using an "extension
54001 + bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum).
54002 +
54003 + If the first bit of the extension bitmask bit is 0, we have light-weight file whose attributes are either inherited
54004 + from parent directory (as uid, gid) or initialised to some sane values.
54005 +
54006 + To capitalize on existing code infrastructure, extensions are
54007 + implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
54008 + Each stat-data extension plugin implements four methods:
54009 +
54010 + ->present() called by sd_load() when this extension is found in stat-data
54011 + ->absent() called by sd_load() when this extension is not found in stat-data
54012 + ->save_len() called by sd_len() to calculate total length of stat-data
54013 + ->save() called by sd_save() to store extension data into stat-data
54014 +
54015 + Implementation is in fs/reiser4/plugin/item/static_stat.c
54016 +*/
54017 +
54018 +/* stat-data extension. Please order this by presumed frequency of use */
54019 +typedef enum {
54020 + /* support for light-weight files */
54021 + LIGHT_WEIGHT_STAT,
54022 + /* data required to implement unix stat(2) call. Layout is in
54023 + reiser4_unix_stat. If this is not present, file is light-weight */
54024 + UNIX_STAT,
54025 + /* this contains additional set of 32bit [anc]time fields to implement
54026 + nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
54027 + if this extension is governed by 32bittimes mount option. */
54028 + LARGE_TIMES_STAT,
54029 + /* stat data has link name included */
54030 + SYMLINK_STAT,
54031 + /* on-disk slots of non-standard plugins for main plugin table
54032 + (@reiser4_inode->pset), that is, plugins that cannot be deduced
54033 + from file mode bits), for example, aggregation, interpolation etc. */
54034 + PLUGIN_STAT,
54035 + /* this extension contains persistent inode flags. These flags are
54036 + single bits: immutable, append, only, etc. Layout is in
54037 + reiser4_flags_stat. */
54038 + FLAGS_STAT,
54039 + /* this extension contains capabilities sets, associated with this
54040 + file. Layout is in reiser4_capabilities_stat */
54041 + CAPABILITIES_STAT,
54042 + /* this extension contains size and public id of the secret key.
54043 + Layout is in reiser4_crypto_stat */
54044 + CRYPTO_STAT,
54045 + /* on-disk slots of non-default plugins for inheritance, which
54046 + are extracted to special plugin table (@reiser4_inode->hset).
54047 + By default, children of the object will inherit plugins from
54048 + its main plugin table (pset). */
54049 + HEIR_STAT,
54050 + LAST_SD_EXTENSION,
54051 + /*
54052 + * init_inode_static_sd() iterates over extension mask until all
54053 + * non-zero bits are processed. This means, that neither ->present(),
54054 + * nor ->absent() methods will be called for stat-data extensions that
54055 + * go after last present extension. But some basic extensions, we want
54056 + * either ->absent() or ->present() method to be called, because these
54057 + * extensions set up something in inode even when they are not
54058 + * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
54059 + * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
54060 + * ->present(), or ->absent() method will be called, independently of
54061 + * what other extensions are present.
54062 + */
54063 + LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
54064 +} sd_ext_bits;
54065 +
54066 +/* minimal stat-data. This allows to support light-weight files. */
54067 +typedef struct reiser4_stat_data_base {
54068 + /* 0 */ __le16 extmask;
54069 + /* 2 */
54070 +} PACKED reiser4_stat_data_base;
54071 +
54072 +typedef struct reiser4_light_weight_stat {
54073 + /* 0 */ __le16 mode;
54074 + /* 2 */ __le32 nlink;
54075 + /* 6 */ __le64 size;
54076 + /* size in bytes */
54077 + /* 14 */
54078 +} PACKED reiser4_light_weight_stat;
54079 +
54080 +typedef struct reiser4_unix_stat {
54081 + /* owner id */
54082 + /* 0 */ __le32 uid;
54083 + /* group id */
54084 + /* 4 */ __le32 gid;
54085 + /* access time */
54086 + /* 8 */ __le32 atime;
54087 + /* modification time */
54088 + /* 12 */ __le32 mtime;
54089 + /* change time */
54090 + /* 16 */ __le32 ctime;
54091 + union {
54092 + /* minor:major for device files */
54093 + /* 20 */ __le64 rdev;
54094 + /* bytes used by file */
54095 + /* 20 */ __le64 bytes;
54096 + } u;
54097 + /* 28 */
54098 +} PACKED reiser4_unix_stat;
54099 +
54100 +/* symlink stored as part of inode */
54101 +typedef struct reiser4_symlink_stat {
54102 + char body[0];
54103 +} PACKED reiser4_symlink_stat;
54104 +
54105 +typedef struct reiser4_plugin_slot {
54106 + /* 0 */ __le16 pset_memb;
54107 + /* 2 */ __le16 id;
54108 + /* 4 *//* here plugin stores its persistent state */
54109 +} PACKED reiser4_plugin_slot;
54110 +
54111 +/* stat-data extension for files with non-standard plugin. */
54112 +typedef struct reiser4_plugin_stat {
54113 + /* number of additional plugins, associated with this object */
54114 + /* 0 */ __le16 plugins_no;
54115 + /* 2 */ reiser4_plugin_slot slot[0];
54116 + /* 2 */
54117 +} PACKED reiser4_plugin_stat;
54118 +
54119 +/* stat-data extension for inode flags. Currently it is just fixed-width 32
54120 + * bit mask. If need arise, this can be replaced with variable width
54121 + * bitmask. */
54122 +typedef struct reiser4_flags_stat {
54123 + /* 0 */ __le32 flags;
54124 + /* 4 */
54125 +} PACKED reiser4_flags_stat;
54126 +
54127 +typedef struct reiser4_capabilities_stat {
54128 + /* 0 */ __le32 effective;
54129 + /* 8 */ __le32 permitted;
54130 + /* 16 */
54131 +} PACKED reiser4_capabilities_stat;
54132 +
54133 +typedef struct reiser4_cluster_stat {
54134 +/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */
54135 + /* 0 */ d8 cluster_shift;
54136 + /* 1 */
54137 +} PACKED reiser4_cluster_stat;
54138 +
54139 +typedef struct reiser4_crypto_stat {
54140 + /* secret key size, bits */
54141 + /* 0 */ d16 keysize;
54142 + /* secret key id */
54143 + /* 2 */ d8 keyid[0];
54144 + /* 2 */
54145 +} PACKED reiser4_crypto_stat;
54146 +
54147 +typedef struct reiser4_large_times_stat {
54148 + /* access time */
54149 + /* 0 */ d32 atime;
54150 + /* modification time */
54151 + /* 4 */ d32 mtime;
54152 + /* change time */
54153 + /* 8 */ d32 ctime;
54154 + /* 12 */
54155 +} PACKED reiser4_large_times_stat;
54156 +
54157 +/* this structure is filled by sd_item_stat */
54158 +typedef struct sd_stat {
54159 + int dirs;
54160 + int files;
54161 + int others;
54162 +} sd_stat;
54163 +
54164 +/* plugin->item.common.* */
54165 +extern void print_sd(const char *prefix, coord_t * coord);
54166 +extern void item_stat_static_sd(const coord_t * coord, void *vp);
54167 +
54168 +/* plugin->item.s.sd.* */
54169 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
54170 +extern int save_len_static_sd(struct inode *inode);
54171 +extern int save_static_sd(struct inode *inode, char **area);
54172 +
54173 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
54174 +#endif
54175 +
54176 +/* Make Linus happy.
54177 + Local variables:
54178 + c-indentation-style: "K&R"
54179 + mode-name: "LC"
54180 + c-basic-offset: 8
54181 + tab-width: 8
54182 + fill-column: 120
54183 + End:
54184 +*/
54185 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/tail.c linux-2.6.20/fs/reiser4/plugin/item/tail.c
54186 --- linux-2.6.20.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 03:00:00.000000000 +0300
54187 +++ linux-2.6.20/fs/reiser4/plugin/item/tail.c 2007-05-06 14:50:43.823014469 +0400
54188 @@ -0,0 +1,812 @@
54189 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54190 +
54191 +#include "item.h"
54192 +#include "../../inode.h"
54193 +#include "../../page_cache.h"
54194 +#include "../../carry.h"
54195 +#include "../../vfs_ops.h"
54196 +
54197 +#include <linux/quotaops.h>
54198 +#include <asm/uaccess.h>
54199 +#include <linux/swap.h>
54200 +#include <linux/writeback.h>
54201 +
54202 +/* plugin->u.item.b.max_key_inside */
54203 +reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
54204 +{
54205 + item_key_by_coord(coord, key);
54206 + set_key_offset(key, get_key_offset(reiser4_max_key()));
54207 + return key;
54208 +}
54209 +
54210 +/* plugin->u.item.b.can_contain_key */
54211 +int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
54212 + const reiser4_item_data *data)
54213 +{
54214 + reiser4_key item_key;
54215 +
54216 + if (item_plugin_by_coord(coord) != data->iplug)
54217 + return 0;
54218 +
54219 + item_key_by_coord(coord, &item_key);
54220 + if (get_key_locality(key) != get_key_locality(&item_key) ||
54221 + get_key_objectid(key) != get_key_objectid(&item_key))
54222 + return 0;
54223 +
54224 + return 1;
54225 +}
54226 +
54227 +/* plugin->u.item.b.mergeable
54228 + first item is of tail type */
54229 +/* Audited by: green(2002.06.14) */
54230 +int mergeable_tail(const coord_t *p1, const coord_t *p2)
54231 +{
54232 + reiser4_key key1, key2;
54233 +
54234 + assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
54235 + UNIX_FILE_METADATA_ITEM_TYPE));
54236 + assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
54237 +
54238 + if (item_id_by_coord(p2) != FORMATTING_ID) {
54239 + /* second item is of another type */
54240 + return 0;
54241 + }
54242 +
54243 + item_key_by_coord(p1, &key1);
54244 + item_key_by_coord(p2, &key2);
54245 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
54246 + get_key_objectid(&key1) != get_key_objectid(&key2)
54247 + || get_key_type(&key1) != get_key_type(&key2)) {
54248 + /* items of different objects */
54249 + return 0;
54250 + }
54251 + if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
54252 + /* not adjacent items */
54253 + return 0;
54254 + }
54255 + return 1;
54256 +}
54257 +
54258 +/* plugin->u.item.b.print
54259 + plugin->u.item.b.check */
54260 +
54261 +/* plugin->u.item.b.nr_units */
54262 +pos_in_node_t nr_units_tail(const coord_t * coord)
54263 +{
54264 + return item_length_by_coord(coord);
54265 +}
54266 +
54267 +/* plugin->u.item.b.lookup */
54268 +lookup_result
54269 +lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
54270 +{
54271 + reiser4_key item_key;
54272 + __u64 lookuped, offset;
54273 + unsigned nr_units;
54274 +
54275 + item_key_by_coord(coord, &item_key);
54276 + offset = get_key_offset(item_key_by_coord(coord, &item_key));
54277 + nr_units = nr_units_tail(coord);
54278 +
54279 + /* key we are looking for must be greater than key of item @coord */
54280 + assert("vs-416", keygt(key, &item_key));
54281 +
54282 + /* offset we are looking for */
54283 + lookuped = get_key_offset(key);
54284 +
54285 + if (lookuped >= offset && lookuped < offset + nr_units) {
54286 + /* byte we are looking for is in this item */
54287 + coord->unit_pos = lookuped - offset;
54288 + coord->between = AT_UNIT;
54289 + return CBK_COORD_FOUND;
54290 + }
54291 +
54292 + /* set coord after last unit */
54293 + coord->unit_pos = nr_units - 1;
54294 + coord->between = AFTER_UNIT;
54295 + return bias ==
54296 + FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
54297 +}
54298 +
54299 +/* plugin->u.item.b.paste */
54300 +int
54301 +paste_tail(coord_t *coord, reiser4_item_data *data,
54302 + carry_plugin_info *info UNUSED_ARG)
54303 +{
54304 + unsigned old_item_length;
54305 + char *item;
54306 +
54307 + /* length the item had before resizing has been performed */
54308 + old_item_length = item_length_by_coord(coord) - data->length;
54309 +
54310 + /* tail items never get pasted in the middle */
54311 + assert("vs-363",
54312 + (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
54313 + (coord->unit_pos == old_item_length - 1 &&
54314 + coord->between == AFTER_UNIT) ||
54315 + (coord->unit_pos == 0 && old_item_length == 0
54316 + && coord->between == AT_UNIT));
54317 +
54318 + item = item_body_by_coord(coord);
54319 + if (coord->unit_pos == 0)
54320 + /* make space for pasted data when pasting at the beginning of
54321 + the item */
54322 + memmove(item + data->length, item, old_item_length);
54323 +
54324 + if (coord->between == AFTER_UNIT)
54325 + coord->unit_pos++;
54326 +
54327 + if (data->data) {
54328 + assert("vs-554", data->user == 0 || data->user == 1);
54329 + if (data->user) {
54330 + assert("nikita-3035", reiser4_schedulable());
54331 + /* copy from user space */
54332 + if (__copy_from_user(item + coord->unit_pos,
54333 + (const char __user *)data->data,
54334 + (unsigned)data->length))
54335 + return RETERR(-EFAULT);
54336 + } else
54337 + /* copy from kernel space */
54338 + memcpy(item + coord->unit_pos, data->data,
54339 + (unsigned)data->length);
54340 + } else {
54341 + memset(item + coord->unit_pos, 0, (unsigned)data->length);
54342 + }
54343 + return 0;
54344 +}
54345 +
54346 +/* plugin->u.item.b.fast_paste */
54347 +
54348 +/* plugin->u.item.b.can_shift
54349 + number of units is returned via return value, number of bytes via @size. For
54350 + tail items they coincide */
54351 +int
54352 +can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
54353 + znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
54354 + unsigned *size, unsigned want)
54355 +{
54356 + /* make sure that we do not want to shift more than we have */
54357 + assert("vs-364", want > 0
54358 + && want <= (unsigned)item_length_by_coord(source));
54359 +
54360 + *size = min(want, free_space);
54361 + return *size;
54362 +}
54363 +
54364 +/* plugin->u.item.b.copy_units */
54365 +void
54366 +copy_units_tail(coord_t * target, coord_t * source,
54367 + unsigned from, unsigned count,
54368 + shift_direction where_is_free_space,
54369 + unsigned free_space UNUSED_ARG)
54370 +{
54371 + /* make sure that item @target is expanded already */
54372 + assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
54373 + assert("vs-370", free_space >= count);
54374 +
54375 + if (where_is_free_space == SHIFT_LEFT) {
54376 + /* append item @target with @count first bytes of @source */
54377 + assert("vs-365", from == 0);
54378 +
54379 + memcpy((char *)item_body_by_coord(target) +
54380 + item_length_by_coord(target) - count,
54381 + (char *)item_body_by_coord(source), count);
54382 + } else {
54383 + /* target item is moved to right already */
54384 + reiser4_key key;
54385 +
54386 + assert("vs-367",
54387 + (unsigned)item_length_by_coord(source) == from + count);
54388 +
54389 + memcpy((char *)item_body_by_coord(target),
54390 + (char *)item_body_by_coord(source) + from, count);
54391 +
54392 + /* new units are inserted before first unit in an item,
54393 + therefore, we have to update item key */
54394 + item_key_by_coord(source, &key);
54395 + set_key_offset(&key, get_key_offset(&key) + from);
54396 +
54397 + node_plugin_by_node(target->node)->update_item_key(target, &key,
54398 + NULL /*info */);
54399 + }
54400 +}
54401 +
54402 +/* plugin->u.item.b.create_hook */
54403 +
54404 +/* item_plugin->b.kill_hook
54405 + this is called when @count units starting from @from-th one are going to be removed
54406 + */
54407 +int
54408 +kill_hook_tail(const coord_t * coord, pos_in_node_t from,
54409 + pos_in_node_t count, struct carry_kill_data *kdata)
54410 +{
54411 + reiser4_key key;
54412 + loff_t start, end;
54413 +
54414 + assert("vs-1577", kdata);
54415 + assert("vs-1579", kdata->inode);
54416 +
54417 + item_key_by_coord(coord, &key);
54418 + start = get_key_offset(&key) + from;
54419 + end = start + count;
54420 + fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
54421 + return 0;
54422 +}
54423 +
54424 +/* plugin->u.item.b.shift_hook */
54425 +
54426 +/* helper for kill_units_tail and cut_units_tail */
54427 +static int
54428 +do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54429 + reiser4_key * smallest_removed, reiser4_key * new_first)
54430 +{
54431 + pos_in_node_t count;
54432 +
54433 + /* this method is only called to remove part of item */
54434 + assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
54435 + /* tail items are never cut from the middle of an item */
54436 + assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
54437 + assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
54438 +
54439 + count = to - from + 1;
54440 +
54441 + if (smallest_removed) {
54442 + /* store smallest key removed */
54443 + item_key_by_coord(coord, smallest_removed);
54444 + set_key_offset(smallest_removed,
54445 + get_key_offset(smallest_removed) + from);
54446 + }
54447 + if (new_first) {
54448 + /* head of item is cut */
54449 + assert("vs-1529", from == 0);
54450 +
54451 + item_key_by_coord(coord, new_first);
54452 + set_key_offset(new_first,
54453 + get_key_offset(new_first) + from + count);
54454 + }
54455 +
54456 + if (REISER4_DEBUG)
54457 + memset((char *)item_body_by_coord(coord) + from, 0, count);
54458 + return count;
54459 +}
54460 +
54461 +/* plugin->u.item.b.cut_units */
54462 +int
54463 +cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54464 + struct carry_cut_data *cdata UNUSED_ARG,
54465 + reiser4_key * smallest_removed, reiser4_key * new_first)
54466 +{
54467 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54468 +}
54469 +
54470 +/* plugin->u.item.b.kill_units */
54471 +int
54472 +kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54473 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
54474 + reiser4_key * new_first)
54475 +{
54476 + kill_hook_tail(coord, from, to - from + 1, kdata);
54477 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54478 +}
54479 +
54480 +/* plugin->u.item.b.unit_key */
54481 +reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
54482 +{
54483 + assert("vs-375", coord_is_existing_unit(coord));
54484 +
54485 + item_key_by_coord(coord, key);
54486 + set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
54487 +
54488 + return key;
54489 +}
54490 +
54491 +/* plugin->u.item.b.estimate
54492 + plugin->u.item.b.item_data_by_flow */
54493 +
54494 + /* tail readpage function. It is called from readpage_tail(). */
54495 +static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
54496 +{
54497 + tap_t tap;
54498 + int result;
54499 + coord_t coord;
54500 + lock_handle lh;
54501 + int count, mapped;
54502 + struct inode *inode;
54503 + char *pagedata;
54504 +
54505 + /* save the passed coord in order not to move it by tap. */
54506 + init_lh(&lh);
54507 + copy_lh(&lh, uf_coord->lh);
54508 + inode = page->mapping->host;
54509 + coord_dup(&coord, &uf_coord->coord);
54510 +
54511 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
54512 +
54513 + if ((result = reiser4_tap_load(&tap)))
54514 + goto out_tap_done;
54515 +
54516 + /* lookup until page is filled up. */
54517 + for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
54518 + /* number of bytes to be copied to page */
54519 + count = item_length_by_coord(&coord) - coord.unit_pos;
54520 + if (count > PAGE_CACHE_SIZE - mapped)
54521 + count = PAGE_CACHE_SIZE - mapped;
54522 +
54523 + /* attach @page to address space and get data address */
54524 + pagedata = kmap_atomic(page, KM_USER0);
54525 +
54526 + /* copy tail item to page */
54527 + memcpy(pagedata + mapped,
54528 + ((char *)item_body_by_coord(&coord) + coord.unit_pos),
54529 + count);
54530 + mapped += count;
54531 +
54532 + flush_dcache_page(page);
54533 +
54534 + /* detach page from address space */
54535 + kunmap_atomic(pagedata, KM_USER0);
54536 +
54537 + /* Getting next tail item. */
54538 + if (mapped < PAGE_CACHE_SIZE) {
54539 + /*
54540 + * unlock page in order to avoid keep it locked
54541 + * during tree lookup, which takes long term locks
54542 + */
54543 + unlock_page(page);
54544 +
54545 + /* getting right neighbour. */
54546 + result = go_dir_el(&tap, RIGHT_SIDE, 0);
54547 +
54548 + /* lock page back */
54549 + lock_page(page);
54550 + if (PageUptodate(page)) {
54551 + /*
54552 + * another thread read the page, we have
54553 + * nothing to do
54554 + */
54555 + result = 0;
54556 + goto out_unlock_page;
54557 + }
54558 +
54559 + if (result) {
54560 + if (result == -E_NO_NEIGHBOR) {
54561 + /*
54562 + * right neighbor is not a formatted
54563 + * node
54564 + */
54565 + result = 0;
54566 + goto done;
54567 + } else {
54568 + goto out_tap_relse;
54569 + }
54570 + } else {
54571 + if (!inode_file_plugin(inode)->
54572 + owns_item(inode, &coord)) {
54573 + /* item of another file is found */
54574 + result = 0;
54575 + goto done;
54576 + }
54577 + }
54578 + }
54579 + }
54580 +
54581 + done:
54582 + if (mapped != PAGE_CACHE_SIZE) {
54583 + pagedata = kmap_atomic(page, KM_USER0);
54584 + memset(pagedata + mapped, 0, PAGE_CACHE_SIZE - mapped);
54585 + flush_dcache_page(page);
54586 + kunmap_atomic(pagedata, KM_USER0);
54587 + }
54588 + SetPageUptodate(page);
54589 + out_unlock_page:
54590 + unlock_page(page);
54591 + out_tap_relse:
54592 + reiser4_tap_relse(&tap);
54593 + out_tap_done:
54594 + reiser4_tap_done(&tap);
54595 + return result;
54596 +}
54597 +
54598 +/*
54599 + plugin->s.file.readpage
54600 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
54601 + or
54602 + filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_tail
54603 +
54604 + At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail
54605 + item. */
54606 +int readpage_tail(void *vp, struct page *page)
54607 +{
54608 + uf_coord_t *uf_coord = vp;
54609 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
54610 + ON_DEBUG(reiser4_key key);
54611 +
54612 + assert("umka-2515", PageLocked(page));
54613 + assert("umka-2516", !PageUptodate(page));
54614 + assert("umka-2517", !jprivate(page) && !PagePrivate(page));
54615 + assert("umka-2518", page->mapping && page->mapping->host);
54616 +
54617 + assert("umka-2519", znode_is_loaded(coord->node));
54618 + assert("umka-2520", item_is_tail(coord));
54619 + assert("umka-2521", coord_is_existing_unit(coord));
54620 + assert("umka-2522", znode_is_rlocked(coord->node));
54621 + assert("umka-2523",
54622 + page->mapping->host->i_ino ==
54623 + get_key_objectid(item_key_by_coord(coord, &key)));
54624 +
54625 + return do_readpage_tail(uf_coord, page);
54626 +}
54627 +
54628 +/**
54629 + * overwrite_tail
54630 + * @flow:
54631 + * @coord:
54632 + *
54633 + * Overwrites tail item or its part by user data. Returns number of bytes
54634 + * written or error code.
54635 + */
54636 +static int overwrite_tail(flow_t *flow, coord_t *coord)
54637 +{
54638 + unsigned count;
54639 +
54640 + assert("vs-570", flow->user == 1);
54641 + assert("vs-946", flow->data);
54642 + assert("vs-947", coord_is_existing_unit(coord));
54643 + assert("vs-948", znode_is_write_locked(coord->node));
54644 + assert("nikita-3036", reiser4_schedulable());
54645 +
54646 + count = item_length_by_coord(coord) - coord->unit_pos;
54647 + if (count > flow->length)
54648 + count = flow->length;
54649 +
54650 + if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
54651 + (const char __user *)flow->data, count))
54652 + return RETERR(-EFAULT);
54653 +
54654 + znode_make_dirty(coord->node);
54655 + return count;
54656 +}
54657 +
54658 +/**
54659 + * insert_first_tail
54660 + * @inode:
54661 + * @flow:
54662 + * @coord:
54663 + * @lh:
54664 + *
54665 + * Returns number of bytes written or error code.
54666 + */
54667 +static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
54668 + coord_t *coord, lock_handle *lh)
54669 +{
54670 + int result;
54671 + loff_t to_write;
54672 + unix_file_info_t *uf_info;
54673 +
54674 + if (get_key_offset(&flow->key) != 0) {
54675 + /*
54676 + * file is empty and we have to write not to the beginning of
54677 + * file. Create a hole at the beginning of file. On success
54678 + * insert_flow returns 0 as number of written bytes which is
54679 + * what we have to return on padding a file with holes
54680 + */
54681 + flow->data = NULL;
54682 + flow->length = get_key_offset(&flow->key);
54683 + set_key_offset(&flow->key, 0);
54684 + /*
54685 + * holes in files built of tails are stored just like if there
54686 + * were real data which are all zeros. Therefore we have to
54687 + * allocate quota here as well
54688 + */
54689 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54690 + return RETERR(-EDQUOT);
54691 + result = reiser4_insert_flow(coord, lh, flow);
54692 + if (flow->length)
54693 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54694 +
54695 + uf_info = unix_file_inode_data(inode);
54696 +
54697 + /*
54698 + * first item insertion is only possible when writing to empty
54699 + * file or performing tail conversion
54700 + */
54701 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
54702 + (reiser4_inode_get_flag(inode,
54703 + REISER4_PART_MIXED) &&
54704 + reiser4_inode_get_flag(inode,
54705 + REISER4_PART_IN_CONV))));
54706 + /* if file was empty - update its state */
54707 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
54708 + uf_info->container = UF_CONTAINER_TAILS;
54709 + return result;
54710 + }
54711 +
54712 + /* check quota before appending data */
54713 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54714 + return RETERR(-EDQUOT);
54715 +
54716 + to_write = flow->length;
54717 + result = reiser4_insert_flow(coord, lh, flow);
54718 + if (flow->length)
54719 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54720 + return (to_write - flow->length) ? (to_write - flow->length) : result;
54721 +}
54722 +
54723 +/**
54724 + * append_tail
54725 + * @inode:
54726 + * @flow:
54727 + * @coord:
54728 + * @lh:
54729 + *
54730 + * Returns number of bytes written or error code.
54731 + */
54732 +static ssize_t append_tail(struct inode *inode,
54733 + flow_t *flow, coord_t *coord, lock_handle *lh)
54734 +{
54735 + int result;
54736 + reiser4_key append_key;
54737 + loff_t to_write;
54738 +
54739 + if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
54740 + flow->data = NULL;
54741 + flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
54742 + set_key_offset(&flow->key, get_key_offset(&append_key));
54743 + /*
54744 + * holes in files built of tails are stored just like if there
54745 + * were real data which are all zeros. Therefore we have to
54746 + * allocate quota here as well
54747 + */
54748 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54749 + return RETERR(-EDQUOT);
54750 + result = reiser4_insert_flow(coord, lh, flow);
54751 + if (flow->length)
54752 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54753 + return result;
54754 + }
54755 +
54756 + /* check quota before appending data */
54757 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54758 + return RETERR(-EDQUOT);
54759 +
54760 + to_write = flow->length;
54761 + result = reiser4_insert_flow(coord, lh, flow);
54762 + if (flow->length)
54763 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54764 + return (to_write - flow->length) ? (to_write - flow->length) : result;
54765 +}
54766 +
54767 +/**
54768 + * write_extent_reserve_space - reserve space for tail write operation
54769 + * @inode:
54770 + *
54771 + * Estimates and reserves space which may be required for writing one flow to a
54772 + * file
54773 + */
54774 +static int write_extent_reserve_space(struct inode *inode)
54775 +{
54776 + __u64 count;
54777 + reiser4_tree *tree;
54778 +
54779 + /*
54780 + * to write one flow to a file by tails we have to reserve disk space for:
54781 +
54782 + * 1. find_file_item may have to insert empty node to the tree (empty
54783 + * leaf node between two extent items). This requires 1 block and
54784 + * number of blocks which are necessary to perform insertion of an
54785 + * internal item into twig level.
54786 + *
54787 + * 2. flow insertion
54788 + *
54789 + * 3. stat data update
54790 + */
54791 + tree = reiser4_tree_by_inode(inode);
54792 + count = estimate_one_insert_item(tree) +
54793 + estimate_insert_flow(tree->height) +
54794 + estimate_one_insert_item(tree);
54795 + grab_space_enable();
54796 + return reiser4_grab_space(count, 0 /* flags */);
54797 +}
54798 +
54799 +#define PAGE_PER_FLOW 4
54800 +
54801 +static loff_t faultin_user_pages(const char __user *buf, size_t count)
54802 +{
54803 + loff_t faulted;
54804 + int to_fault;
54805 +
54806 + if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
54807 + count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
54808 + faulted = 0;
54809 + while (count > 0) {
54810 + to_fault = PAGE_CACHE_SIZE;
54811 + if (count < to_fault)
54812 + to_fault = count;
54813 + fault_in_pages_readable(buf + faulted, to_fault);
54814 + count -= to_fault;
54815 + faulted += to_fault;
54816 + }
54817 + return faulted;
54818 +}
54819 +
54820 +/**
54821 + * reiser4_write_tail - write method of tail item plugin
54822 + * @file: file to write to
54823 + * @buf: address of user-space buffer
54824 + * @count: number of bytes to write
54825 + * @pos: position in file to write to
54826 + *
54827 + * Returns number of written bytes or error code.
54828 + */
54829 +ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
54830 + size_t count, loff_t *pos)
54831 +{
54832 + struct inode *inode;
54833 + struct hint hint;
54834 + int result;
54835 + flow_t flow;
54836 + coord_t *coord;
54837 + lock_handle *lh;
54838 + znode *loaded;
54839 +
54840 + inode = file->f_dentry->d_inode;
54841 +
54842 + if (write_extent_reserve_space(inode))
54843 + return RETERR(-ENOSPC);
54844 +
54845 + result = load_file_hint(file, &hint);
54846 + BUG_ON(result != 0);
54847 +
54848 + flow.length = faultin_user_pages(buf, count);
54849 + flow.user = 1;
54850 + memcpy(&flow.data, &buf, sizeof(buf));
54851 + flow.op = WRITE_OP;
54852 + key_by_inode_and_offset_common(inode, *pos, &flow.key);
54853 +
54854 + result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
54855 + if (IS_CBKERR(result))
54856 + return result;
54857 +
54858 + coord = &hint.ext_coord.coord;
54859 + lh = hint.ext_coord.lh;
54860 +
54861 + result = zload(coord->node);
54862 + BUG_ON(result != 0);
54863 + loaded = coord->node;
54864 +
54865 + if (coord->between == AFTER_UNIT) {
54866 + /* append with data or hole */
54867 + result = append_tail(inode, &flow, coord, lh);
54868 + } else if (coord->between == AT_UNIT) {
54869 + /* overwrite */
54870 + result = overwrite_tail(&flow, coord);
54871 + } else {
54872 + /* no items of this file yet. insert data or hole */
54873 + result = insert_first_tail(inode, &flow, coord, lh);
54874 + }
54875 + zrelse(loaded);
54876 + if (result < 0) {
54877 + done_lh(lh);
54878 + return result;
54879 + }
54880 +
54881 + /* seal and unlock znode */
54882 + hint.ext_coord.valid = 0;
54883 + if (hint.ext_coord.valid)
54884 + reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
54885 + else
54886 + reiser4_unset_hint(&hint);
54887 +
54888 + save_file_hint(file, &hint);
54889 + return result;
54890 +}
54891 +
54892 +#if REISER4_DEBUG
54893 +
54894 +static int
54895 +coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
54896 +{
54897 + reiser4_key item_key;
54898 +
54899 + assert("vs-1356", coord_is_existing_unit(coord));
54900 + assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
54901 + assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
54902 + return get_key_offset(key) ==
54903 + get_key_offset(&item_key) + coord->unit_pos;
54904 +
54905 +}
54906 +
54907 +#endif
54908 +
54909 +/* plugin->u.item.s.file.read */
54910 +int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
54911 +{
54912 + unsigned count;
54913 + int item_length;
54914 + coord_t *coord;
54915 + uf_coord_t *uf_coord;
54916 +
54917 + uf_coord = &hint->ext_coord;
54918 + coord = &uf_coord->coord;
54919 +
54920 + assert("vs-571", f->user == 1);
54921 + assert("vs-571", f->data);
54922 + assert("vs-967", coord && coord->node);
54923 + assert("vs-1117", znode_is_rlocked(coord->node));
54924 + assert("vs-1118", znode_is_loaded(coord->node));
54925 +
54926 + assert("nikita-3037", reiser4_schedulable());
54927 + assert("vs-1357", coord_matches_key_tail(coord, &f->key));
54928 +
54929 + /* calculate number of bytes to read off the item */
54930 + item_length = item_length_by_coord(coord);
54931 + count = item_length_by_coord(coord) - coord->unit_pos;
54932 + if (count > f->length)
54933 + count = f->length;
54934 +
54935 + /* user page has to be brought in so that major page fault does not
54936 + * occur here when a long-term lock is held */
54937 + if (__copy_to_user((char __user *)f->data,
54938 + ((char *)item_body_by_coord(coord) + coord->unit_pos),
54939 + count))
54940 + return RETERR(-EFAULT);
54941 +
54942 + /* probably mark_page_accessed() should only be called if
54943 + * coord->unit_pos is zero. */
54944 + mark_page_accessed(znode_page(coord->node));
54945 + move_flow_forward(f, count);
54946 +
54947 + coord->unit_pos += count;
54948 + if (item_length == coord->unit_pos) {
54949 + coord->unit_pos--;
54950 + coord->between = AFTER_UNIT;
54951 + }
54952 +
54953 + return 0;
54954 +}
54955 +
54956 +/*
54957 + plugin->u.item.s.file.append_key
54958 + key of the first byte right after the last byte addressed by this item
54959 +*/
54960 +reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
54961 +{
54962 + item_key_by_coord(coord, key);
54963 + set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
54964 + return key;
54965 +}
54966 +
54967 +/* plugin->u.item.s.file.init_coord_extension */
54968 +void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
54969 +{
54970 + uf_coord->valid = 1;
54971 +}
54972 +
54973 +/*
54974 + plugin->u.item.s.file.get_block
54975 +*/
54976 +int
54977 +get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
54978 +{
54979 + assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
54980 +
54981 + if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
54982 + /* if node hasn't obtained its block number yet, return 0.
54983 + * Let's avoid upsetting users with some cosmic numbers beyond
54984 + * the device capacity.*/
54985 + *block = 0;
54986 + else
54987 + *block = *znode_get_block(coord->node);
54988 + return 0;
54989 +}
54990 +
54991 +/*
54992 + * Local variables:
54993 + * c-indentation-style: "K&R"
54994 + * mode-name: "LC"
54995 + * c-basic-offset: 8
54996 + * tab-width: 8
54997 + * fill-column: 79
54998 + * scroll-step: 1
54999 + * End:
55000 + */
55001 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/tail.h linux-2.6.20/fs/reiser4/plugin/item/tail.h
55002 --- linux-2.6.20.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 03:00:00.000000000 +0300
55003 +++ linux-2.6.20/fs/reiser4/plugin/item/tail.h 2007-05-06 14:50:43.827015719 +0400
55004 @@ -0,0 +1,58 @@
55005 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55006 +
55007 +#if !defined( __REISER4_TAIL_H__ )
55008 +#define __REISER4_TAIL_H__
55009 +
55010 +typedef struct {
55011 + int not_used;
55012 +} tail_coord_extension_t;
55013 +
55014 +struct cut_list;
55015 +
55016 +/* plugin->u.item.b.* */
55017 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
55018 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
55019 + const reiser4_item_data *);
55020 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
55021 +pos_in_node_t nr_units_tail(const coord_t *);
55022 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
55023 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
55024 +int can_shift_tail(unsigned free_space, coord_t * source,
55025 + znode * target, shift_direction, unsigned *size,
55026 + unsigned want);
55027 +void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
55028 + unsigned count, shift_direction, unsigned free_space);
55029 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
55030 + struct carry_kill_data *);
55031 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55032 + struct carry_cut_data *, reiser4_key * smallest_removed,
55033 + reiser4_key * new_first);
55034 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55035 + struct carry_kill_data *, reiser4_key * smallest_removed,
55036 + reiser4_key * new_first);
55037 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
55038 +
55039 +/* plugin->u.item.s.* */
55040 +ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
55041 + size_t count, loff_t *pos);
55042 +int reiser4_read_tail(struct file *, flow_t *, hint_t *);
55043 +int readpage_tail(void *vp, struct page *page);
55044 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
55045 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
55046 +int get_block_address_tail(const coord_t *, sector_t, sector_t *);
55047 +int item_balance_dirty_pages(struct address_space *, const flow_t *,
55048 + hint_t *, int back_to_dirty, int set_hint);
55049 +
55050 +/* __REISER4_TAIL_H__ */
55051 +#endif
55052 +
55053 +/* Make Linus happy.
55054 + Local variables:
55055 + c-indentation-style: "K&R"
55056 + mode-name: "LC"
55057 + c-basic-offset: 8
55058 + tab-width: 8
55059 + fill-column: 120
55060 + scroll-step: 1
55061 + End:
55062 +*/
55063 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/Makefile linux-2.6.20/fs/reiser4/plugin/Makefile
55064 --- linux-2.6.20.orig/fs/reiser4/plugin/Makefile 1970-01-01 03:00:00.000000000 +0300
55065 +++ linux-2.6.20/fs/reiser4/plugin/Makefile 2007-05-06 14:50:43.827015719 +0400
55066 @@ -0,0 +1,26 @@
55067 +obj-$(CONFIG_REISER4_FS) += plugins.o
55068 +
55069 +plugins-objs := \
55070 + plugin.o \
55071 + plugin_set.o \
55072 + object.o \
55073 + inode_ops.o \
55074 + inode_ops_rename.o \
55075 + file_ops.o \
55076 + file_ops_readdir.o \
55077 + file_plugin_common.o \
55078 + dir_plugin_common.o \
55079 + digest.o \
55080 + hash.o \
55081 + fibration.o \
55082 + tail_policy.o \
55083 + regular.o
55084 +
55085 +obj-$(CONFIG_REISER4_FS) += item/
55086 +obj-$(CONFIG_REISER4_FS) += file/
55087 +obj-$(CONFIG_REISER4_FS) += dir/
55088 +obj-$(CONFIG_REISER4_FS) += node/
55089 +obj-$(CONFIG_REISER4_FS) += compress/
55090 +obj-$(CONFIG_REISER4_FS) += space/
55091 +obj-$(CONFIG_REISER4_FS) += disk_format/
55092 +obj-$(CONFIG_REISER4_FS) += security/
55093 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/Makefile linux-2.6.20/fs/reiser4/plugin/node/Makefile
55094 --- linux-2.6.20.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 03:00:00.000000000 +0300
55095 +++ linux-2.6.20/fs/reiser4/plugin/node/Makefile 2007-05-06 14:50:43.827015719 +0400
55096 @@ -0,0 +1,5 @@
55097 +obj-$(CONFIG_REISER4_FS) += node_plugins.o
55098 +
55099 +node_plugins-objs := \
55100 + node.o \
55101 + node40.o
55102 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node40.c linux-2.6.20/fs/reiser4/plugin/node/node40.c
55103 --- linux-2.6.20.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 03:00:00.000000000 +0300
55104 +++ linux-2.6.20/fs/reiser4/plugin/node/node40.c 2007-05-06 14:50:43.831016969 +0400
55105 @@ -0,0 +1,2924 @@
55106 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55107 +
55108 +#include "../../debug.h"
55109 +#include "../../key.h"
55110 +#include "../../coord.h"
55111 +#include "../plugin_header.h"
55112 +#include "../item/item.h"
55113 +#include "node.h"
55114 +#include "node40.h"
55115 +#include "../plugin.h"
55116 +#include "../../jnode.h"
55117 +#include "../../znode.h"
55118 +#include "../../pool.h"
55119 +#include "../../carry.h"
55120 +#include "../../tap.h"
55121 +#include "../../tree.h"
55122 +#include "../../super.h"
55123 +#include "../../reiser4.h"
55124 +
55125 +#include <asm/uaccess.h>
55126 +#include <linux/types.h>
55127 +#include <linux/prefetch.h>
55128 +
55129 +/* leaf 40 format:
55130 +
55131 + [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ]
55132 + plugin_id (16) key
55133 + free_space (16) pluginid (16)
55134 + free_space_start (16) offset (16)
55135 + level (8)
55136 + num_items (16)
55137 + magic (32)
55138 + flush_time (32)
55139 +*/
55140 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
55141 +/* magic number that is stored in ->magic field of node header */
55142 +static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
55143 +
55144 +static int prepare_for_update(znode * left, znode * right,
55145 + carry_plugin_info * info);
55146 +
55147 +/* header of node of reiser40 format is at the beginning of node */
55148 +static inline node40_header *node40_node_header(const znode * node /* node to
55149 + * query */ )
55150 +{
55151 + assert("nikita-567", node != NULL);
55152 + assert("nikita-568", znode_page(node) != NULL);
55153 + assert("nikita-569", zdata(node) != NULL);
55154 + return (node40_header *) zdata(node);
55155 +}
55156 +
55157 +/* functions to get/set fields of node40_header */
55158 +#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
55159 +#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
55160 +#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
55161 +#define nh40_get_level(nh) get_unaligned(&(nh)->level)
55162 +#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
55163 +#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
55164 +
55165 +#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
55166 +#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
55167 +#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
55168 +#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
55169 +#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
55170 +#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
55171 +
55172 +/* plugin field of node header should be read/set by
55173 + plugin_by_disk_id/save_disk_plugin */
55174 +
55175 +/* array of item headers is at the end of node */
55176 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
55177 +{
55178 + return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
55179 +}
55180 +
55181 +/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
55182 + */
55183 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
55184 +{
55185 + return (item_header40 *) (zdata(coord->node) +
55186 + znode_size(coord->node)) - (coord->item_pos) -
55187 + 1;
55188 +}
55189 +
55190 +/* functions to get/set fields of item_header40 */
55191 +#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
55192 +
55193 +#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
55194 +
55195 +/* plugin field of item header should be read/set by
55196 + plugin_by_disk_id/save_disk_plugin */
55197 +
55198 +/* plugin methods */
55199 +
55200 +/* plugin->u.node.item_overhead
55201 + look for description of this method in plugin/node/node.h */
55202 +size_t
55203 +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
55204 +{
55205 + return sizeof(item_header40);
55206 +}
55207 +
55208 +/* plugin->u.node.free_space
55209 + look for description of this method in plugin/node/node.h */
55210 +size_t free_space_node40(znode * node)
55211 +{
55212 + assert("nikita-577", node != NULL);
55213 + assert("nikita-578", znode_is_loaded(node));
55214 + assert("nikita-579", zdata(node) != NULL);
55215 +
55216 + return nh40_get_free_space(node40_node_header(node));
55217 +}
55218 +
55219 +/* private inline version of node40_num_of_items() for use in this file. This
55220 + is necessary, because address of node40_num_of_items() is taken and it is
55221 + never inlined as a result. */
55222 +static inline short node40_num_of_items_internal(const znode * node)
55223 +{
55224 + return nh40_get_num_items(node40_node_header(node));
55225 +}
55226 +
55227 +#if REISER4_DEBUG
55228 +static inline void check_num_items(const znode * node)
55229 +{
55230 + assert("nikita-2749",
55231 + node40_num_of_items_internal(node) == node->nr_items);
55232 + assert("nikita-2746", znode_is_write_locked(node));
55233 +}
55234 +#else
55235 +#define check_num_items(node) noop
55236 +#endif
55237 +
55238 +/* plugin->u.node.num_of_items
55239 + look for description of this method in plugin/node/node.h */
55240 +int num_of_items_node40(const znode * node)
55241 +{
55242 + return node40_num_of_items_internal(node);
55243 +}
55244 +
55245 +static void
55246 +node40_set_num_items(znode * node, node40_header * nh, unsigned value)
55247 +{
55248 + assert("nikita-2751", node != NULL);
55249 + assert("nikita-2750", nh == node40_node_header(node));
55250 +
55251 + check_num_items(node);
55252 + nh40_set_num_items(nh, value);
55253 + node->nr_items = value;
55254 + check_num_items(node);
55255 +}
55256 +
55257 +/* plugin->u.node.item_by_coord
55258 + look for description of this method in plugin/node/node.h */
55259 +char *item_by_coord_node40(const coord_t * coord)
55260 +{
55261 + item_header40 *ih;
55262 + char *p;
55263 +
55264 + /* @coord is set to existing item */
55265 + assert("nikita-596", coord != NULL);
55266 + assert("vs-255", coord_is_existing_item(coord));
55267 +
55268 + ih = node40_ih_at_coord(coord);
55269 + p = zdata(coord->node) + ih40_get_offset(ih);
55270 + return p;
55271 +}
55272 +
55273 +/* plugin->u.node.length_by_coord
55274 + look for description of this method in plugin/node/node.h */
55275 +int length_by_coord_node40(const coord_t * coord)
55276 +{
55277 + item_header40 *ih;
55278 + int result;
55279 +
55280 + /* @coord is set to existing item */
55281 + assert("vs-256", coord != NULL);
55282 + assert("vs-257", coord_is_existing_item(coord));
55283 +
55284 + ih = node40_ih_at_coord(coord);
55285 + if ((int)coord->item_pos ==
55286 + node40_num_of_items_internal(coord->node) - 1)
55287 + result =
55288 + nh40_get_free_space_start(node40_node_header(coord->node)) -
55289 + ih40_get_offset(ih);
55290 + else
55291 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55292 +
55293 + return result;
55294 +}
55295 +
55296 +static pos_in_node_t
55297 +node40_item_length(const znode * node, pos_in_node_t item_pos)
55298 +{
55299 + item_header40 *ih;
55300 + pos_in_node_t result;
55301 +
55302 + /* @coord is set to existing item */
55303 + assert("vs-256", node != NULL);
55304 + assert("vs-257", node40_num_of_items_internal(node) > item_pos);
55305 +
55306 + ih = node40_ih_at(node, item_pos);
55307 + if (item_pos == node40_num_of_items_internal(node) - 1)
55308 + result =
55309 + nh40_get_free_space_start(node40_node_header(node)) -
55310 + ih40_get_offset(ih);
55311 + else
55312 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55313 +
55314 + return result;
55315 +}
55316 +
55317 +/* plugin->u.node.plugin_by_coord
55318 + look for description of this method in plugin/node/node.h */
55319 +item_plugin *plugin_by_coord_node40(const coord_t * coord)
55320 +{
55321 + item_header40 *ih;
55322 + item_plugin *result;
55323 +
55324 + /* @coord is set to existing item */
55325 + assert("vs-258", coord != NULL);
55326 + assert("vs-259", coord_is_existing_item(coord));
55327 +
55328 + ih = node40_ih_at_coord(coord);
55329 + /* pass NULL in stead of current tree. This is time critical call. */
55330 + result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
55331 + return result;
55332 +}
55333 +
55334 +/* plugin->u.node.key_at
55335 + look for description of this method in plugin/node/node.h */
55336 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
55337 +{
55338 + item_header40 *ih;
55339 +
55340 + assert("nikita-1765", coord_is_existing_item(coord));
55341 +
55342 + /* @coord is set to existing item */
55343 + ih = node40_ih_at_coord(coord);
55344 + memcpy(key, &ih->key, sizeof(reiser4_key));
55345 + return key;
55346 +}
55347 +
55348 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
55349 +
55350 +#define NODE_INCSTAT(n, counter) \
55351 + reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
55352 +
55353 +#define NODE_ADDSTAT(n, counter, val) \
55354 + reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
55355 +
55356 +/* plugin->u.node.lookup
55357 + look for description of this method in plugin/node/node.h */
55358 +node_search_result lookup_node40(znode * node /* node to query */ ,
55359 + const reiser4_key * key /* key to look for */ ,
55360 + lookup_bias bias /* search bias */ ,
55361 + coord_t * coord /* resulting coord */ )
55362 +{
55363 + int left;
55364 + int right;
55365 + int found;
55366 + int items;
55367 +
55368 + item_header40 *lefth;
55369 + item_header40 *righth;
55370 +
55371 + item_plugin *iplug;
55372 + item_header40 *bstop;
55373 + item_header40 *ih;
55374 + cmp_t order;
55375 +
55376 + assert("nikita-583", node != NULL);
55377 + assert("nikita-584", key != NULL);
55378 + assert("nikita-585", coord != NULL);
55379 + assert("nikita-2693", znode_is_any_locked(node));
55380 + cassert(REISER4_SEQ_SEARCH_BREAK > 2);
55381 +
55382 + items = node_num_items(node);
55383 +
55384 + if (unlikely(items == 0)) {
55385 + coord_init_first_unit(coord, node);
55386 + return NS_NOT_FOUND;
55387 + }
55388 +
55389 + /* binary search for item that can contain given key */
55390 + left = 0;
55391 + right = items - 1;
55392 + coord->node = node;
55393 + coord_clear_iplug(coord);
55394 + found = 0;
55395 +
55396 + lefth = node40_ih_at(node, left);
55397 + righth = node40_ih_at(node, right);
55398 +
55399 + /* It is known that for small arrays sequential search is on average
55400 + more efficient than binary. This is because sequential search is
55401 + coded as tight loop that can be better optimized by compilers and
55402 + for small array size gain from this optimization makes sequential
55403 + search the winner. Another, maybe more important, reason for this,
55404 + is that sequential array is more CPU cache friendly, whereas binary
55405 + search effectively destroys CPU caching.
55406 +
55407 + Critical here is the notion of "smallness". Reasonable value of
55408 + REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
55409 + fs/reiser4/ulevel/ulevel.c:test_search().
55410 +
55411 + Don't try to further optimize sequential search by scanning from
55412 + right to left in attempt to use more efficient loop termination
55413 + condition (comparison with 0). This doesn't work.
55414 +
55415 + */
55416 +
55417 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
55418 + int median;
55419 + item_header40 *medianh;
55420 +
55421 + median = (left + right) / 2;
55422 + medianh = node40_ih_at(node, median);
55423 +
55424 + assert("nikita-1084", median >= 0);
55425 + assert("nikita-1085", median < items);
55426 + switch (keycmp(key, &medianh->key)) {
55427 + case LESS_THAN:
55428 + right = median;
55429 + righth = medianh;
55430 + break;
55431 + default:
55432 + wrong_return_value("nikita-586", "keycmp");
55433 + case GREATER_THAN:
55434 + left = median;
55435 + lefth = medianh;
55436 + break;
55437 + case EQUAL_TO:
55438 + do {
55439 + --median;
55440 + /* headers are ordered from right to left */
55441 + ++medianh;
55442 + } while (median >= 0 && keyeq(key, &medianh->key));
55443 + right = left = median + 1;
55444 + ih = lefth = righth = medianh - 1;
55445 + found = 1;
55446 + break;
55447 + }
55448 + }
55449 + /* sequential scan. Item headers, and, therefore, keys are stored at
55450 + the rightmost part of a node from right to left. We are trying to
55451 + access memory from left to right, and hence, scan in _descending_
55452 + order of item numbers.
55453 + */
55454 + if (!found) {
55455 + for (left = right, ih = righth; left >= 0; ++ih, --left) {
55456 + cmp_t comparison;
55457 +
55458 + prefetchkey(&(ih + 1)->key);
55459 + comparison = keycmp(&ih->key, key);
55460 + if (comparison == GREATER_THAN)
55461 + continue;
55462 + if (comparison == EQUAL_TO) {
55463 + found = 1;
55464 + do {
55465 + --left;
55466 + ++ih;
55467 + } while (left >= 0 && keyeq(&ih->key, key));
55468 + ++left;
55469 + --ih;
55470 + } else {
55471 + assert("nikita-1256", comparison == LESS_THAN);
55472 + }
55473 + break;
55474 + }
55475 + if (unlikely(left < 0))
55476 + left = 0;
55477 + }
55478 +
55479 + assert("nikita-3212", right >= left);
55480 + assert("nikita-3214",
55481 + equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
55482 +
55483 + coord_set_item_pos(coord, left);
55484 + coord->unit_pos = 0;
55485 + coord->between = AT_UNIT;
55486 +
55487 + /* key < leftmost key in a mode or node is corrupted and keys
55488 + are not sorted */
55489 + bstop = node40_ih_at(node, (unsigned)left);
55490 + order = keycmp(&bstop->key, key);
55491 + if (unlikely(order == GREATER_THAN)) {
55492 + if (unlikely(left != 0)) {
55493 + /* screw up */
55494 + warning("nikita-587", "Key less than %i key in a node",
55495 + left);
55496 + reiser4_print_key("key", key);
55497 + reiser4_print_key("min", &bstop->key);
55498 + print_coord_content("coord", coord);
55499 + return RETERR(-EIO);
55500 + } else {
55501 + coord->between = BEFORE_UNIT;
55502 + return NS_NOT_FOUND;
55503 + }
55504 + }
55505 + /* left <= key, ok */
55506 + iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
55507 +
55508 + if (unlikely(iplug == NULL)) {
55509 + warning("nikita-588", "Unknown plugin %i",
55510 + le16_to_cpu(get_unaligned(&bstop->plugin_id)));
55511 + reiser4_print_key("key", key);
55512 + print_coord_content("coord", coord);
55513 + return RETERR(-EIO);
55514 + }
55515 +
55516 + coord_set_iplug(coord, iplug);
55517 +
55518 + /* if exact key from item header was found by binary search, no
55519 + further checks are necessary. */
55520 + if (found) {
55521 + assert("nikita-1259", order == EQUAL_TO);
55522 + return NS_FOUND;
55523 + }
55524 + if (iplug->b.max_key_inside != NULL) {
55525 + reiser4_key max_item_key;
55526 +
55527 + /* key > max_item_key --- outside of an item */
55528 + if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
55529 + coord->unit_pos = 0;
55530 + coord->between = AFTER_ITEM;
55531 + /* FIXME-VS: key we are looking for does not fit into
55532 + found item. Return NS_NOT_FOUND then. Without that
55533 + the following case does not work: there is extent of
55534 + file 10000, 10001. File 10000, 10002 has been just
55535 + created. When writing to position 0 in that file -
55536 + traverse_tree will stop here on twig level. When we
55537 + want it to go down to leaf level
55538 + */
55539 + return NS_NOT_FOUND;
55540 + }
55541 + }
55542 +
55543 + if (iplug->b.lookup != NULL) {
55544 + return iplug->b.lookup(key, bias, coord);
55545 + } else {
55546 + assert("nikita-1260", order == LESS_THAN);
55547 + coord->between = AFTER_UNIT;
55548 + return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
55549 + }
55550 +}
55551 +
55552 +#undef NODE_ADDSTAT
55553 +#undef NODE_INCSTAT
55554 +
55555 +/* plugin->u.node.estimate
55556 + look for description of this method in plugin/node/node.h */
55557 +size_t estimate_node40(znode * node)
55558 +{
55559 + size_t result;
55560 +
55561 + assert("nikita-597", node != NULL);
55562 +
55563 + result = free_space_node40(node) - sizeof(item_header40);
55564 +
55565 + return (result > 0) ? result : 0;
55566 +}
55567 +
55568 +/* plugin->u.node.check
55569 + look for description of this method in plugin/node/node.h */
55570 +int check_node40(const znode * node /* node to check */ ,
55571 + __u32 flags /* check flags */ ,
55572 + const char **error /* where to store error message */ )
55573 +{
55574 + int nr_items;
55575 + int i;
55576 + reiser4_key prev;
55577 + unsigned old_offset;
55578 + tree_level level;
55579 + coord_t coord;
55580 + int result;
55581 +
55582 + assert("nikita-580", node != NULL);
55583 + assert("nikita-581", error != NULL);
55584 + assert("nikita-2948", znode_is_loaded(node));
55585 +
55586 + if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
55587 + return 0;
55588 +
55589 + assert("nikita-582", zdata(node) != NULL);
55590 +
55591 + nr_items = node40_num_of_items_internal(node);
55592 + if (nr_items < 0) {
55593 + *error = "Negative number of items";
55594 + return -1;
55595 + }
55596 +
55597 + if (flags & REISER4_NODE_DKEYS)
55598 + prev = *znode_get_ld_key((znode *) node);
55599 + else
55600 + prev = *reiser4_min_key();
55601 +
55602 + old_offset = 0;
55603 + coord_init_zero(&coord);
55604 + coord.node = (znode *) node;
55605 + coord.unit_pos = 0;
55606 + coord.between = AT_UNIT;
55607 + level = znode_get_level(node);
55608 + for (i = 0; i < nr_items; i++) {
55609 + item_header40 *ih;
55610 + reiser4_key unit_key;
55611 + unsigned j;
55612 +
55613 + ih = node40_ih_at(node, (unsigned)i);
55614 + coord_set_item_pos(&coord, i);
55615 + if ((ih40_get_offset(ih) >=
55616 + znode_size(node) - nr_items * sizeof(item_header40)) ||
55617 + (ih40_get_offset(ih) < sizeof(node40_header))) {
55618 + *error = "Offset is out of bounds";
55619 + return -1;
55620 + }
55621 + if (ih40_get_offset(ih) <= old_offset) {
55622 + *error = "Offsets are in wrong order";
55623 + return -1;
55624 + }
55625 + if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
55626 + *error = "Wrong offset of first item";
55627 + return -1;
55628 + }
55629 + old_offset = ih40_get_offset(ih);
55630 +
55631 + if (keygt(&prev, &ih->key)) {
55632 + *error = "Keys are in wrong order";
55633 + return -1;
55634 + }
55635 + if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
55636 + *error = "Wrong key of first unit";
55637 + return -1;
55638 + }
55639 + prev = ih->key;
55640 + for (j = 0; j < coord_num_units(&coord); ++j) {
55641 + coord.unit_pos = j;
55642 + unit_key_by_coord(&coord, &unit_key);
55643 + if (keygt(&prev, &unit_key)) {
55644 + *error = "Unit keys are in wrong order";
55645 + return -1;
55646 + }
55647 + prev = unit_key;
55648 + }
55649 + coord.unit_pos = 0;
55650 + if (level != TWIG_LEVEL && item_is_extent(&coord)) {
55651 + *error = "extent on the wrong level";
55652 + return -1;
55653 + }
55654 + if (level == LEAF_LEVEL && item_is_internal(&coord)) {
55655 + *error = "internal item on the wrong level";
55656 + return -1;
55657 + }
55658 + if (level != LEAF_LEVEL &&
55659 + !item_is_internal(&coord) && !item_is_extent(&coord)) {
55660 + *error = "wrong item on the internal level";
55661 + return -1;
55662 + }
55663 + if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
55664 + *error = "non-internal item on the internal level";
55665 + return -1;
55666 + }
55667 +#if REISER4_DEBUG
55668 + if (item_plugin_by_coord(&coord)->b.check
55669 + && item_plugin_by_coord(&coord)->b.check(&coord, error))
55670 + return -1;
55671 +#endif
55672 + if (i) {
55673 + coord_t prev_coord;
55674 + /* two neighboring items can not be mergeable */
55675 + coord_dup(&prev_coord, &coord);
55676 + coord_prev_item(&prev_coord);
55677 + if (are_items_mergeable(&prev_coord, &coord)) {
55678 + *error = "mergeable items in one node";
55679 + return -1;
55680 + }
55681 +
55682 + }
55683 + }
55684 +
55685 + if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
55686 + coord_t coord;
55687 + item_plugin *iplug;
55688 +
55689 + coord_init_last_unit(&coord, node);
55690 + iplug = item_plugin_by_coord(&coord);
55691 + if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
55692 + iplug->s.file.append_key != NULL) {
55693 + reiser4_key mkey;
55694 +
55695 + iplug->s.file.append_key(&coord, &mkey);
55696 + set_key_offset(&mkey, get_key_offset(&mkey) - 1);
55697 + read_lock_dk(current_tree);
55698 + result = keygt(&mkey, znode_get_rd_key((znode *) node));
55699 + read_unlock_dk(current_tree);
55700 + if (result) {
55701 + *error = "key of rightmost item is too large";
55702 + return -1;
55703 + }
55704 + }
55705 + }
55706 + if (flags & REISER4_NODE_DKEYS) {
55707 + read_lock_tree(current_tree);
55708 + read_lock_dk(current_tree);
55709 +
55710 + flags |= REISER4_NODE_TREE_STABLE;
55711 +
55712 + if (keygt(&prev, znode_get_rd_key((znode *) node))) {
55713 + if (flags & REISER4_NODE_TREE_STABLE) {
55714 + *error = "Last key is greater than rdkey";
55715 + read_unlock_dk(current_tree);
55716 + read_unlock_tree(current_tree);
55717 + return -1;
55718 + }
55719 + }
55720 + if (keygt
55721 + (znode_get_ld_key((znode *) node),
55722 + znode_get_rd_key((znode *) node))) {
55723 + *error = "ldkey is greater than rdkey";
55724 + read_unlock_dk(current_tree);
55725 + read_unlock_tree(current_tree);
55726 + return -1;
55727 + }
55728 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
55729 + (node->left != NULL) &&
55730 + !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
55731 + ergo(flags & REISER4_NODE_TREE_STABLE,
55732 + !keyeq(znode_get_rd_key(node->left),
55733 + znode_get_ld_key((znode *) node)))
55734 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
55735 + keygt(znode_get_rd_key(node->left),
55736 + znode_get_ld_key((znode *) node)))) {
55737 + *error = "left rdkey or ldkey is wrong";
55738 + read_unlock_dk(current_tree);
55739 + read_unlock_tree(current_tree);
55740 + return -1;
55741 + }
55742 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
55743 + (node->right != NULL) &&
55744 + !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
55745 + ergo(flags & REISER4_NODE_TREE_STABLE,
55746 + !keyeq(znode_get_rd_key((znode *) node),
55747 + znode_get_ld_key(node->right)))
55748 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
55749 + keygt(znode_get_rd_key((znode *) node),
55750 + znode_get_ld_key(node->right)))) {
55751 + *error = "rdkey or right ldkey is wrong";
55752 + read_unlock_dk(current_tree);
55753 + read_unlock_tree(current_tree);
55754 + return -1;
55755 + }
55756 +
55757 + read_unlock_dk(current_tree);
55758 + read_unlock_tree(current_tree);
55759 + }
55760 +
55761 + return 0;
55762 +}
55763 +
55764 +/* plugin->u.node.parse
55765 + look for description of this method in plugin/node/node.h */
55766 +int parse_node40(znode * node /* node to parse */ )
55767 +{
55768 + node40_header *header;
55769 + int result;
55770 + d8 level;
55771 +
55772 + header = node40_node_header((znode *) node);
55773 + result = -EIO;
55774 + level = nh40_get_level(header);
55775 + if (unlikely(((__u8) znode_get_level(node)) != level))
55776 + warning("nikita-494", "Wrong level found in node: %i != %i",
55777 + znode_get_level(node), level);
55778 + else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
55779 + warning("nikita-495",
55780 + "Wrong magic in tree node: want %x, got %x",
55781 + REISER4_NODE_MAGIC, nh40_get_magic(header));
55782 + else {
55783 + node->nr_items = node40_num_of_items_internal(node);
55784 + result = 0;
55785 + }
55786 + return RETERR(result);
55787 +}
55788 +
55789 +/* plugin->u.node.init
55790 + look for description of this method in plugin/node/node.h */
55791 +int init_node40(znode * node /* node to initialise */ )
55792 +{
55793 + node40_header *header;
55794 +
55795 + assert("nikita-570", node != NULL);
55796 + assert("nikita-572", zdata(node) != NULL);
55797 +
55798 + header = node40_node_header(node);
55799 + memset(header, 0, sizeof(node40_header));
55800 + nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
55801 + nh40_set_free_space_start(header, sizeof(node40_header));
55802 + /* sane hypothesis: 0 in CPU format is 0 in disk format */
55803 + /* items: 0 */
55804 + save_plugin_id(node_plugin_to_plugin(node->nplug),
55805 + &header->common_header.plugin_id);
55806 + nh40_set_level(header, znode_get_level(node));
55807 + nh40_set_magic(header, REISER4_NODE_MAGIC);
55808 + node->nr_items = 0;
55809 + nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
55810 +
55811 + /* flags: 0 */
55812 + return 0;
55813 +}
55814 +
55815 +#ifdef GUESS_EXISTS
55816 +int guess_node40(const znode * node /* node to guess plugin of */ )
55817 +{
55818 + node40_header *nethack;
55819 +
55820 + assert("nikita-1058", node != NULL);
55821 + nethack = node40_node_header(node);
55822 + return
55823 + (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
55824 + (plugin_by_disk_id(znode_get_tree(node),
55825 + REISER4_NODE_PLUGIN_TYPE,
55826 + &nethack->common_header.plugin_id)->h.id ==
55827 + NODE40_ID);
55828 +}
55829 +#endif
55830 +
55831 +/* plugin->u.node.chage_item_size
55832 + look for description of this method in plugin/node/node.h */
55833 +void change_item_size_node40(coord_t * coord, int by)
55834 +{
55835 + node40_header *nh;
55836 + item_header40 *ih;
55837 + char *item_data;
55838 + int item_length;
55839 + unsigned i;
55840 +
55841 + /* make sure that @item is coord of existing item */
55842 + assert("vs-210", coord_is_existing_item(coord));
55843 +
55844 + nh = node40_node_header(coord->node);
55845 +
55846 + item_data = item_by_coord_node40(coord);
55847 + item_length = length_by_coord_node40(coord);
55848 +
55849 + /* move item bodies */
55850 + ih = node40_ih_at_coord(coord);
55851 + memmove(item_data + item_length + by, item_data + item_length,
55852 + nh40_get_free_space_start(node40_node_header(coord->node)) -
55853 + (ih40_get_offset(ih) + item_length));
55854 +
55855 + /* update offsets of moved items */
55856 + for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
55857 + ih = node40_ih_at(coord->node, i);
55858 + ih40_set_offset(ih, ih40_get_offset(ih) + by);
55859 + }
55860 +
55861 + /* update node header */
55862 + nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
55863 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
55864 +}
55865 +
55866 +static int should_notify_parent(const znode * node)
55867 +{
55868 + /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
55869 + return !disk_addr_eq(znode_get_block(node),
55870 + &znode_get_tree(node)->root_block);
55871 +}
55872 +
55873 +/* plugin->u.node.create_item
55874 + look for description of this method in plugin/node/node.h */
55875 +int
55876 +create_item_node40(coord_t *target, const reiser4_key *key,
55877 + reiser4_item_data *data, carry_plugin_info *info)
55878 +{
55879 + node40_header *nh;
55880 + item_header40 *ih;
55881 + unsigned offset;
55882 + unsigned i;
55883 +
55884 + nh = node40_node_header(target->node);
55885 +
55886 + assert("vs-212", coord_is_between_items(target));
55887 + /* node must have enough free space */
55888 + assert("vs-254",
55889 + free_space_node40(target->node) >=
55890 + data->length + sizeof(item_header40));
55891 + assert("vs-1410", data->length >= 0);
55892 +
55893 + if (coord_set_to_right(target))
55894 + /* there are not items to the right of @target, so, new item
55895 + will be inserted after last one */
55896 + coord_set_item_pos(target, nh40_get_num_items(nh));
55897 +
55898 + if (target->item_pos < nh40_get_num_items(nh)) {
55899 + /* there are items to be moved to prepare space for new
55900 + item */
55901 + ih = node40_ih_at_coord(target);
55902 + /* new item will start at this offset */
55903 + offset = ih40_get_offset(ih);
55904 +
55905 + memmove(zdata(target->node) + offset + data->length,
55906 + zdata(target->node) + offset,
55907 + nh40_get_free_space_start(nh) - offset);
55908 + /* update headers of moved items */
55909 + for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
55910 + ih = node40_ih_at(target->node, i);
55911 + ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
55912 + }
55913 +
55914 + /* @ih is set to item header of the last item, move item headers */
55915 + memmove(ih - 1, ih,
55916 + sizeof(item_header40) * (nh40_get_num_items(nh) -
55917 + target->item_pos));
55918 + } else {
55919 + /* new item will start at this offset */
55920 + offset = nh40_get_free_space_start(nh);
55921 + }
55922 +
55923 + /* make item header for the new item */
55924 + ih = node40_ih_at_coord(target);
55925 + memcpy(&ih->key, key, sizeof(reiser4_key));
55926 + ih40_set_offset(ih, offset);
55927 + save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
55928 +
55929 + /* update node header */
55930 + nh40_set_free_space(nh,
55931 + nh40_get_free_space(nh) - data->length -
55932 + sizeof(item_header40));
55933 + nh40_set_free_space_start(nh,
55934 + nh40_get_free_space_start(nh) + data->length);
55935 + node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
55936 +
55937 + /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */
55938 + target->unit_pos = 0;
55939 + target->between = AT_UNIT;
55940 + coord_clear_iplug(target);
55941 +
55942 + /* initialize item */
55943 + if (data->iplug->b.init != NULL) {
55944 + data->iplug->b.init(target, NULL, data);
55945 + }
55946 + /* copy item body */
55947 + if (data->iplug->b.paste != NULL) {
55948 + data->iplug->b.paste(target, data, info);
55949 + } else if (data->data != NULL) {
55950 + if (data->user) {
55951 + /* AUDIT: Are we really should not check that pointer
55952 + from userspace was valid and data bytes were
55953 + available? How will we return -EFAULT of some kind
55954 + without this check? */
55955 + assert("nikita-3038", reiser4_schedulable());
55956 + /* copy data from user space */
55957 + __copy_from_user(zdata(target->node) + offset,
55958 + (const char __user *)data->data,
55959 + (unsigned)data->length);
55960 + } else
55961 + /* copy from kernel space */
55962 + memcpy(zdata(target->node) + offset, data->data,
55963 + (unsigned)data->length);
55964 + }
55965 +
55966 + if (target->item_pos == 0) {
55967 + /* left delimiting key has to be updated */
55968 + prepare_for_update(NULL, target->node, info);
55969 + }
55970 +
55971 + if (item_plugin_by_coord(target)->b.create_hook != NULL) {
55972 + item_plugin_by_coord(target)->b.create_hook(target, data->arg);
55973 + }
55974 +
55975 + return 0;
55976 +}
55977 +
55978 +/* plugin->u.node.update_item_key
55979 + look for description of this method in plugin/node/node.h */
55980 +void
55981 +update_item_key_node40(coord_t * target, const reiser4_key * key,
55982 + carry_plugin_info * info)
55983 +{
55984 + item_header40 *ih;
55985 +
55986 + ih = node40_ih_at_coord(target);
55987 + memcpy(&ih->key, key, sizeof(reiser4_key));
55988 +
55989 + if (target->item_pos == 0) {
55990 + prepare_for_update(NULL, target->node, info);
55991 + }
55992 +}
55993 +
55994 +/* this bits encode cut mode */
55995 +#define CMODE_TAIL 1
55996 +#define CMODE_WHOLE 2
55997 +#define CMODE_HEAD 4
55998 +
55999 +struct cut40_info {
56000 + int mode;
56001 + pos_in_node_t tail_removed; /* position of item which gets tail removed */
56002 + pos_in_node_t first_removed; /* position of first the leftmost item among items removed completely */
56003 + pos_in_node_t removed_count; /* number of items removed completely */
56004 + pos_in_node_t head_removed; /* position of item which gets head removed */
56005 +
56006 + pos_in_node_t freed_space_start;
56007 + pos_in_node_t freed_space_end;
56008 + pos_in_node_t first_moved;
56009 + pos_in_node_t head_removed_location;
56010 +};
56011 +
56012 +static void init_cinfo(struct cut40_info *cinfo)
56013 +{
56014 + cinfo->mode = 0;
56015 + cinfo->tail_removed = MAX_POS_IN_NODE;
56016 + cinfo->first_removed = MAX_POS_IN_NODE;
56017 + cinfo->removed_count = MAX_POS_IN_NODE;
56018 + cinfo->head_removed = MAX_POS_IN_NODE;
56019 + cinfo->freed_space_start = MAX_POS_IN_NODE;
56020 + cinfo->freed_space_end = MAX_POS_IN_NODE;
56021 + cinfo->first_moved = MAX_POS_IN_NODE;
56022 + cinfo->head_removed_location = MAX_POS_IN_NODE;
56023 +}
56024 +
56025 +/* complete cut_node40/kill_node40 content by removing the gap created by */
56026 +static void compact(znode * node, struct cut40_info *cinfo)
56027 +{
56028 + node40_header *nh;
56029 + item_header40 *ih;
56030 + pos_in_node_t freed;
56031 + pos_in_node_t pos, nr_items;
56032 +
56033 + assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
56034 + cinfo->freed_space_end != MAX_POS_IN_NODE &&
56035 + cinfo->first_moved != MAX_POS_IN_NODE));
56036 + assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
56037 +
56038 + nh = node40_node_header(node);
56039 + nr_items = nh40_get_num_items(nh);
56040 +
56041 + /* remove gap made up by removal */
56042 + memmove(zdata(node) + cinfo->freed_space_start,
56043 + zdata(node) + cinfo->freed_space_end,
56044 + nh40_get_free_space_start(nh) - cinfo->freed_space_end);
56045 +
56046 + /* update item headers of moved items - change their locations */
56047 + pos = cinfo->first_moved;
56048 + ih = node40_ih_at(node, pos);
56049 + if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
56050 + assert("vs-1580", pos == cinfo->head_removed);
56051 + ih40_set_offset(ih, cinfo->head_removed_location);
56052 + pos++;
56053 + ih--;
56054 + }
56055 +
56056 + freed = cinfo->freed_space_end - cinfo->freed_space_start;
56057 + for (; pos < nr_items; pos++, ih--) {
56058 + assert("vs-1581", ih == node40_ih_at(node, pos));
56059 + ih40_set_offset(ih, ih40_get_offset(ih) - freed);
56060 + }
56061 +
56062 + /* free space start moved to right */
56063 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
56064 +
56065 + if (cinfo->removed_count != MAX_POS_IN_NODE) {
56066 + /* number of items changed. Remove item headers of those items */
56067 + ih = node40_ih_at(node, nr_items - 1);
56068 + memmove(ih + cinfo->removed_count, ih,
56069 + sizeof(item_header40) * (nr_items -
56070 + cinfo->removed_count -
56071 + cinfo->first_removed));
56072 + freed += sizeof(item_header40) * cinfo->removed_count;
56073 + node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
56074 + }
56075 +
56076 + /* total amount of free space increased */
56077 + nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
56078 +}
56079 +
56080 +int shrink_item_node40(coord_t * coord, int delta)
56081 +{
56082 + node40_header *nh;
56083 + item_header40 *ih;
56084 + pos_in_node_t pos;
56085 + pos_in_node_t nr_items;
56086 + char *end;
56087 + znode *node;
56088 + int off;
56089 +
56090 + assert("nikita-3487", coord != NULL);
56091 + assert("nikita-3488", delta >= 0);
56092 +
56093 + node = coord->node;
56094 + nh = node40_node_header(node);
56095 + nr_items = nh40_get_num_items(nh);
56096 +
56097 + ih = node40_ih_at_coord(coord);
56098 + assert("nikita-3489", delta <= length_by_coord_node40(coord));
56099 + off = ih40_get_offset(ih) + length_by_coord_node40(coord);
56100 + end = zdata(node) + off;
56101 +
56102 + /* remove gap made up by removal */
56103 + memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
56104 +
56105 + /* update item headers of moved items - change their locations */
56106 + pos = coord->item_pos + 1;
56107 + ih = node40_ih_at(node, pos);
56108 + for (; pos < nr_items; pos++, ih--) {
56109 + assert("nikita-3490", ih == node40_ih_at(node, pos));
56110 + ih40_set_offset(ih, ih40_get_offset(ih) - delta);
56111 + }
56112 +
56113 + /* free space start moved to left */
56114 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
56115 + /* total amount of free space increased */
56116 + nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
56117 + /*
56118 + * This method does _not_ changes number of items. Hence, it cannot
56119 + * make node empty. Also it doesn't remove items at all, which means
56120 + * that no keys have to be updated either.
56121 + */
56122 + return 0;
56123 +}
56124 +
56125 +/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types
56126 + of cut. First is when a unit is removed from the middle of an item. In this case this function returns 1. All the
56127 + rest fits into second case: 0 or 1 of items getting tail cut, 0 or more items removed completely and 0 or 1 item
56128 + getting head cut. Function returns 0 in this case */
56129 +static int
56130 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
56131 +{
56132 + reiser4_key left_key, right_key;
56133 + reiser4_key min_from_key, max_to_key;
56134 + const reiser4_key *from_key, *to_key;
56135 +
56136 + init_cinfo(cinfo);
56137 +
56138 + /* calculate minimal key stored in first item of items to be cut (params->from) */
56139 + item_key_by_coord(params->from, &min_from_key);
56140 + /* and max key stored in last item of items to be cut (params->to) */
56141 + max_item_key_by_coord(params->to, &max_to_key);
56142 +
56143 + /* if cut key range is not defined in input parameters - define it using cut coord range */
56144 + if (params->from_key == NULL) {
56145 + assert("vs-1513", params->to_key == NULL);
56146 + unit_key_by_coord(params->from, &left_key);
56147 + from_key = &left_key;
56148 + max_unit_key_by_coord(params->to, &right_key);
56149 + to_key = &right_key;
56150 + } else {
56151 + from_key = params->from_key;
56152 + to_key = params->to_key;
56153 + }
56154 +
56155 + if (params->from->item_pos == params->to->item_pos) {
56156 + if (keylt(&min_from_key, from_key)
56157 + && keylt(to_key, &max_to_key))
56158 + return 1;
56159 +
56160 + if (keygt(from_key, &min_from_key)) {
56161 + /* tail of item is to be cut cut */
56162 + cinfo->tail_removed = params->from->item_pos;
56163 + cinfo->mode |= CMODE_TAIL;
56164 + } else if (keylt(to_key, &max_to_key)) {
56165 + /* head of item is to be cut */
56166 + cinfo->head_removed = params->from->item_pos;
56167 + cinfo->mode |= CMODE_HEAD;
56168 + } else {
56169 + /* item is removed completely */
56170 + cinfo->first_removed = params->from->item_pos;
56171 + cinfo->removed_count = 1;
56172 + cinfo->mode |= CMODE_WHOLE;
56173 + }
56174 + } else {
56175 + cinfo->first_removed = params->from->item_pos + 1;
56176 + cinfo->removed_count =
56177 + params->to->item_pos - params->from->item_pos - 1;
56178 +
56179 + if (keygt(from_key, &min_from_key)) {
56180 + /* first item is not cut completely */
56181 + cinfo->tail_removed = params->from->item_pos;
56182 + cinfo->mode |= CMODE_TAIL;
56183 + } else {
56184 + cinfo->first_removed--;
56185 + cinfo->removed_count++;
56186 + }
56187 + if (keylt(to_key, &max_to_key)) {
56188 + /* last item is not cut completely */
56189 + cinfo->head_removed = params->to->item_pos;
56190 + cinfo->mode |= CMODE_HEAD;
56191 + } else {
56192 + cinfo->removed_count++;
56193 + }
56194 + if (cinfo->removed_count)
56195 + cinfo->mode |= CMODE_WHOLE;
56196 + }
56197 +
56198 + return 0;
56199 +}
56200 +
56201 +static void
56202 +call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
56203 + carry_kill_data * kdata)
56204 +{
56205 + coord_t coord;
56206 + item_plugin *iplug;
56207 + pos_in_node_t pos;
56208 +
56209 + coord.node = node;
56210 + coord.unit_pos = 0;
56211 + coord.between = AT_UNIT;
56212 + for (pos = 0; pos < count; pos++) {
56213 + coord_set_item_pos(&coord, from + pos);
56214 + coord.unit_pos = 0;
56215 + coord.between = AT_UNIT;
56216 + iplug = item_plugin_by_coord(&coord);
56217 + if (iplug->b.kill_hook) {
56218 + iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
56219 + kdata);
56220 + }
56221 + }
56222 +}
56223 +
56224 +/* this is used to kill item partially */
56225 +static pos_in_node_t
56226 +kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56227 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
56228 +{
56229 + struct carry_kill_data *kdata;
56230 + item_plugin *iplug;
56231 +
56232 + kdata = data;
56233 + iplug = item_plugin_by_coord(coord);
56234 +
56235 + assert("vs-1524", iplug->b.kill_units);
56236 + return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
56237 + new_first_key);
56238 +}
56239 +
56240 +/* call item plugin to cut tail of file */
56241 +static pos_in_node_t
56242 +kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56243 +{
56244 + struct carry_kill_data *kdata;
56245 + pos_in_node_t to;
56246 +
56247 + kdata = data;
56248 + to = coord_last_unit_pos(coord);
56249 + return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
56250 + NULL);
56251 +}
56252 +
56253 +/* call item plugin to cut head of item */
56254 +static pos_in_node_t
56255 +kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56256 + reiser4_key * new_first_key)
56257 +{
56258 + return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
56259 + new_first_key);
56260 +}
56261 +
56262 +/* this is used to cut item partially */
56263 +static pos_in_node_t
56264 +cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56265 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
56266 +{
56267 + carry_cut_data *cdata;
56268 + item_plugin *iplug;
56269 +
56270 + cdata = data;
56271 + iplug = item_plugin_by_coord(coord);
56272 + assert("vs-302", iplug->b.cut_units);
56273 + return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
56274 + new_first_key);
56275 +}
56276 +
56277 +/* call item plugin to cut tail of file */
56278 +static pos_in_node_t
56279 +cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56280 +{
56281 + carry_cut_data *cdata;
56282 + pos_in_node_t to;
56283 +
56284 + cdata = data;
56285 + to = coord_last_unit_pos(cdata->params.from);
56286 + return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
56287 +}
56288 +
56289 +/* call item plugin to cut head of item */
56290 +static pos_in_node_t
56291 +cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56292 + reiser4_key * new_first_key)
56293 +{
56294 + return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
56295 + new_first_key);
56296 +}
56297 +
56298 +/* this returns 1 of key of first item changed, 0 - if it did not */
56299 +static int
56300 +prepare_for_compact(struct cut40_info *cinfo,
56301 + const struct cut_kill_params *params, int is_cut,
56302 + void *data, carry_plugin_info * info)
56303 +{
56304 + znode *node;
56305 + item_header40 *ih;
56306 + pos_in_node_t freed;
56307 + pos_in_node_t item_pos;
56308 + coord_t coord;
56309 + reiser4_key new_first_key;
56310 + pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
56311 + void *, reiser4_key *, reiser4_key *);
56312 + pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
56313 + pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
56314 + reiser4_key *);
56315 + int retval;
56316 +
56317 + retval = 0;
56318 +
56319 + node = params->from->node;
56320 +
56321 + assert("vs-184", node == params->to->node);
56322 + assert("vs-312", !node_is_empty(node));
56323 + assert("vs-297",
56324 + coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
56325 +
56326 + if (is_cut) {
56327 + kill_units_f = cut_units;
56328 + kill_tail_f = cut_tail;
56329 + kill_head_f = cut_head;
56330 + } else {
56331 + kill_units_f = kill_units;
56332 + kill_tail_f = kill_tail;
56333 + kill_head_f = kill_head;
56334 + }
56335 +
56336 + if (parse_cut(cinfo, params) == 1) {
56337 + /* cut from the middle of item */
56338 + freed =
56339 + kill_units_f(params->from, params->from->unit_pos,
56340 + params->to->unit_pos, data,
56341 + params->smallest_removed, NULL);
56342 +
56343 + item_pos = params->from->item_pos;
56344 + ih = node40_ih_at(node, item_pos);
56345 + cinfo->freed_space_start =
56346 + ih40_get_offset(ih) + node40_item_length(node,
56347 + item_pos) - freed;
56348 + cinfo->freed_space_end = cinfo->freed_space_start + freed;
56349 + cinfo->first_moved = item_pos + 1;
56350 + } else {
56351 + assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
56352 + cinfo->first_removed != MAX_POS_IN_NODE ||
56353 + cinfo->head_removed != MAX_POS_IN_NODE));
56354 +
56355 + switch (cinfo->mode) {
56356 + case CMODE_TAIL:
56357 + /* one item gets cut partially from its end */
56358 + assert("vs-1562",
56359 + cinfo->tail_removed == params->from->item_pos);
56360 +
56361 + freed =
56362 + kill_tail_f(params->from, data,
56363 + params->smallest_removed);
56364 +
56365 + item_pos = cinfo->tail_removed;
56366 + ih = node40_ih_at(node, item_pos);
56367 + cinfo->freed_space_start =
56368 + ih40_get_offset(ih) + node40_item_length(node,
56369 + item_pos) -
56370 + freed;
56371 + cinfo->freed_space_end =
56372 + cinfo->freed_space_start + freed;
56373 + cinfo->first_moved = cinfo->tail_removed + 1;
56374 + break;
56375 +
56376 + case CMODE_WHOLE:
56377 + /* one or more items get removed completely */
56378 + assert("vs-1563",
56379 + cinfo->first_removed == params->from->item_pos);
56380 + assert("vs-1564", cinfo->removed_count > 0
56381 + && cinfo->removed_count != MAX_POS_IN_NODE);
56382 +
56383 + /* call kill hook for all items removed completely */
56384 + if (is_cut == 0)
56385 + call_kill_hooks(node, cinfo->first_removed,
56386 + cinfo->removed_count, data);
56387 +
56388 + item_pos = cinfo->first_removed;
56389 + ih = node40_ih_at(node, item_pos);
56390 +
56391 + if (params->smallest_removed)
56392 + memcpy(params->smallest_removed, &ih->key,
56393 + sizeof(reiser4_key));
56394 +
56395 + cinfo->freed_space_start = ih40_get_offset(ih);
56396 +
56397 + item_pos += (cinfo->removed_count - 1);
56398 + ih -= (cinfo->removed_count - 1);
56399 + cinfo->freed_space_end =
56400 + ih40_get_offset(ih) + node40_item_length(node,
56401 + item_pos);
56402 + cinfo->first_moved = item_pos + 1;
56403 + if (cinfo->first_removed == 0)
56404 + /* key of first item of the node changes */
56405 + retval = 1;
56406 + break;
56407 +
56408 + case CMODE_HEAD:
56409 + /* one item gets cut partially from its head */
56410 + assert("vs-1565",
56411 + cinfo->head_removed == params->from->item_pos);
56412 +
56413 + freed =
56414 + kill_head_f(params->to, data,
56415 + params->smallest_removed,
56416 + &new_first_key);
56417 +
56418 + item_pos = cinfo->head_removed;
56419 + ih = node40_ih_at(node, item_pos);
56420 + cinfo->freed_space_start = ih40_get_offset(ih);
56421 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56422 + cinfo->first_moved = cinfo->head_removed + 1;
56423 +
56424 + /* item head is removed, therefore, item key changed */
56425 + coord.node = node;
56426 + coord_set_item_pos(&coord, item_pos);
56427 + coord.unit_pos = 0;
56428 + coord.between = AT_UNIT;
56429 + update_item_key_node40(&coord, &new_first_key, NULL);
56430 + if (item_pos == 0)
56431 + /* key of first item of the node changes */
56432 + retval = 1;
56433 + break;
56434 +
56435 + case CMODE_TAIL | CMODE_WHOLE:
56436 + /* one item gets cut from its end and one or more items get removed completely */
56437 + assert("vs-1566",
56438 + cinfo->tail_removed == params->from->item_pos);
56439 + assert("vs-1567",
56440 + cinfo->first_removed == cinfo->tail_removed + 1);
56441 + assert("vs-1564", cinfo->removed_count > 0
56442 + && cinfo->removed_count != MAX_POS_IN_NODE);
56443 +
56444 + freed =
56445 + kill_tail_f(params->from, data,
56446 + params->smallest_removed);
56447 +
56448 + item_pos = cinfo->tail_removed;
56449 + ih = node40_ih_at(node, item_pos);
56450 + cinfo->freed_space_start =
56451 + ih40_get_offset(ih) + node40_item_length(node,
56452 + item_pos) -
56453 + freed;
56454 +
56455 + /* call kill hook for all items removed completely */
56456 + if (is_cut == 0)
56457 + call_kill_hooks(node, cinfo->first_removed,
56458 + cinfo->removed_count, data);
56459 +
56460 + item_pos += cinfo->removed_count;
56461 + ih -= cinfo->removed_count;
56462 + cinfo->freed_space_end =
56463 + ih40_get_offset(ih) + node40_item_length(node,
56464 + item_pos);
56465 + cinfo->first_moved = item_pos + 1;
56466 + break;
56467 +
56468 + case CMODE_WHOLE | CMODE_HEAD:
56469 + /* one or more items get removed completely and one item gets cut partially from its head */
56470 + assert("vs-1568",
56471 + cinfo->first_removed == params->from->item_pos);
56472 + assert("vs-1564", cinfo->removed_count > 0
56473 + && cinfo->removed_count != MAX_POS_IN_NODE);
56474 + assert("vs-1569",
56475 + cinfo->head_removed ==
56476 + cinfo->first_removed + cinfo->removed_count);
56477 +
56478 + /* call kill hook for all items removed completely */
56479 + if (is_cut == 0)
56480 + call_kill_hooks(node, cinfo->first_removed,
56481 + cinfo->removed_count, data);
56482 +
56483 + item_pos = cinfo->first_removed;
56484 + ih = node40_ih_at(node, item_pos);
56485 +
56486 + if (params->smallest_removed)
56487 + memcpy(params->smallest_removed, &ih->key,
56488 + sizeof(reiser4_key));
56489 +
56490 + freed =
56491 + kill_head_f(params->to, data, NULL, &new_first_key);
56492 +
56493 + cinfo->freed_space_start = ih40_get_offset(ih);
56494 +
56495 + ih = node40_ih_at(node, cinfo->head_removed);
56496 + /* this is the most complex case. Item which got head removed and items which are to be moved
56497 + intact change their location differently. */
56498 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56499 + cinfo->first_moved = cinfo->head_removed;
56500 + cinfo->head_removed_location = cinfo->freed_space_start;
56501 +
56502 + /* item head is removed, therefore, item key changed */
56503 + coord.node = node;
56504 + coord_set_item_pos(&coord, cinfo->head_removed);
56505 + coord.unit_pos = 0;
56506 + coord.between = AT_UNIT;
56507 + update_item_key_node40(&coord, &new_first_key, NULL);
56508 +
56509 + assert("vs-1579", cinfo->first_removed == 0);
56510 + /* key of first item of the node changes */
56511 + retval = 1;
56512 + break;
56513 +
56514 + case CMODE_TAIL | CMODE_HEAD:
56515 + /* one item get cut from its end and its neighbor gets cut from its tail */
56516 + impossible("vs-1576", "this can not happen currently");
56517 + break;
56518 +
56519 + case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
56520 + impossible("vs-1577", "this can not happen currently");
56521 + break;
56522 + default:
56523 + impossible("vs-1578", "unexpected cut mode");
56524 + break;
56525 + }
56526 + }
56527 + return retval;
56528 +}
56529 +
56530 +/* plugin->u.node.kill
56531 + return value is number of items removed completely */
56532 +int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
56533 +{
56534 + znode *node;
56535 + struct cut40_info cinfo;
56536 + int first_key_changed;
56537 +
56538 + node = kdata->params.from->node;
56539 +
56540 + first_key_changed =
56541 + prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
56542 + info);
56543 + compact(node, &cinfo);
56544 +
56545 + if (info) {
56546 + /* it is not called by node40_shift, so we have to take care
56547 + of changes on upper levels */
56548 + if (node_is_empty(node)
56549 + && !(kdata->flags & DELETE_RETAIN_EMPTY))
56550 + /* all contents of node is deleted */
56551 + prepare_removal_node40(node, info);
56552 + else if (first_key_changed) {
56553 + prepare_for_update(NULL, node, info);
56554 + }
56555 + }
56556 +
56557 + coord_clear_iplug(kdata->params.from);
56558 + coord_clear_iplug(kdata->params.to);
56559 +
56560 + znode_make_dirty(node);
56561 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
56562 +}
56563 +
56564 +/* plugin->u.node.cut
56565 + return value is number of items removed completely */
56566 +int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
56567 +{
56568 + znode *node;
56569 + struct cut40_info cinfo;
56570 + int first_key_changed;
56571 +
56572 + node = cdata->params.from->node;
56573 +
56574 + first_key_changed =
56575 + prepare_for_compact(&cinfo, &cdata->params, 1 /* not cut */ , cdata,
56576 + info);
56577 + compact(node, &cinfo);
56578 +
56579 + if (info) {
56580 + /* it is not called by node40_shift, so we have to take care
56581 + of changes on upper levels */
56582 + if (node_is_empty(node))
56583 + /* all contents of node is deleted */
56584 + prepare_removal_node40(node, info);
56585 + else if (first_key_changed) {
56586 + prepare_for_update(NULL, node, info);
56587 + }
56588 + }
56589 +
56590 + coord_clear_iplug(cdata->params.from);
56591 + coord_clear_iplug(cdata->params.to);
56592 +
56593 + znode_make_dirty(node);
56594 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
56595 +}
56596 +
56597 +/* this structure is used by shift method of node40 plugin */
56598 +struct shift_params {
56599 + shift_direction pend; /* when @pend == append - we are shifting to
56600 + left, when @pend == prepend - to right */
56601 + coord_t wish_stop; /* when shifting to left this is last unit we
56602 + want shifted, when shifting to right - this
56603 + is set to unit we want to start shifting
56604 + from */
56605 + znode *target;
56606 + int everything; /* it is set to 1 if everything we have to shift is
56607 + shifted, 0 - otherwise */
56608 +
56609 + /* FIXME-VS: get rid of read_stop */
56610 +
56611 + /* these are set by estimate_shift */
56612 + coord_t real_stop; /* this will be set to last unit which will be
56613 + really shifted */
56614 +
56615 + /* coordinate in source node before operation of unit which becomes
56616 + first after shift to left of last after shift to right */
56617 + union {
56618 + coord_t future_first;
56619 + coord_t future_last;
56620 + } u;
56621 +
56622 + unsigned merging_units; /* number of units of first item which have to
56623 + be merged with last item of target node */
56624 + unsigned merging_bytes; /* number of bytes in those units */
56625 +
56626 + unsigned entire; /* items shifted in their entirety */
56627 + unsigned entire_bytes; /* number of bytes in those items */
56628 +
56629 + unsigned part_units; /* number of units of partially copied item */
56630 + unsigned part_bytes; /* number of bytes in those units */
56631 +
56632 + unsigned shift_bytes; /* total number of bytes in items shifted (item
56633 + headers not included) */
56634 +
56635 +};
56636 +
56637 +static int item_creation_overhead(coord_t *item)
56638 +{
56639 + return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
56640 +}
56641 +
56642 +/* how many units are there in @source starting from source->unit_pos
56643 + but not further than @stop_coord */
56644 +static int
56645 +wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
56646 +{
56647 + if (pend == SHIFT_LEFT) {
56648 + assert("vs-181", source->unit_pos == 0);
56649 + } else {
56650 + assert("vs-182",
56651 + source->unit_pos == coord_last_unit_pos(source));
56652 + }
56653 +
56654 + if (source->item_pos != stop_coord->item_pos) {
56655 + /* @source and @stop_coord are different items */
56656 + return coord_last_unit_pos(source) + 1;
56657 + }
56658 +
56659 + if (pend == SHIFT_LEFT) {
56660 + return stop_coord->unit_pos + 1;
56661 + } else {
56662 + return source->unit_pos - stop_coord->unit_pos + 1;
56663 + }
56664 +}
56665 +
56666 +/* this calculates what can be copied from @shift->wish_stop.node to
56667 + @shift->target */
56668 +static void
56669 +estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
56670 +{
56671 + unsigned target_free_space, size;
56672 + pos_in_node_t stop_item; /* item which estimating should not consider */
56673 + unsigned want; /* number of units of item we want shifted */
56674 + coord_t source; /* item being estimated */
56675 + item_plugin *iplug;
56676 +
56677 + /* shifting to left/right starts from first/last units of
56678 + @shift->wish_stop.node */
56679 + if (shift->pend == SHIFT_LEFT) {
56680 + coord_init_first_unit(&source, shift->wish_stop.node);
56681 + } else {
56682 + coord_init_last_unit(&source, shift->wish_stop.node);
56683 + }
56684 + shift->real_stop = source;
56685 +
56686 + /* free space in target node and number of items in source */
56687 + target_free_space = znode_free_space(shift->target);
56688 +
56689 + shift->everything = 0;
56690 + if (!node_is_empty(shift->target)) {
56691 + /* target node is not empty, check for boundary items
56692 + mergeability */
56693 + coord_t to;
56694 +
56695 + /* item we try to merge @source with */
56696 + if (shift->pend == SHIFT_LEFT) {
56697 + coord_init_last_unit(&to, shift->target);
56698 + } else {
56699 + coord_init_first_unit(&to, shift->target);
56700 + }
56701 +
56702 + if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
56703 + &source) :
56704 + are_items_mergeable(&source, &to)) {
56705 + /* how many units of @source do we want to merge to
56706 + item @to */
56707 + want =
56708 + wanted_units(&source, &shift->wish_stop,
56709 + shift->pend);
56710 +
56711 + /* how many units of @source we can merge to item
56712 + @to */
56713 + iplug = item_plugin_by_coord(&source);
56714 + if (iplug->b.can_shift != NULL)
56715 + shift->merging_units =
56716 + iplug->b.can_shift(target_free_space,
56717 + &source, shift->target,
56718 + shift->pend, &size,
56719 + want);
56720 + else {
56721 + shift->merging_units = 0;
56722 + size = 0;
56723 + }
56724 + shift->merging_bytes = size;
56725 + shift->shift_bytes += size;
56726 + /* update stop coord to be set to last unit of @source
56727 + we can merge to @target */
56728 + if (shift->merging_units)
56729 + /* at least one unit can be shifted */
56730 + shift->real_stop.unit_pos =
56731 + (shift->merging_units - source.unit_pos -
56732 + 1) * shift->pend;
56733 + else {
56734 + /* nothing can be shifted */
56735 + if (shift->pend == SHIFT_LEFT)
56736 + coord_init_before_first_item(&shift->
56737 + real_stop,
56738 + source.
56739 + node);
56740 + else
56741 + coord_init_after_last_item(&shift->
56742 + real_stop,
56743 + source.node);
56744 + }
56745 + assert("nikita-2081", shift->real_stop.unit_pos + 1);
56746 +
56747 + if (shift->merging_units != want) {
56748 + /* we could not copy as many as we want, so,
56749 + there is no reason for estimating any
56750 + longer */
56751 + return;
56752 + }
56753 +
56754 + target_free_space -= size;
56755 + coord_add_item_pos(&source, shift->pend);
56756 + }
56757 + }
56758 +
56759 + /* number of item nothing of which we want to shift */
56760 + stop_item = shift->wish_stop.item_pos + shift->pend;
56761 +
56762 + /* calculate how many items can be copied into given free
56763 + space as whole */
56764 + for (; source.item_pos != stop_item;
56765 + coord_add_item_pos(&source, shift->pend)) {
56766 + if (shift->pend == SHIFT_RIGHT)
56767 + source.unit_pos = coord_last_unit_pos(&source);
56768 +
56769 + /* how many units of @source do we want to copy */
56770 + want = wanted_units(&source, &shift->wish_stop, shift->pend);
56771 +
56772 + if (want == coord_last_unit_pos(&source) + 1) {
56773 + /* we want this item to be copied entirely */
56774 + size =
56775 + item_length_by_coord(&source) +
56776 + item_creation_overhead(&source);
56777 + if (size <= target_free_space) {
56778 + /* item fits into target node as whole */
56779 + target_free_space -= size;
56780 + shift->shift_bytes +=
56781 + size - item_creation_overhead(&source);
56782 + shift->entire_bytes +=
56783 + size - item_creation_overhead(&source);
56784 + shift->entire++;
56785 +
56786 + /* update shift->real_stop coord to be set to
56787 + last unit of @source we can merge to
56788 + @target */
56789 + shift->real_stop = source;
56790 + if (shift->pend == SHIFT_LEFT)
56791 + shift->real_stop.unit_pos =
56792 + coord_last_unit_pos(&shift->
56793 + real_stop);
56794 + else
56795 + shift->real_stop.unit_pos = 0;
56796 + continue;
56797 + }
56798 + }
56799 +
56800 + /* we reach here only for an item which does not fit into
56801 + target node in its entirety. This item may be either
56802 + partially shifted, or not shifted at all. We will have to
56803 + create new item in target node, so decrease amout of free
56804 + space by an item creation overhead. We can reach here also
56805 + if stop coord is in this item */
56806 + if (target_free_space >=
56807 + (unsigned)item_creation_overhead(&source)) {
56808 + target_free_space -= item_creation_overhead(&source);
56809 + iplug = item_plugin_by_coord(&source);
56810 + if (iplug->b.can_shift) {
56811 + shift->part_units = iplug->b.can_shift(target_free_space,
56812 + &source,
56813 + NULL, /* target */
56814 + shift->pend,
56815 + &size,
56816 + want);
56817 + } else {
56818 + target_free_space = 0;
56819 + shift->part_units = 0;
56820 + size = 0;
56821 + }
56822 + } else {
56823 + target_free_space = 0;
56824 + shift->part_units = 0;
56825 + size = 0;
56826 + }
56827 + shift->part_bytes = size;
56828 + shift->shift_bytes += size;
56829 +
56830 + /* set @shift->real_stop to last unit of @source we can merge
56831 + to @shift->target */
56832 + if (shift->part_units) {
56833 + shift->real_stop = source;
56834 + shift->real_stop.unit_pos =
56835 + (shift->part_units - source.unit_pos -
56836 + 1) * shift->pend;
56837 + assert("nikita-2082", shift->real_stop.unit_pos + 1);
56838 + }
56839 +
56840 + if (want != shift->part_units)
56841 + /* not everything wanted were shifted */
56842 + return;
56843 + break;
56844 + }
56845 +
56846 + shift->everything = 1;
56847 +}
56848 +
56849 +static void
56850 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
56851 + shift_direction dir, unsigned free_space)
56852 +{
56853 + item_plugin *iplug;
56854 +
56855 + assert("nikita-1463", target != NULL);
56856 + assert("nikita-1464", source != NULL);
56857 + assert("nikita-1465", from + count <= coord_num_units(source));
56858 +
56859 + iplug = item_plugin_by_coord(source);
56860 + assert("nikita-1468", iplug == item_plugin_by_coord(target));
56861 + iplug->b.copy_units(target, source, from, count, dir, free_space);
56862 +
56863 + if (dir == SHIFT_RIGHT) {
56864 + /* FIXME-VS: this looks not necessary. update_item_key was
56865 + called already by copy_units method */
56866 + reiser4_key split_key;
56867 +
56868 + assert("nikita-1469", target->unit_pos == 0);
56869 +
56870 + unit_key_by_coord(target, &split_key);
56871 + node_plugin_by_coord(target)->update_item_key(target,
56872 + &split_key, NULL);
56873 + }
56874 +}
56875 +
56876 +/* copy part of @shift->real_stop.node starting either from its beginning or
56877 + from its end and ending at @shift->real_stop to either the end or the
56878 + beginning of @shift->target */
56879 +static void copy(struct shift_params *shift)
56880 +{
56881 + node40_header *nh;
56882 + coord_t from;
56883 + coord_t to;
56884 + item_header40 *from_ih, *to_ih;
56885 + int free_space_start;
56886 + int new_items;
56887 + unsigned old_items;
56888 + int old_offset;
56889 + unsigned i;
56890 +
56891 + nh = node40_node_header(shift->target);
56892 + free_space_start = nh40_get_free_space_start(nh);
56893 + old_items = nh40_get_num_items(nh);
56894 + new_items = shift->entire + (shift->part_units ? 1 : 0);
56895 + assert("vs-185",
56896 + shift->shift_bytes ==
56897 + shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
56898 +
56899 + from = shift->wish_stop;
56900 +
56901 + coord_init_first_unit(&to, shift->target);
56902 +
56903 + /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
56904 + hence to.between is set to EMPTY_NODE above. Looks like we want it
56905 + to be AT_UNIT.
56906 +
56907 + Oh, wonders of ->betweeness...
56908 +
56909 + */
56910 + to.between = AT_UNIT;
56911 +
56912 + if (shift->pend == SHIFT_LEFT) {
56913 + /* copying to left */
56914 +
56915 + coord_set_item_pos(&from, 0);
56916 + from_ih = node40_ih_at(from.node, 0);
56917 +
56918 + coord_set_item_pos(&to,
56919 + node40_num_of_items_internal(to.node) - 1);
56920 + if (shift->merging_units) {
56921 + /* expand last item, so that plugin methods will see
56922 + correct data */
56923 + free_space_start += shift->merging_bytes;
56924 + nh40_set_free_space_start(nh,
56925 + (unsigned)free_space_start);
56926 + nh40_set_free_space(nh,
56927 + nh40_get_free_space(nh) -
56928 + shift->merging_bytes);
56929 +
56930 + /* appending last item of @target */
56931 + copy_units(&to, &from, 0, /* starting from 0-th unit */
56932 + shift->merging_units, SHIFT_LEFT,
56933 + shift->merging_bytes);
56934 + coord_inc_item_pos(&from);
56935 + from_ih--;
56936 + coord_inc_item_pos(&to);
56937 + }
56938 +
56939 + to_ih = node40_ih_at(shift->target, old_items);
56940 + if (shift->entire) {
56941 + /* copy @entire items entirely */
56942 +
56943 + /* copy item headers */
56944 + memcpy(to_ih - shift->entire + 1,
56945 + from_ih - shift->entire + 1,
56946 + shift->entire * sizeof(item_header40));
56947 + /* update item header offset */
56948 + old_offset = ih40_get_offset(from_ih);
56949 + /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
56950 + for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
56951 + ih40_set_offset(to_ih,
56952 + ih40_get_offset(from_ih) -
56953 + old_offset + free_space_start);
56954 +
56955 + /* copy item bodies */
56956 + memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
56957 + shift->entire_bytes);
56958 +
56959 + coord_add_item_pos(&from, (int)shift->entire);
56960 + coord_add_item_pos(&to, (int)shift->entire);
56961 + }
56962 +
56963 + nh40_set_free_space_start(nh,
56964 + free_space_start +
56965 + shift->shift_bytes -
56966 + shift->merging_bytes);
56967 + nh40_set_free_space(nh,
56968 + nh40_get_free_space(nh) -
56969 + (shift->shift_bytes - shift->merging_bytes +
56970 + sizeof(item_header40) * new_items));
56971 +
56972 + /* update node header */
56973 + node40_set_num_items(shift->target, nh, old_items + new_items);
56974 + assert("vs-170",
56975 + nh40_get_free_space(nh) < znode_size(shift->target));
56976 +
56977 + if (shift->part_units) {
56978 + /* copy heading part (@part units) of @source item as
56979 + a new item into @target->node */
56980 +
56981 + /* copy item header of partially copied item */
56982 + coord_set_item_pos(&to,
56983 + node40_num_of_items_internal(to.node)
56984 + - 1);
56985 + memcpy(to_ih, from_ih, sizeof(item_header40));
56986 + ih40_set_offset(to_ih,
56987 + nh40_get_free_space_start(nh) -
56988 + shift->part_bytes);
56989 + if (item_plugin_by_coord(&to)->b.init)
56990 + item_plugin_by_coord(&to)->b.init(&to, &from,
56991 + NULL);
56992 + copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
56993 + shift->part_bytes);
56994 + }
56995 +
56996 + } else {
56997 + /* copying to right */
56998 +
56999 + coord_set_item_pos(&from,
57000 + node40_num_of_items_internal(from.node) - 1);
57001 + from_ih = node40_ih_at_coord(&from);
57002 +
57003 + coord_set_item_pos(&to, 0);
57004 +
57005 + /* prepare space for new items */
57006 + memmove(zdata(to.node) + sizeof(node40_header) +
57007 + shift->shift_bytes,
57008 + zdata(to.node) + sizeof(node40_header),
57009 + free_space_start - sizeof(node40_header));
57010 + /* update item headers of moved items */
57011 + to_ih = node40_ih_at(to.node, 0);
57012 + /* first item gets @merging_bytes longer. free space appears
57013 + at its beginning */
57014 + if (!node_is_empty(to.node))
57015 + ih40_set_offset(to_ih,
57016 + ih40_get_offset(to_ih) +
57017 + shift->shift_bytes -
57018 + shift->merging_bytes);
57019 +
57020 + for (i = 1; i < old_items; i++)
57021 + ih40_set_offset(to_ih - i,
57022 + ih40_get_offset(to_ih - i) +
57023 + shift->shift_bytes);
57024 +
57025 + /* move item headers to make space for new items */
57026 + memmove(to_ih - old_items + 1 - new_items,
57027 + to_ih - old_items + 1,
57028 + sizeof(item_header40) * old_items);
57029 + to_ih -= (new_items - 1);
57030 +
57031 + nh40_set_free_space_start(nh,
57032 + free_space_start +
57033 + shift->shift_bytes);
57034 + nh40_set_free_space(nh,
57035 + nh40_get_free_space(nh) -
57036 + (shift->shift_bytes +
57037 + sizeof(item_header40) * new_items));
57038 +
57039 + /* update node header */
57040 + node40_set_num_items(shift->target, nh, old_items + new_items);
57041 + assert("vs-170",
57042 + nh40_get_free_space(nh) < znode_size(shift->target));
57043 +
57044 + if (shift->merging_units) {
57045 + coord_add_item_pos(&to, new_items);
57046 + to.unit_pos = 0;
57047 + to.between = AT_UNIT;
57048 + /* prepend first item of @to */
57049 + copy_units(&to, &from,
57050 + coord_last_unit_pos(&from) -
57051 + shift->merging_units + 1,
57052 + shift->merging_units, SHIFT_RIGHT,
57053 + shift->merging_bytes);
57054 + coord_dec_item_pos(&from);
57055 + from_ih++;
57056 + }
57057 +
57058 + if (shift->entire) {
57059 + /* copy @entire items entirely */
57060 +
57061 + /* copy item headers */
57062 + memcpy(to_ih, from_ih,
57063 + shift->entire * sizeof(item_header40));
57064 +
57065 + /* update item header offset */
57066 + old_offset =
57067 + ih40_get_offset(from_ih + shift->entire - 1);
57068 + /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
57069 + for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
57070 + ih40_set_offset(to_ih,
57071 + ih40_get_offset(from_ih) -
57072 + old_offset +
57073 + sizeof(node40_header) +
57074 + shift->part_bytes);
57075 + /* copy item bodies */
57076 + coord_add_item_pos(&from, -(int)(shift->entire - 1));
57077 + memcpy(zdata(to.node) + sizeof(node40_header) +
57078 + shift->part_bytes, item_by_coord_node40(&from),
57079 + shift->entire_bytes);
57080 + coord_dec_item_pos(&from);
57081 + }
57082 +
57083 + if (shift->part_units) {
57084 + coord_set_item_pos(&to, 0);
57085 + to.unit_pos = 0;
57086 + to.between = AT_UNIT;
57087 + /* copy heading part (@part units) of @source item as
57088 + a new item into @target->node */
57089 +
57090 + /* copy item header of partially copied item */
57091 + memcpy(to_ih, from_ih, sizeof(item_header40));
57092 + ih40_set_offset(to_ih, sizeof(node40_header));
57093 + if (item_plugin_by_coord(&to)->b.init)
57094 + item_plugin_by_coord(&to)->b.init(&to, &from,
57095 + NULL);
57096 + copy_units(&to, &from,
57097 + coord_last_unit_pos(&from) -
57098 + shift->part_units + 1, shift->part_units,
57099 + SHIFT_RIGHT, shift->part_bytes);
57100 + }
57101 + }
57102 +}
57103 +
57104 +/* remove everything either before or after @fact_stop. Number of items
57105 + removed completely is returned */
57106 +static int delete_copied(struct shift_params *shift)
57107 +{
57108 + coord_t from;
57109 + coord_t to;
57110 + struct carry_cut_data cdata;
57111 +
57112 + if (shift->pend == SHIFT_LEFT) {
57113 + /* we were shifting to left, remove everything from the
57114 + beginning of @shift->wish_stop->node upto
57115 + @shift->wish_stop */
57116 + coord_init_first_unit(&from, shift->real_stop.node);
57117 + to = shift->real_stop;
57118 +
57119 + /* store old coordinate of unit which will be first after
57120 + shift to left */
57121 + shift->u.future_first = to;
57122 + coord_next_unit(&shift->u.future_first);
57123 + } else {
57124 + /* we were shifting to right, remove everything from
57125 + @shift->stop_coord upto to end of
57126 + @shift->stop_coord->node */
57127 + from = shift->real_stop;
57128 + coord_init_last_unit(&to, from.node);
57129 +
57130 + /* store old coordinate of unit which will be last after
57131 + shift to right */
57132 + shift->u.future_last = from;
57133 + coord_prev_unit(&shift->u.future_last);
57134 + }
57135 +
57136 + cdata.params.from = &from;
57137 + cdata.params.to = &to;
57138 + cdata.params.from_key = NULL;
57139 + cdata.params.to_key = NULL;
57140 + cdata.params.smallest_removed = NULL;
57141 + return cut_node40(&cdata, NULL);
57142 +}
57143 +
57144 +/* something was moved between @left and @right. Add carry operation to @info
57145 + list to have carry to update delimiting key between them */
57146 +static int
57147 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
57148 +{
57149 + carry_op *op;
57150 + carry_node *cn;
57151 +
57152 + if (info == NULL)
57153 + /* nowhere to send operation to. */
57154 + return 0;
57155 +
57156 + if (!should_notify_parent(right))
57157 + return 0;
57158 +
57159 + op = node_post_carry(info, COP_UPDATE, right, 1);
57160 + if (IS_ERR(op) || op == NULL)
57161 + return op ? PTR_ERR(op) : -EIO;
57162 +
57163 + if (left != NULL) {
57164 + carry_node *reference;
57165 +
57166 + if (info->doing)
57167 + reference = insert_carry_node(info->doing,
57168 + info->todo, left);
57169 + else
57170 + reference = op->node;
57171 + assert("nikita-2992", reference != NULL);
57172 + cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
57173 + if (IS_ERR(cn))
57174 + return PTR_ERR(cn);
57175 + cn->parent = 1;
57176 + cn->node = left;
57177 + if (ZF_ISSET(left, JNODE_ORPHAN))
57178 + cn->left_before = 1;
57179 + op->u.update.left = cn;
57180 + } else
57181 + op->u.update.left = NULL;
57182 + return 0;
57183 +}
57184 +
57185 +/* plugin->u.node.prepare_removal
57186 + to delete a pointer to @empty from the tree add corresponding carry
57187 + operation (delete) to @info list */
57188 +int prepare_removal_node40(znode * empty, carry_plugin_info * info)
57189 +{
57190 + carry_op *op;
57191 + reiser4_tree *tree;
57192 +
57193 + if (!should_notify_parent(empty))
57194 + return 0;
57195 + /* already on a road to Styx */
57196 + if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
57197 + return 0;
57198 + op = node_post_carry(info, COP_DELETE, empty, 1);
57199 + if (IS_ERR(op) || op == NULL)
57200 + return RETERR(op ? PTR_ERR(op) : -EIO);
57201 +
57202 + op->u.delete.child = NULL;
57203 + op->u.delete.flags = 0;
57204 +
57205 + /* fare thee well */
57206 + tree = znode_get_tree(empty);
57207 + read_lock_tree(tree);
57208 + write_lock_dk(tree);
57209 + znode_set_ld_key(empty, znode_get_rd_key(empty));
57210 + if (znode_is_left_connected(empty) && empty->left)
57211 + znode_set_rd_key(empty->left, znode_get_rd_key(empty));
57212 + write_unlock_dk(tree);
57213 + read_unlock_tree(tree);
57214 +
57215 + ZF_SET(empty, JNODE_HEARD_BANSHEE);
57216 + return 0;
57217 +}
57218 +
57219 +/* something was shifted from @insert_coord->node to @shift->target, update
57220 + @insert_coord correspondingly */
57221 +static void
57222 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
57223 + int including_insert_coord)
57224 +{
57225 + /* item plugin was invalidated by shifting */
57226 + coord_clear_iplug(insert_coord);
57227 +
57228 + if (node_is_empty(shift->wish_stop.node)) {
57229 + assert("vs-242", shift->everything);
57230 + if (including_insert_coord) {
57231 + if (shift->pend == SHIFT_RIGHT) {
57232 + /* set @insert_coord before first unit of
57233 + @shift->target node */
57234 + coord_init_before_first_item(insert_coord,
57235 + shift->target);
57236 + } else {
57237 + /* set @insert_coord after last in target node */
57238 + coord_init_after_last_item(insert_coord,
57239 + shift->target);
57240 + }
57241 + } else {
57242 + /* set @insert_coord inside of empty node. There is
57243 + only one possible coord within an empty
57244 + node. init_first_unit will set that coord */
57245 + coord_init_first_unit(insert_coord,
57246 + shift->wish_stop.node);
57247 + }
57248 + return;
57249 + }
57250 +
57251 + if (shift->pend == SHIFT_RIGHT) {
57252 + /* there was shifting to right */
57253 + if (shift->everything) {
57254 + /* everything wanted was shifted */
57255 + if (including_insert_coord) {
57256 + /* @insert_coord is set before first unit of
57257 + @to node */
57258 + coord_init_before_first_item(insert_coord,
57259 + shift->target);
57260 + insert_coord->between = BEFORE_UNIT;
57261 + } else {
57262 + /* @insert_coord is set after last unit of
57263 + @insert->node */
57264 + coord_init_last_unit(insert_coord,
57265 + shift->wish_stop.node);
57266 + insert_coord->between = AFTER_UNIT;
57267 + }
57268 + }
57269 + return;
57270 + }
57271 +
57272 + /* there was shifting to left */
57273 + if (shift->everything) {
57274 + /* everything wanted was shifted */
57275 + if (including_insert_coord) {
57276 + /* @insert_coord is set after last unit in @to node */
57277 + coord_init_after_last_item(insert_coord, shift->target);
57278 + } else {
57279 + /* @insert_coord is set before first unit in the same
57280 + node */
57281 + coord_init_before_first_item(insert_coord,
57282 + shift->wish_stop.node);
57283 + }
57284 + return;
57285 + }
57286 +
57287 + /* FIXME-VS: the code below is complicated because with between ==
57288 + AFTER_ITEM unit_pos is set to 0 */
57289 +
57290 + if (!removed) {
57291 + /* no items were shifted entirely */
57292 + assert("vs-195", shift->merging_units == 0
57293 + || shift->part_units == 0);
57294 +
57295 + if (shift->real_stop.item_pos == insert_coord->item_pos) {
57296 + if (shift->merging_units) {
57297 + if (insert_coord->between == AFTER_UNIT) {
57298 + assert("nikita-1441",
57299 + insert_coord->unit_pos >=
57300 + shift->merging_units);
57301 + insert_coord->unit_pos -=
57302 + shift->merging_units;
57303 + } else if (insert_coord->between == BEFORE_UNIT) {
57304 + assert("nikita-2090",
57305 + insert_coord->unit_pos >
57306 + shift->merging_units);
57307 + insert_coord->unit_pos -=
57308 + shift->merging_units;
57309 + }
57310 +
57311 + assert("nikita-2083",
57312 + insert_coord->unit_pos + 1);
57313 + } else {
57314 + if (insert_coord->between == AFTER_UNIT) {
57315 + assert("nikita-1442",
57316 + insert_coord->unit_pos >=
57317 + shift->part_units);
57318 + insert_coord->unit_pos -=
57319 + shift->part_units;
57320 + } else if (insert_coord->between == BEFORE_UNIT) {
57321 + assert("nikita-2089",
57322 + insert_coord->unit_pos >
57323 + shift->part_units);
57324 + insert_coord->unit_pos -=
57325 + shift->part_units;
57326 + }
57327 +
57328 + assert("nikita-2084",
57329 + insert_coord->unit_pos + 1);
57330 + }
57331 + }
57332 + return;
57333 + }
57334 +
57335 + /* we shifted to left and there was not enough space for everything */
57336 + switch (insert_coord->between) {
57337 + case AFTER_UNIT:
57338 + case BEFORE_UNIT:
57339 + if (shift->real_stop.item_pos == insert_coord->item_pos)
57340 + insert_coord->unit_pos -= shift->part_units;
57341 + case AFTER_ITEM:
57342 + coord_add_item_pos(insert_coord, -removed);
57343 + break;
57344 + default:
57345 + impossible("nikita-2087", "not ready");
57346 + }
57347 + assert("nikita-2085", insert_coord->unit_pos + 1);
57348 +}
57349 +
57350 +static int call_shift_hooks(struct shift_params *shift)
57351 +{
57352 + unsigned i, shifted;
57353 + coord_t coord;
57354 + item_plugin *iplug;
57355 +
57356 + assert("vs-275", !node_is_empty(shift->target));
57357 +
57358 + /* number of items shift touches */
57359 + shifted =
57360 + shift->entire + (shift->merging_units ? 1 : 0) +
57361 + (shift->part_units ? 1 : 0);
57362 +
57363 + if (shift->pend == SHIFT_LEFT) {
57364 + /* moved items are at the end */
57365 + coord_init_last_unit(&coord, shift->target);
57366 + coord.unit_pos = 0;
57367 +
57368 + assert("vs-279", shift->pend == 1);
57369 + for (i = 0; i < shifted; i++) {
57370 + unsigned from, count;
57371 +
57372 + iplug = item_plugin_by_coord(&coord);
57373 + if (i == 0 && shift->part_units) {
57374 + assert("vs-277",
57375 + coord_num_units(&coord) ==
57376 + shift->part_units);
57377 + count = shift->part_units;
57378 + from = 0;
57379 + } else if (i == shifted - 1 && shift->merging_units) {
57380 + count = shift->merging_units;
57381 + from = coord_num_units(&coord) - count;
57382 + } else {
57383 + count = coord_num_units(&coord);
57384 + from = 0;
57385 + }
57386 +
57387 + if (iplug->b.shift_hook) {
57388 + iplug->b.shift_hook(&coord, from, count,
57389 + shift->wish_stop.node);
57390 + }
57391 + coord_add_item_pos(&coord, -shift->pend);
57392 + }
57393 + } else {
57394 + /* moved items are at the beginning */
57395 + coord_init_first_unit(&coord, shift->target);
57396 +
57397 + assert("vs-278", shift->pend == -1);
57398 + for (i = 0; i < shifted; i++) {
57399 + unsigned from, count;
57400 +
57401 + iplug = item_plugin_by_coord(&coord);
57402 + if (i == 0 && shift->part_units) {
57403 + assert("vs-277",
57404 + coord_num_units(&coord) ==
57405 + shift->part_units);
57406 + count = coord_num_units(&coord);
57407 + from = 0;
57408 + } else if (i == shifted - 1 && shift->merging_units) {
57409 + count = shift->merging_units;
57410 + from = 0;
57411 + } else {
57412 + count = coord_num_units(&coord);
57413 + from = 0;
57414 + }
57415 +
57416 + if (iplug->b.shift_hook) {
57417 + iplug->b.shift_hook(&coord, from, count,
57418 + shift->wish_stop.node);
57419 + }
57420 + coord_add_item_pos(&coord, -shift->pend);
57421 + }
57422 + }
57423 +
57424 + return 0;
57425 +}
57426 +
57427 +/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
57428 +static int
57429 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
57430 +{
57431 + assert("vs-944", shift->real_stop.node == old->node);
57432 +
57433 + if (shift->real_stop.item_pos < old->item_pos)
57434 + return 0;
57435 + if (shift->real_stop.item_pos == old->item_pos) {
57436 + if (shift->real_stop.unit_pos < old->unit_pos)
57437 + return 0;
57438 + }
57439 + return 1;
57440 +}
57441 +
57442 +/* shift to right is completed. Return 1 if unit @old was moved to right
57443 + neighbor */
57444 +static int
57445 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
57446 +{
57447 + assert("vs-944", shift->real_stop.node == old->node);
57448 +
57449 + if (shift->real_stop.item_pos > old->item_pos)
57450 + return 0;
57451 + if (shift->real_stop.item_pos == old->item_pos) {
57452 + if (shift->real_stop.unit_pos > old->unit_pos)
57453 + return 0;
57454 + }
57455 + return 1;
57456 +}
57457 +
57458 +/* coord @old was set in node from which shift was performed. What was shifted
57459 + is stored in @shift. Update @old according to the performed shift */
57460 +static coord_t *adjust_coord2(const struct shift_params *shift,
57461 + const coord_t * old, coord_t * new)
57462 +{
57463 + coord_clear_iplug(new);
57464 + new->between = old->between;
57465 +
57466 + coord_clear_iplug(new);
57467 + if (old->node == shift->target) {
57468 + if (shift->pend == SHIFT_LEFT) {
57469 + /* coord which is set inside of left neighbor does not
57470 + change during shift to left */
57471 + coord_dup(new, old);
57472 + return new;
57473 + }
57474 + new->node = old->node;
57475 + coord_set_item_pos(new,
57476 + old->item_pos + shift->entire +
57477 + (shift->part_units ? 1 : 0));
57478 + new->unit_pos = old->unit_pos;
57479 + if (old->item_pos == 0 && shift->merging_units)
57480 + new->unit_pos += shift->merging_units;
57481 + return new;
57482 + }
57483 +
57484 + assert("vs-977", old->node == shift->wish_stop.node);
57485 + if (shift->pend == SHIFT_LEFT) {
57486 + if (unit_moved_left(shift, old)) {
57487 + /* unit @old moved to left neighbor. Calculate its
57488 + coordinate there */
57489 + new->node = shift->target;
57490 + coord_set_item_pos(new,
57491 + node_num_items(shift->target) -
57492 + shift->entire -
57493 + (shift->part_units ? 1 : 0) +
57494 + old->item_pos);
57495 +
57496 + new->unit_pos = old->unit_pos;
57497 + if (shift->merging_units) {
57498 + coord_dec_item_pos(new);
57499 + if (old->item_pos == 0) {
57500 + /* unit_pos only changes if item got
57501 + merged */
57502 + new->unit_pos =
57503 + coord_num_units(new) -
57504 + (shift->merging_units -
57505 + old->unit_pos);
57506 + }
57507 + }
57508 + } else {
57509 + /* unit @old did not move to left neighbor.
57510 +
57511 + Use _nocheck, because @old is outside of its node.
57512 + */
57513 + coord_dup_nocheck(new, old);
57514 + coord_add_item_pos(new,
57515 + -shift->u.future_first.item_pos);
57516 + if (new->item_pos == 0)
57517 + new->unit_pos -= shift->u.future_first.unit_pos;
57518 + }
57519 + } else {
57520 + if (unit_moved_right(shift, old)) {
57521 + /* unit @old moved to right neighbor */
57522 + new->node = shift->target;
57523 + coord_set_item_pos(new,
57524 + old->item_pos -
57525 + shift->real_stop.item_pos);
57526 + if (new->item_pos == 0) {
57527 + /* unit @old might change unit pos */
57528 + coord_set_item_pos(new,
57529 + old->unit_pos -
57530 + shift->real_stop.unit_pos);
57531 + }
57532 + } else {
57533 + /* unit @old did not move to right neighbor, therefore
57534 + it did not change */
57535 + coord_dup(new, old);
57536 + }
57537 + }
57538 + coord_set_iplug(new, item_plugin_by_coord(new));
57539 + return new;
57540 +}
57541 +
57542 +/* this is called when shift is completed (something of source node is copied
57543 + to target and deleted in source) to update all taps set in current
57544 + context */
57545 +static void update_taps(const struct shift_params *shift)
57546 +{
57547 + tap_t *tap;
57548 + coord_t new;
57549 +
57550 + for_all_taps(tap) {
57551 + /* update only taps set to nodes participating in shift */
57552 + if (tap->coord->node == shift->wish_stop.node
57553 + || tap->coord->node == shift->target)
57554 + tap_to_coord(tap,
57555 + adjust_coord2(shift, tap->coord, &new));
57556 + }
57557 +}
57558 +
57559 +#if REISER4_DEBUG
57560 +
57561 +struct shift_check {
57562 + reiser4_key key;
57563 + __u16 plugin_id;
57564 + union {
57565 + __u64 bytes;
57566 + __u64 entries;
57567 + void *unused;
57568 + } u;
57569 +};
57570 +
57571 +void *shift_check_prepare(const znode * left, const znode * right)
57572 +{
57573 + pos_in_node_t i, nr_items;
57574 + int mergeable;
57575 + struct shift_check *data;
57576 + item_header40 *ih;
57577 +
57578 + if (node_is_empty(left) || node_is_empty(right))
57579 + mergeable = 0;
57580 + else {
57581 + coord_t l, r;
57582 +
57583 + coord_init_last_unit(&l, left);
57584 + coord_init_first_unit(&r, right);
57585 + mergeable = are_items_mergeable(&l, &r);
57586 + }
57587 + nr_items =
57588 + node40_num_of_items_internal(left) +
57589 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
57590 + data =
57591 + kmalloc(sizeof(struct shift_check) * nr_items,
57592 + reiser4_ctx_gfp_mask_get());
57593 + if (data != NULL) {
57594 + coord_t coord;
57595 + pos_in_node_t item_pos;
57596 +
57597 + coord_init_first_unit(&coord, left);
57598 + i = 0;
57599 +
57600 + for (item_pos = 0;
57601 + item_pos < node40_num_of_items_internal(left);
57602 + item_pos++) {
57603 +
57604 + coord_set_item_pos(&coord, item_pos);
57605 + ih = node40_ih_at_coord(&coord);
57606 +
57607 + data[i].key = ih->key;
57608 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
57609 + switch (data[i].plugin_id) {
57610 + case CTAIL_ID:
57611 + case FORMATTING_ID:
57612 + data[i].u.bytes = coord_num_units(&coord);
57613 + break;
57614 + case EXTENT_POINTER_ID:
57615 + data[i].u.bytes =
57616 + reiser4_extent_size(&coord,
57617 + coord_num_units(&coord));
57618 + break;
57619 + case COMPOUND_DIR_ID:
57620 + data[i].u.entries = coord_num_units(&coord);
57621 + break;
57622 + default:
57623 + data[i].u.unused = NULL;
57624 + break;
57625 + }
57626 + i++;
57627 + }
57628 +
57629 + coord_init_first_unit(&coord, right);
57630 +
57631 + if (mergeable) {
57632 + assert("vs-1609", i != 0);
57633 +
57634 + ih = node40_ih_at_coord(&coord);
57635 +
57636 + assert("vs-1589",
57637 + data[i - 1].plugin_id ==
57638 + le16_to_cpu(get_unaligned(&ih->plugin_id)));
57639 + switch (data[i - 1].plugin_id) {
57640 + case CTAIL_ID:
57641 + case FORMATTING_ID:
57642 + data[i - 1].u.bytes += coord_num_units(&coord);
57643 + break;
57644 + case EXTENT_POINTER_ID:
57645 + data[i - 1].u.bytes +=
57646 + reiser4_extent_size(&coord,
57647 + coord_num_units(&coord));
57648 + break;
57649 + case COMPOUND_DIR_ID:
57650 + data[i - 1].u.entries +=
57651 + coord_num_units(&coord);
57652 + break;
57653 + default:
57654 + impossible("vs-1605", "wrong mergeable item");
57655 + break;
57656 + }
57657 + item_pos = 1;
57658 + } else
57659 + item_pos = 0;
57660 + for (; item_pos < node40_num_of_items_internal(right);
57661 + item_pos++) {
57662 +
57663 + assert("vs-1604", i < nr_items);
57664 + coord_set_item_pos(&coord, item_pos);
57665 + ih = node40_ih_at_coord(&coord);
57666 +
57667 + data[i].key = ih->key;
57668 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
57669 + switch (data[i].plugin_id) {
57670 + case CTAIL_ID:
57671 + case FORMATTING_ID:
57672 + data[i].u.bytes = coord_num_units(&coord);
57673 + break;
57674 + case EXTENT_POINTER_ID:
57675 + data[i].u.bytes =
57676 + reiser4_extent_size(&coord,
57677 + coord_num_units(&coord));
57678 + break;
57679 + case COMPOUND_DIR_ID:
57680 + data[i].u.entries = coord_num_units(&coord);
57681 + break;
57682 + default:
57683 + data[i].u.unused = NULL;
57684 + break;
57685 + }
57686 + i++;
57687 + }
57688 + assert("vs-1606", i == nr_items);
57689 + }
57690 + return data;
57691 +}
57692 +
57693 +void shift_check(void *vp, const znode * left, const znode * right)
57694 +{
57695 + pos_in_node_t i, nr_items;
57696 + coord_t coord;
57697 + __u64 last_bytes;
57698 + int mergeable;
57699 + item_header40 *ih;
57700 + pos_in_node_t item_pos;
57701 + struct shift_check *data;
57702 +
57703 + data = (struct shift_check *)vp;
57704 +
57705 + if (data == NULL)
57706 + return;
57707 +
57708 + if (node_is_empty(left) || node_is_empty(right))
57709 + mergeable = 0;
57710 + else {
57711 + coord_t l, r;
57712 +
57713 + coord_init_last_unit(&l, left);
57714 + coord_init_first_unit(&r, right);
57715 + mergeable = are_items_mergeable(&l, &r);
57716 + }
57717 +
57718 + nr_items =
57719 + node40_num_of_items_internal(left) +
57720 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
57721 +
57722 + i = 0;
57723 + last_bytes = 0;
57724 +
57725 + coord_init_first_unit(&coord, left);
57726 +
57727 + for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
57728 + item_pos++) {
57729 +
57730 + coord_set_item_pos(&coord, item_pos);
57731 + ih = node40_ih_at_coord(&coord);
57732 +
57733 + assert("vs-1611", i == item_pos);
57734 + assert("vs-1590", keyeq(&ih->key, &data[i].key));
57735 + assert("vs-1591",
57736 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
57737 + if ((i < (node40_num_of_items_internal(left) - 1))
57738 + || !mergeable) {
57739 + switch (data[i].plugin_id) {
57740 + case CTAIL_ID:
57741 + case FORMATTING_ID:
57742 + assert("vs-1592",
57743 + data[i].u.bytes ==
57744 + coord_num_units(&coord));
57745 + break;
57746 + case EXTENT_POINTER_ID:
57747 + assert("vs-1593",
57748 + data[i].u.bytes ==
57749 + reiser4_extent_size(&coord,
57750 + coord_num_units
57751 + (&coord)));
57752 + break;
57753 + case COMPOUND_DIR_ID:
57754 + assert("vs-1594",
57755 + data[i].u.entries ==
57756 + coord_num_units(&coord));
57757 + break;
57758 + default:
57759 + break;
57760 + }
57761 + }
57762 + if (item_pos == (node40_num_of_items_internal(left) - 1)
57763 + && mergeable) {
57764 + switch (data[i].plugin_id) {
57765 + case CTAIL_ID:
57766 + case FORMATTING_ID:
57767 + last_bytes = coord_num_units(&coord);
57768 + break;
57769 + case EXTENT_POINTER_ID:
57770 + last_bytes =
57771 + reiser4_extent_size(&coord,
57772 + coord_num_units(&coord));
57773 + break;
57774 + case COMPOUND_DIR_ID:
57775 + last_bytes = coord_num_units(&coord);
57776 + break;
57777 + default:
57778 + impossible("vs-1595", "wrong mergeable item");
57779 + break;
57780 + }
57781 + }
57782 + i++;
57783 + }
57784 +
57785 + coord_init_first_unit(&coord, right);
57786 + if (mergeable) {
57787 + ih = node40_ih_at_coord(&coord);
57788 +
57789 + assert("vs-1589",
57790 + data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
57791 + assert("vs-1608", last_bytes != 0);
57792 + switch (data[i - 1].plugin_id) {
57793 + case CTAIL_ID:
57794 + case FORMATTING_ID:
57795 + assert("vs-1596",
57796 + data[i - 1].u.bytes ==
57797 + last_bytes + coord_num_units(&coord));
57798 + break;
57799 +
57800 + case EXTENT_POINTER_ID:
57801 + assert("vs-1597",
57802 + data[i - 1].u.bytes ==
57803 + last_bytes + reiser4_extent_size(&coord,
57804 + coord_num_units
57805 + (&coord)));
57806 + break;
57807 +
57808 + case COMPOUND_DIR_ID:
57809 + assert("vs-1598",
57810 + data[i - 1].u.bytes ==
57811 + last_bytes + coord_num_units(&coord));
57812 + break;
57813 + default:
57814 + impossible("vs-1599", "wrong mergeable item");
57815 + break;
57816 + }
57817 + item_pos = 1;
57818 + } else
57819 + item_pos = 0;
57820 +
57821 + for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
57822 +
57823 + coord_set_item_pos(&coord, item_pos);
57824 + ih = node40_ih_at_coord(&coord);
57825 +
57826 + assert("vs-1612", keyeq(&ih->key, &data[i].key));
57827 + assert("vs-1613",
57828 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
57829 + switch (data[i].plugin_id) {
57830 + case CTAIL_ID:
57831 + case FORMATTING_ID:
57832 + assert("vs-1600",
57833 + data[i].u.bytes == coord_num_units(&coord));
57834 + break;
57835 + case EXTENT_POINTER_ID:
57836 + assert("vs-1601",
57837 + data[i].u.bytes ==
57838 + reiser4_extent_size(&coord,
57839 + coord_num_units
57840 + (&coord)));
57841 + break;
57842 + case COMPOUND_DIR_ID:
57843 + assert("vs-1602",
57844 + data[i].u.entries == coord_num_units(&coord));
57845 + break;
57846 + default:
57847 + break;
57848 + }
57849 + i++;
57850 + }
57851 +
57852 + assert("vs-1603", i == nr_items);
57853 + kfree(data);
57854 +}
57855 +
57856 +#endif
57857 +
57858 +/* plugin->u.node.shift
57859 + look for description of this method in plugin/node/node.h */
57860 +int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be
57861 + deleted from the tree if this is set to 1 */
57862 + int including_stop_coord, carry_plugin_info * info)
57863 +{
57864 + struct shift_params shift;
57865 + int result;
57866 + znode *left, *right;
57867 + znode *source;
57868 + int target_empty;
57869 +
57870 + assert("nikita-2161", coord_check(from));
57871 +
57872 + memset(&shift, 0, sizeof(shift));
57873 + shift.pend = pend;
57874 + shift.wish_stop = *from;
57875 + shift.target = to;
57876 +
57877 + assert("nikita-1473", znode_is_write_locked(from->node));
57878 + assert("nikita-1474", znode_is_write_locked(to));
57879 +
57880 + source = from->node;
57881 +
57882 + /* set @shift.wish_stop to rightmost/leftmost unit among units we want
57883 + shifted */
57884 + if (pend == SHIFT_LEFT) {
57885 + result = coord_set_to_left(&shift.wish_stop);
57886 + left = to;
57887 + right = from->node;
57888 + } else {
57889 + result = coord_set_to_right(&shift.wish_stop);
57890 + left = from->node;
57891 + right = to;
57892 + }
57893 +
57894 + if (result) {
57895 + /* move insertion coord even if there is nothing to move */
57896 + if (including_stop_coord) {
57897 + /* move insertion coord (@from) */
57898 + if (pend == SHIFT_LEFT) {
57899 + /* after last item in target node */
57900 + coord_init_after_last_item(from, to);
57901 + } else {
57902 + /* before first item in target node */
57903 + coord_init_before_first_item(from, to);
57904 + }
57905 + }
57906 +
57907 + if (delete_child && node_is_empty(shift.wish_stop.node))
57908 + result =
57909 + prepare_removal_node40(shift.wish_stop.node, info);
57910 + else
57911 + result = 0;
57912 + /* there is nothing to shift */
57913 + assert("nikita-2078", coord_check(from));
57914 + return result;
57915 + }
57916 +
57917 + target_empty = node_is_empty(to);
57918 +
57919 + /* when first node plugin with item body compression is implemented,
57920 + this must be changed to call node specific plugin */
57921 +
57922 + /* shift->stop_coord is updated to last unit which really will be
57923 + shifted */
57924 + estimate_shift(&shift, get_current_context());
57925 + if (!shift.shift_bytes) {
57926 + /* we could not shift anything */
57927 + assert("nikita-2079", coord_check(from));
57928 + return 0;
57929 + }
57930 +
57931 + copy(&shift);
57932 +
57933 + /* result value of this is important. It is used by adjust_coord below */
57934 + result = delete_copied(&shift);
57935 +
57936 + assert("vs-1610", result >= 0);
57937 + assert("vs-1471",
57938 + ((reiser4_context *) current->journal_info)->magic ==
57939 + context_magic);
57940 +
57941 + /* item which has been moved from one node to another might want to do
57942 + something on that event. This can be done by item's shift_hook
57943 + method, which will be now called for every moved items */
57944 + call_shift_hooks(&shift);
57945 +
57946 + assert("vs-1472",
57947 + ((reiser4_context *) current->journal_info)->magic ==
57948 + context_magic);
57949 +
57950 + update_taps(&shift);
57951 +
57952 + assert("vs-1473",
57953 + ((reiser4_context *) current->journal_info)->magic ==
57954 + context_magic);
57955 +
57956 + /* adjust @from pointer in accordance with @including_stop_coord flag
57957 + and amount of data which was really shifted */
57958 + adjust_coord(from, &shift, result, including_stop_coord);
57959 +
57960 + if (target_empty)
57961 + /*
57962 + * items were shifted into empty node. Update delimiting key.
57963 + */
57964 + result = prepare_for_update(NULL, left, info);
57965 +
57966 + /* add update operation to @info, which is the list of operations to
57967 + be performed on a higher level */
57968 + result = prepare_for_update(left, right, info);
57969 + if (!result && node_is_empty(source) && delete_child) {
57970 + /* all contents of @from->node is moved to @to and @from->node
57971 + has to be removed from the tree, so, on higher level we
57972 + will be removing the pointer to node @from->node */
57973 + result = prepare_removal_node40(source, info);
57974 + }
57975 + assert("nikita-2080", coord_check(from));
57976 + return result ? result : (int)shift.shift_bytes;
57977 +}
57978 +
57979 +/* plugin->u.node.fast_insert()
57980 + look for description of this method in plugin/node/node.h */
57981 +int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57982 +{
57983 + return 1;
57984 +}
57985 +
57986 +/* plugin->u.node.fast_paste()
57987 + look for description of this method in plugin/node/node.h */
57988 +int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57989 +{
57990 + return 1;
57991 +}
57992 +
57993 +/* plugin->u.node.fast_cut()
57994 + look for description of this method in plugin/node/node.h */
57995 +int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57996 +{
57997 + return 1;
57998 +}
57999 +
58000 +/* plugin->u.node.modify - not defined */
58001 +
58002 +/* plugin->u.node.max_item_size */
58003 +int max_item_size_node40(void)
58004 +{
58005 + return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
58006 + sizeof(item_header40);
58007 +}
58008 +
58009 +/* plugin->u.node.set_item_plugin */
58010 +int set_item_plugin_node40(coord_t *coord, item_id id)
58011 +{
58012 + item_header40 *ih;
58013 +
58014 + ih = node40_ih_at_coord(coord);
58015 + put_unaligned(cpu_to_le16(id), &ih->plugin_id);
58016 + coord->iplugid = id;
58017 + return 0;
58018 +}
58019 +
58020 +/*
58021 + Local variables:
58022 + c-indentation-style: "K&R"
58023 + mode-name: "LC"
58024 + c-basic-offset: 8
58025 + tab-width: 8
58026 + fill-column: 120
58027 + scroll-step: 1
58028 + End:
58029 +*/
58030 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node40.h linux-2.6.20/fs/reiser4/plugin/node/node40.h
58031 --- linux-2.6.20.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 03:00:00.000000000 +0300
58032 +++ linux-2.6.20/fs/reiser4/plugin/node/node40.h 2007-05-06 14:50:43.835018219 +0400
58033 @@ -0,0 +1,125 @@
58034 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58035 +
58036 +#if !defined( __REISER4_NODE40_H__ )
58037 +#define __REISER4_NODE40_H__
58038 +
58039 +#include "../../forward.h"
58040 +#include "../../dformat.h"
58041 +#include "node.h"
58042 +
58043 +#include <linux/types.h>
58044 +
58045 +/* format of node header for 40 node layouts. Keep bloat out of this struct. */
58046 +typedef struct node40_header {
58047 + /* identifier of node plugin. Must be located at the very beginning
58048 + of a node. */
58049 + common_node_header common_header; /* this is 16 bits */
58050 + /* number of items. Should be first element in the node header,
58051 + because we haven't yet finally decided whether it shouldn't go into
58052 + common_header.
58053 + */
58054 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
58055 + * node format at compile time, and it is this one, accesses do not use a function dereference when
58056 + * accessing these fields (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
58057 + d16 nr_items;
58058 + /* free space in node measured in bytes */
58059 + d16 free_space;
58060 + /* offset to start of free space in node */
58061 + d16 free_space_start;
58062 + /* for reiser4_fsck. When information about what is a free
58063 + block is corrupted, and we try to recover everything even
58064 + if marked as freed, then old versions of data may
58065 + duplicate newer versions, and this field allows us to
58066 + restore the newer version. Also useful for when users
58067 + who don't have the new trashcan installed on their linux distro
58068 + delete the wrong files and send us desperate emails
58069 + offering $25 for them back. */
58070 +
58071 + /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */
58072 + d32 magic;
58073 + /* flushstamp is made of mk_id and write_counter. mk_id is an
58074 + id generated randomly at mkreiserfs time. So we can just
58075 + skip all nodes with different mk_id. write_counter is d64
58076 + incrementing counter of writes on disk. It is used for
58077 + choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
58078 +
58079 + d32 mkfs_id;
58080 + d64 flush_id;
58081 + /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
58082 + and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
58083 + d16 flags;
58084 +
58085 + /* 1 is leaf level, 2 is twig level, root is the numerically
58086 + largest level */
58087 + d8 level;
58088 +
58089 + d8 pad;
58090 +} PACKED node40_header;
58091 +
58092 +/* item headers are not standard across all node layouts, pass
58093 + pos_in_node to functions instead */
58094 +typedef struct item_header40 {
58095 + /* key of item */
58096 + /* 0 */ reiser4_key key;
58097 + /* offset from start of a node measured in 8-byte chunks */
58098 + /* 24 */ d16 offset;
58099 + /* 26 */ d16 flags;
58100 + /* 28 */ d16 plugin_id;
58101 +} PACKED item_header40;
58102 +
58103 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
58104 +size_t free_space_node40(znode * node);
58105 +node_search_result lookup_node40(znode * node, const reiser4_key * key,
58106 + lookup_bias bias, coord_t * coord);
58107 +int num_of_items_node40(const znode * node);
58108 +char *item_by_coord_node40(const coord_t * coord);
58109 +int length_by_coord_node40(const coord_t * coord);
58110 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
58111 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
58112 +size_t estimate_node40(znode * node);
58113 +int check_node40(const znode * node, __u32 flags, const char **error);
58114 +int parse_node40(znode * node);
58115 +int init_node40(znode * node);
58116 +#ifdef GUESS_EXISTS
58117 +int guess_node40(const znode * node);
58118 +#endif
58119 +void change_item_size_node40(coord_t * coord, int by);
58120 +int create_item_node40(coord_t * target, const reiser4_key * key,
58121 + reiser4_item_data * data, carry_plugin_info * info);
58122 +void update_item_key_node40(coord_t * target, const reiser4_key * key,
58123 + carry_plugin_info * info);
58124 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);
58125 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
58126 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
58127 + /* if @from->node becomes
58128 + empty - it will be deleted from
58129 + the tree if this is set to 1
58130 + */
58131 + int delete_child, int including_stop_coord,
58132 + carry_plugin_info * info);
58133 +
58134 +int fast_insert_node40(const coord_t * coord);
58135 +int fast_paste_node40(const coord_t * coord);
58136 +int fast_cut_node40(const coord_t * coord);
58137 +int max_item_size_node40(void);
58138 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
58139 +int set_item_plugin_node40(coord_t * coord, item_id id);
58140 +int shrink_item_node40(coord_t * coord, int delta);
58141 +
58142 +#if REISER4_DEBUG
58143 +void *shift_check_prepare(const znode *left, const znode *right);
58144 +void shift_check(void *vp, const znode *left, const znode *right);
58145 +#endif
58146 +
58147 +/* __REISER4_NODE40_H__ */
58148 +#endif
58149 +/*
58150 + Local variables:
58151 + c-indentation-style: "K&R"
58152 + mode-name: "LC"
58153 + c-basic-offset: 8
58154 + tab-width: 8
58155 + fill-column: 120
58156 + scroll-step: 1
58157 + End:
58158 +*/
58159 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node.c linux-2.6.20/fs/reiser4/plugin/node/node.c
58160 --- linux-2.6.20.orig/fs/reiser4/plugin/node/node.c 1970-01-01 03:00:00.000000000 +0300
58161 +++ linux-2.6.20/fs/reiser4/plugin/node/node.c 2007-05-06 14:50:43.835018219 +0400
58162 @@ -0,0 +1,131 @@
58163 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58164 +
58165 +/* Node plugin interface.
58166 +
58167 + Description: The tree provides the abstraction of flows, which it
58168 + internally fragments into items which it stores in nodes.
58169 +
58170 + A key_atom is a piece of data bound to a single key.
58171 +
58172 + For reasonable space efficiency to be achieved it is often
58173 + necessary to store key_atoms in the nodes in the form of items, where
58174 + an item is a sequence of key_atoms of the same or similar type. It is
58175 + more space-efficient, because the item can implement (very)
58176 + efficient compression of key_atom's bodies using internal knowledge
58177 + about their semantics, and it can often avoid having a key for each
58178 + key_atom. Each type of item has specific operations implemented by its
58179 + item handler (see balance.c).
58180 +
58181 + Rationale: the rest of the code (specifically balancing routines)
58182 + accesses leaf level nodes through this interface. This way we can
58183 + implement various block layouts and even combine various layouts
58184 + within the same tree. Balancing/allocating algorithms should not
58185 + care about peculiarities of splitting/merging specific item types,
58186 + but rather should leave that to the item's item handler.
58187 +
58188 + Items, including those that provide the abstraction of flows, have
58189 + the property that if you move them in part or in whole to another
58190 + node, the balancing code invokes their is_left_mergeable()
58191 + item_operation to determine if they are mergeable with their new
58192 + neighbor in the node you have moved them to. For some items the
58193 + is_left_mergeable() function always returns null.
58194 +
58195 + When moving the bodies of items from one node to another:
58196 +
58197 + if a partial item is shifted to another node the balancing code invokes
58198 + an item handler method to handle the item splitting.
58199 +
58200 + if the balancing code needs to merge with an item in the node it
58201 + is shifting to, it will invoke an item handler method to handle
58202 + the item merging.
58203 +
58204 + if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy()
58205 + adjusting the item headers after the move is done using the node handler.
58206 +*/
58207 +
58208 +#include "../../forward.h"
58209 +#include "../../debug.h"
58210 +#include "../../key.h"
58211 +#include "../../coord.h"
58212 +#include "../plugin_header.h"
58213 +#include "../item/item.h"
58214 +#include "node.h"
58215 +#include "../plugin.h"
58216 +#include "../../znode.h"
58217 +#include "../../tree.h"
58218 +#include "../../super.h"
58219 +#include "../../reiser4.h"
58220 +
58221 +/**
58222 + * leftmost_key_in_node - get the smallest key in node
58223 + * @node:
58224 + * @key: store result here
58225 + *
58226 + * Stores the leftmost key of @node in @key.
58227 + */
58228 +reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
58229 +{
58230 + assert("nikita-1634", node != NULL);
58231 + assert("nikita-1635", key != NULL);
58232 +
58233 + if (!node_is_empty(node)) {
58234 + coord_t first_item;
58235 +
58236 + coord_init_first_unit(&first_item, (znode *) node);
58237 + item_key_by_coord(&first_item, key);
58238 + } else
58239 + *key = *reiser4_max_key();
58240 + return key;
58241 +}
58242 +
58243 +node_plugin node_plugins[LAST_NODE_ID] = {
58244 + [NODE40_ID] = {
58245 + .h = {
58246 + .type_id = REISER4_NODE_PLUGIN_TYPE,
58247 + .id = NODE40_ID,
58248 + .pops = NULL,
58249 + .label = "unified",
58250 + .desc = "unified node layout",
58251 + .linkage = {NULL, NULL}
58252 + },
58253 + .item_overhead = item_overhead_node40,
58254 + .free_space = free_space_node40,
58255 + .lookup = lookup_node40,
58256 + .num_of_items = num_of_items_node40,
58257 + .item_by_coord = item_by_coord_node40,
58258 + .length_by_coord = length_by_coord_node40,
58259 + .plugin_by_coord = plugin_by_coord_node40,
58260 + .key_at = key_at_node40,
58261 + .estimate = estimate_node40,
58262 + .check = check_node40,
58263 + .parse = parse_node40,
58264 + .init = init_node40,
58265 +#ifdef GUESS_EXISTS
58266 + .guess = guess_node40,
58267 +#endif
58268 + .change_item_size = change_item_size_node40,
58269 + .create_item = create_item_node40,
58270 + .update_item_key = update_item_key_node40,
58271 + .cut_and_kill = kill_node40,
58272 + .cut = cut_node40,
58273 + .shift = shift_node40,
58274 + .shrink_item = shrink_item_node40,
58275 + .fast_insert = fast_insert_node40,
58276 + .fast_paste = fast_paste_node40,
58277 + .fast_cut = fast_cut_node40,
58278 + .max_item_size = max_item_size_node40,
58279 + .prepare_removal = prepare_removal_node40,
58280 + .set_item_plugin = set_item_plugin_node40
58281 + }
58282 +};
58283 +
58284 +/*
58285 + Local variables:
58286 + c-indentation-style: "K&R"
58287 + mode-name: "LC"
58288 + c-basic-offset: 8
58289 + tab-width: 8
58290 + fill-column: 120
58291 + scroll-step: 1
58292 + End:
58293 +*/
58294 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node.h linux-2.6.20/fs/reiser4/plugin/node/node.h
58295 --- linux-2.6.20.orig/fs/reiser4/plugin/node/node.h 1970-01-01 03:00:00.000000000 +0300
58296 +++ linux-2.6.20/fs/reiser4/plugin/node/node.h 2007-05-06 14:50:43.835018219 +0400
58297 @@ -0,0 +1,272 @@
58298 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58299 +
58300 +/* We need a definition of the default node layout here. */
58301 +
58302 +/* Generally speaking, it is best to have free space in the middle of the
58303 + node so that two sets of things can grow towards it, and to have the
58304 + item bodies on the left so that the last one of them grows into free
58305 + space. We optimize for the case where we append new items to the end
58306 + of the node, or grow the last item, because it hurts nothing to so
58307 + optimize and it is a common special case to do massive insertions in
58308 + increasing key order (and one of cases more likely to have a real user
58309 + notice the delay time for).
58310 +
58311 + formatted leaf default layout: (leaf1)
58312 +
58313 + |node header:item bodies:free space:key + pluginid + item offset|
58314 +
58315 + We grow towards the middle, optimizing layout for the case where we
58316 + append new items to the end of the node. The node header is fixed
58317 + length. Keys, and item offsets plus pluginids for the items
58318 + corresponding to them are in increasing key order, and are fixed
58319 + length. Item offsets are relative to start of node (16 bits creating
58320 + a node size limit of 64k, 12 bits might be a better choice....). Item
58321 + bodies are in decreasing key order. Item bodies have a variable size.
58322 + There is a one to one to one mapping of keys to item offsets to item
58323 + bodies. Item offsets consist of pointers to the zeroth byte of the
58324 + item body. Item length equals the start of the next item minus the
58325 + start of this item, except the zeroth item whose length equals the end
58326 + of the node minus the start of that item (plus a byte). In other
58327 + words, the item length is not recorded anywhere, and it does not need
58328 + to be since it is computable.
58329 +
58330 + Leaf variable length items and keys layout : (lvar)
58331 +
58332 + |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
58333 +
58334 + We grow towards the middle, optimizing layout for the case where we
58335 + append new items to the end of the node. The node header is fixed
58336 + length. Keys and item offsets for the items corresponding to them are
58337 + in increasing key order, and keys are variable length. Item offsets
58338 + are relative to start of node (16 bits). Item bodies are in
58339 + decreasing key order. Item bodies have a variable size. There is a
58340 + one to one to one mapping of keys to item offsets to item bodies.
58341 + Item offsets consist of pointers to the zeroth byte of the item body.
58342 + Item length equals the start of the next item's key minus the start of
58343 + this item, except the zeroth item whose length equals the end of the
58344 + node minus the start of that item (plus a byte).
58345 +
58346 + leaf compressed keys layout: (lcomp)
58347 +
58348 + |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
58349 +
58350 + We grow towards the middle, optimizing layout for the case where we
58351 + append new items to the end of the node. The node header is fixed
58352 + length. Keys and item offsets for the items corresponding to them are
58353 + in increasing key order, and keys are variable length. The "key
58354 + inherit" field indicates how much of the key prefix is identical to
58355 + the previous key (stem compression as described in "Managing
58356 + Gigabytes" is used). key_inherit is a one byte integer. The
58357 + intra-node searches performed through this layout are linear searches,
58358 + and this is theorized to not hurt performance much due to the high
58359 + cost of processor stalls on modern CPUs, and the small number of keys
58360 + in a single node. Item offsets are relative to start of node (16
58361 + bits). Item bodies are in decreasing key order. Item bodies have a
58362 + variable size. There is a one to one to one mapping of keys to item
58363 + offsets to item bodies. Item offsets consist of pointers to the
58364 + zeroth byte of the item body. Item length equals the start of the
58365 + next item minus the start of this item, except the zeroth item whose
58366 + length equals the end of the node minus the start of that item (plus a
58367 + byte). In other words, item length and key length is not recorded
58368 + anywhere, and it does not need to be since it is computable.
58369 +
58370 + internal node default layout: (idef1)
58371 +
58372 + just like ldef1 except that item bodies are either blocknrs of
58373 + children or extents, and moving them may require updating parent
58374 + pointers in the nodes that they point to.
58375 +*/
58376 +
58377 +/* There is an inherent 3-way tradeoff between optimizing and
58378 + exchanging disks between different architectures and code
58379 + complexity. This is optimal and simple and inexchangeable.
58380 + Someone else can do the code for exchanging disks and make it
58381 + complex. It would not be that hard. Using other than the PAGE_SIZE
58382 + might be suboptimal.
58383 +*/
58384 +
58385 +#if !defined( __REISER4_NODE_H__ )
58386 +#define __REISER4_NODE_H__
58387 +
58388 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
58389 +
58390 +#include "../../dformat.h"
58391 +#include "../plugin_header.h"
58392 +
58393 +#include <linux/types.h>
58394 +
58395 +typedef enum {
58396 + NS_FOUND = 0,
58397 + NS_NOT_FOUND = -ENOENT
58398 +} node_search_result;
58399 +
58400 +/* Maximal possible space overhead for creation of new item in a node */
58401 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
58402 +
58403 +typedef enum {
58404 + REISER4_NODE_DKEYS = (1 << 0),
58405 + REISER4_NODE_TREE_STABLE = (1 << 1)
58406 +} reiser4_node_check_flag;
58407 +
58408 +/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on the stack */
58409 +struct cut_list {
58410 + coord_t *from;
58411 + coord_t *to;
58412 + const reiser4_key *from_key;
58413 + const reiser4_key *to_key;
58414 + reiser4_key *smallest_removed;
58415 + carry_plugin_info *info;
58416 + __u32 flags;
58417 + struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
58418 + lock_handle *left;
58419 + lock_handle *right;
58420 +};
58421 +
58422 +struct carry_cut_data;
58423 +struct carry_kill_data;
58424 +
58425 +/* The responsibility of the node plugin is to store and give access
58426 + to the sequence of items within the node. */
58427 +typedef struct node_plugin {
58428 + /* generic plugin fields */
58429 + plugin_header h;
58430 +
58431 + /* calculates the amount of space that will be required to store an
58432 + item which is in addition to the space consumed by the item body.
58433 + (the space consumed by the item body can be gotten by calling
58434 + item->estimate) */
58435 + size_t(*item_overhead) (const znode * node, flow_t * f);
58436 +
58437 + /* returns free space by looking into node (i.e., without using
58438 + znode->free_space). */
58439 + size_t(*free_space) (znode * node);
58440 + /* search within the node for the one item which might
58441 + contain the key, invoking item->search_within to search within
58442 + that item to see if it is in there */
58443 + node_search_result(*lookup) (znode * node, const reiser4_key * key,
58444 + lookup_bias bias, coord_t * coord);
58445 + /* number of items in node */
58446 + int (*num_of_items) (const znode * node);
58447 +
58448 + /* store information about item in @coord in @data */
58449 + /* break into several node ops, don't add any more uses of this before doing so */
58450 + /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
58451 + char *(*item_by_coord) (const coord_t * coord);
58452 + int (*length_by_coord) (const coord_t * coord);
58453 + item_plugin *(*plugin_by_coord) (const coord_t * coord);
58454 +
58455 + /* store item key in @key */
58456 + reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
58457 + /* conservatively estimate whether unit of what size can fit
58458 + into node. This estimation should be performed without
58459 + actually looking into the node's content (free space is saved in
58460 + znode). */
58461 + size_t(*estimate) (znode * node);
58462 +
58463 + /* performs every consistency check the node plugin author could
58464 + imagine. Optional. */
58465 + int (*check) (const znode * node, __u32 flags, const char **error);
58466 +
58467 + /* Called when node is read into memory and node plugin is
58468 + already detected. This should read some data into znode (like free
58469 + space counter) and, optionally, check data consistency.
58470 + */
58471 + int (*parse) (znode * node);
58472 + /* This method is called on a new node to initialise plugin specific
58473 + data (header, etc.) */
58474 + int (*init) (znode * node);
58475 + /* Check whether @node content conforms to this plugin format.
58476 + Probably only useful after support for old V3.x formats is added.
58477 + Uncomment after 4.0 only.
58478 + */
58479 + /* int ( *guess )( const znode *node ); */
58480 +#if REISER4_DEBUG
58481 + void (*print) (const char *prefix, const znode * node, __u32 flags);
58482 +#endif
58483 + /* change size of @item by @by bytes. @item->node has enough free
58484 + space. When @by > 0 - free space is appended to end of item. When
58485 + @by < 0 - item is truncated - it is assumed that last @by bytes if
58486 + the item are freed already */
58487 + void (*change_item_size) (coord_t * item, int by);
58488 +
58489 + /* create new item @length bytes long in coord @target */
58490 + int (*create_item) (coord_t * target, const reiser4_key * key,
58491 + reiser4_item_data * data, carry_plugin_info * info);
58492 +
58493 + /* update key of item. */
58494 + void (*update_item_key) (coord_t * target, const reiser4_key * key,
58495 + carry_plugin_info * info);
58496 +
58497 + int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
58498 + int (*cut) (struct carry_cut_data *, carry_plugin_info *);
58499 +
58500 + /*
58501 + * shrink item pointed to by @coord by @delta bytes.
58502 + */
58503 + int (*shrink_item) (coord_t * coord, int delta);
58504 +
58505 + /* copy as much as possible but not more than up to @stop from
58506 + @stop->node to @target. If (pend == append) then data from beginning of
58507 + @stop->node are copied to the end of @target. If (pend == prepend) then
58508 + data from the end of @stop->node are copied to the beginning of
58509 + @target. Copied data are removed from @stop->node. Information
58510 + about what to do on upper level is stored in @todo */
58511 + int (*shift) (coord_t * stop, znode * target, shift_direction pend,
58512 + int delete_node, int including_insert_coord,
58513 + carry_plugin_info * info);
58514 + /* return true if this node allows skip carry() in some situations
58515 + (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
58516 + emulation doesn't.
58517 +
58518 + This will speed up insertions that don't require updates to the
58519 + parent, by bypassing initialisation of carry() structures. It's
58520 + believed that majority of insertions will fit there.
58521 +
58522 + */
58523 + int (*fast_insert) (const coord_t * coord);
58524 + int (*fast_paste) (const coord_t * coord);
58525 + int (*fast_cut) (const coord_t * coord);
58526 + /* this limits max size of item which can be inserted into a node and
58527 + number of bytes item in a node may be appended with */
58528 + int (*max_item_size) (void);
58529 + int (*prepare_removal) (znode * empty, carry_plugin_info * info);
58530 + /* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular
58531 + * files */
58532 + int (*set_item_plugin) (coord_t * coord, item_id);
58533 +} node_plugin;
58534 +
58535 +typedef enum {
58536 + /* standard unified node layout used for both leaf and internal
58537 + nodes */
58538 + NODE40_ID,
58539 + LAST_NODE_ID
58540 +} reiser4_node_id;
58541 +
58542 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
58543 +#if REISER4_DEBUG
58544 +extern void print_node_content(const char *prefix, const znode * node,
58545 + __u32 flags);
58546 +#endif
58547 +
58548 +extern void indent_znode(const znode * node);
58549 +
58550 +typedef struct common_node_header {
58551 + /*
58552 + * identifier of node plugin. Must be located at the very beginning of
58553 + * a node.
58554 + */
58555 + __le16 plugin_id;
58556 +} common_node_header;
58557 +
58558 +/* __REISER4_NODE_H__ */
58559 +#endif
58560 +/*
58561 + * Local variables:
58562 + * c-indentation-style: "K&R"
58563 + * mode-name: "LC"
58564 + * c-basic-offset: 8
58565 + * tab-width: 8
58566 + * fill-column: 79
58567 + * scroll-step: 1
58568 + * End:
58569 + */
58570 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/object.c linux-2.6.20/fs/reiser4/plugin/object.c
58571 --- linux-2.6.20.orig/fs/reiser4/plugin/object.c 1970-01-01 03:00:00.000000000 +0300
58572 +++ linux-2.6.20/fs/reiser4/plugin/object.c 2007-05-06 14:50:43.835018219 +0400
58573 @@ -0,0 +1,516 @@
58574 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58575 + * reiser4/README */
58576 +
58577 +/*
58578 + * Examples of object plugins: file, directory, symlink, special file.
58579 + *
58580 + * Plugins associated with inode:
58581 + *
58582 + * Plugin of inode is plugin referenced by plugin-id field of on-disk
58583 + * stat-data. How we store this plugin in in-core inode is not
58584 + * important. Currently pointers are used, another variant is to store offsets
58585 + * and do array lookup on each access.
58586 + *
58587 + * Now, each inode has one selected plugin: object plugin that
58588 + * determines what type of file this object is: directory, regular etc.
58589 + *
58590 + * This main plugin can use other plugins that are thus subordinated to
58591 + * it. Directory instance of object plugin uses hash; regular file
58592 + * instance uses tail policy plugin.
58593 + *
58594 + * Object plugin is either taken from id in stat-data or guessed from
58595 + * i_mode bits. Once it is established we ask it to install its
58596 + * subordinate plugins, by looking again in stat-data or inheriting them
58597 + * from parent.
58598 + *
58599 + * How new inode is initialized during ->read_inode():
58600 + * 1 read stat-data and initialize inode fields: i_size, i_mode,
58601 + * i_generation, capabilities etc.
58602 + * 2 read plugin id from stat data or try to guess plugin id
58603 + * from inode->i_mode bits if plugin id is missing.
58604 + * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
58605 + *
58606 + * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
58607 + * if stat data does contain i_size, etc., due to it being an unusual plugin?
58608 + *
58609 + * 4 Call ->activate() method of object's plugin. Plugin is either read from
58610 + * from stat-data or guessed from mode bits
58611 + * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized
58612 + * plugins from parent.
58613 + *
58614 + * Easy induction proves that on last step all plugins of inode would be
58615 + * initialized.
58616 + *
58617 + * When creating new object:
58618 + * 1 obtain object plugin id (see next period)
58619 + * NIKITA-FIXME-HANS: period?
58620 + * 2 ->install() this plugin
58621 + * 3 ->inherit() the rest from the parent
58622 + *
58623 + * We need some examples of creating an object with default and non-default
58624 + * plugin ids. Nikita, please create them.
58625 + */
58626 +
58627 +#include "../inode.h"
58628 +
58629 +static int _bugop(void)
58630 +{
58631 + BUG_ON(1);
58632 + return 0;
58633 +}
58634 +
58635 +#define bugop ((void *)_bugop)
58636 +
58637 +static int _dummyop(void)
58638 +{
58639 + return 0;
58640 +}
58641 +
58642 +#define dummyop ((void *)_dummyop)
58643 +
58644 +static int change_file(struct inode *inode,
58645 + reiser4_plugin * plugin,
58646 + pset_member memb)
58647 +{
58648 + /* cannot change object plugin of already existing object */
58649 + if (memb == PSET_FILE)
58650 + return RETERR(-EINVAL);
58651 +
58652 + /* Change PSET_CREATE */
58653 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
58654 +}
58655 +
58656 +static reiser4_plugin_ops file_plugin_ops = {
58657 + .change = change_file
58658 +};
58659 +
58660 +/*
58661 + * Definitions of object plugins.
58662 + */
58663 +
58664 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
58665 + [UNIX_FILE_PLUGIN_ID] = {
58666 + .h = {
58667 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58668 + .id = UNIX_FILE_PLUGIN_ID,
58669 + .groups = (1 << REISER4_REGULAR_FILE),
58670 + .pops = &file_plugin_ops,
58671 + .label = "reg",
58672 + .desc = "regular file",
58673 + .linkage = {NULL, NULL},
58674 + },
58675 + .inode_ops = {
58676 + .permission = reiser4_permission_common,
58677 + .setattr = setattr_unix_file,
58678 + .getattr = reiser4_getattr_common
58679 + },
58680 + .file_ops = {
58681 + .llseek = generic_file_llseek,
58682 + .read = read_unix_file,
58683 + .write = write_unix_file,
58684 + .aio_read = generic_file_aio_read,
58685 + .ioctl = ioctl_unix_file,
58686 + .mmap = mmap_unix_file,
58687 + .open = open_unix_file,
58688 + .release = release_unix_file,
58689 + .fsync = sync_unix_file,
58690 + .sendfile = sendfile_unix_file
58691 + },
58692 + .as_ops = {
58693 + .writepage = reiser4_writepage,
58694 + .readpage = readpage_unix_file,
58695 + .sync_page = block_sync_page,
58696 + .writepages = writepages_unix_file,
58697 + .set_page_dirty = reiser4_set_page_dirty,
58698 + .readpages = readpages_unix_file,
58699 + .prepare_write = prepare_write_unix_file,
58700 + .commit_write = commit_write_unix_file,
58701 + .bmap = bmap_unix_file,
58702 + .invalidatepage = reiser4_invalidatepage,
58703 + .releasepage = reiser4_releasepage
58704 + },
58705 + .write_sd_by_inode = write_sd_by_inode_common,
58706 + .flow_by_inode = flow_by_inode_unix_file,
58707 + .key_by_inode = key_by_inode_and_offset_common,
58708 + .set_plug_in_inode = set_plug_in_inode_common,
58709 + .adjust_to_parent = adjust_to_parent_common,
58710 + .create_object = reiser4_create_object_common,
58711 + .delete_object = delete_object_unix_file,
58712 + .add_link = reiser4_add_link_common,
58713 + .rem_link = reiser4_rem_link_common,
58714 + .owns_item = owns_item_unix_file,
58715 + .can_add_link = can_add_link_common,
58716 + .detach = dummyop,
58717 + .bind = dummyop,
58718 + .safelink = safelink_common,
58719 + .estimate = {
58720 + .create = estimate_create_common,
58721 + .update = estimate_update_common,
58722 + .unlink = estimate_unlink_common
58723 + },
58724 + .init_inode_data = init_inode_data_unix_file,
58725 + .cut_tree_worker = cut_tree_worker_common,
58726 + .wire = {
58727 + .write = wire_write_common,
58728 + .read = wire_read_common,
58729 + .get = wire_get_common,
58730 + .size = wire_size_common,
58731 + .done = wire_done_common
58732 + }
58733 + },
58734 + [DIRECTORY_FILE_PLUGIN_ID] = {
58735 + .h = {
58736 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58737 + .id = DIRECTORY_FILE_PLUGIN_ID,
58738 + .groups = (1 << REISER4_DIRECTORY_FILE),
58739 + .pops = &file_plugin_ops,
58740 + .label = "dir",
58741 + .desc = "directory",
58742 + .linkage = {NULL, NULL}
58743 + },
58744 + .inode_ops = {.create = NULL},
58745 + .file_ops = {.owner = NULL},
58746 + .as_ops = {.writepage = NULL},
58747 +
58748 + .write_sd_by_inode = write_sd_by_inode_common,
58749 + .flow_by_inode = bugop,
58750 + .key_by_inode = bugop,
58751 + .set_plug_in_inode = set_plug_in_inode_common,
58752 + .adjust_to_parent = adjust_to_parent_common_dir,
58753 + .create_object = reiser4_create_object_common,
58754 + .delete_object = reiser4_delete_dir_common,
58755 + .add_link = reiser4_add_link_common,
58756 + .rem_link = rem_link_common_dir,
58757 + .owns_item = owns_item_common_dir,
58758 + .can_add_link = can_add_link_common,
58759 + .can_rem_link = can_rem_link_common_dir,
58760 + .detach = reiser4_detach_common_dir,
58761 + .bind = reiser4_bind_common_dir,
58762 + .safelink = safelink_common,
58763 + .estimate = {
58764 + .create = estimate_create_common_dir,
58765 + .update = estimate_update_common,
58766 + .unlink = estimate_unlink_common_dir
58767 + },
58768 + .wire = {
58769 + .write = wire_write_common,
58770 + .read = wire_read_common,
58771 + .get = wire_get_common,
58772 + .size = wire_size_common,
58773 + .done = wire_done_common
58774 + },
58775 + .init_inode_data = init_inode_ordering,
58776 + .cut_tree_worker = cut_tree_worker_common,
58777 + },
58778 + [SYMLINK_FILE_PLUGIN_ID] = {
58779 + .h = {
58780 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58781 + .id = SYMLINK_FILE_PLUGIN_ID,
58782 + .groups = (1 << REISER4_SYMLINK_FILE),
58783 + .pops = &file_plugin_ops,
58784 + .label = "symlink",
58785 + .desc = "symbolic link",
58786 + .linkage = {NULL,NULL}
58787 + },
58788 + .inode_ops = {
58789 + .readlink = generic_readlink,
58790 + .follow_link = reiser4_follow_link_common,
58791 + .permission = reiser4_permission_common,
58792 + .setattr = reiser4_setattr_common,
58793 + .getattr = reiser4_getattr_common
58794 + },
58795 + /* inode->i_fop of symlink is initialized by NULL in setup_inode_ops */
58796 + .file_ops = {.owner = NULL},
58797 + .as_ops = {.writepage = NULL},
58798 +
58799 + .write_sd_by_inode = write_sd_by_inode_common,
58800 + .set_plug_in_inode = set_plug_in_inode_common,
58801 + .adjust_to_parent = adjust_to_parent_common,
58802 + .create_object = reiser4_create_symlink,
58803 + .delete_object = reiser4_delete_object_common,
58804 + .add_link = reiser4_add_link_common,
58805 + .rem_link = reiser4_rem_link_common,
58806 + .can_add_link = can_add_link_common,
58807 + .detach = dummyop,
58808 + .bind = dummyop,
58809 + .safelink = safelink_common,
58810 + .estimate = {
58811 + .create = estimate_create_common,
58812 + .update = estimate_update_common,
58813 + .unlink = estimate_unlink_common
58814 + },
58815 + .init_inode_data = init_inode_ordering,
58816 + .cut_tree_worker = cut_tree_worker_common,
58817 + .destroy_inode = destroy_inode_symlink,
58818 + .wire = {
58819 + .write = wire_write_common,
58820 + .read = wire_read_common,
58821 + .get = wire_get_common,
58822 + .size = wire_size_common,
58823 + .done = wire_done_common
58824 + }
58825 + },
58826 + [SPECIAL_FILE_PLUGIN_ID] = {
58827 + .h = {
58828 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58829 + .id = SPECIAL_FILE_PLUGIN_ID,
58830 + .groups = (1 << REISER4_SPECIAL_FILE),
58831 + .pops = &file_plugin_ops,
58832 + .label = "special",
58833 + .desc =
58834 + "special: fifo, device or socket",
58835 + .linkage = {NULL, NULL}
58836 + },
58837 + .inode_ops = {
58838 + .permission = reiser4_permission_common,
58839 + .setattr = reiser4_setattr_common,
58840 + .getattr = reiser4_getattr_common
58841 + },
58842 + /* file_ops of special files (sockets, block, char, fifo) are
58843 + initialized by init_special_inode. */
58844 + .file_ops = {.owner = NULL},
58845 + .as_ops = {.writepage = NULL},
58846 +
58847 + .write_sd_by_inode = write_sd_by_inode_common,
58848 + .set_plug_in_inode = set_plug_in_inode_common,
58849 + .adjust_to_parent = adjust_to_parent_common,
58850 + .create_object = reiser4_create_object_common,
58851 + .delete_object = reiser4_delete_object_common,
58852 + .add_link = reiser4_add_link_common,
58853 + .rem_link = reiser4_rem_link_common,
58854 + .owns_item = owns_item_common,
58855 + .can_add_link = can_add_link_common,
58856 + .detach = dummyop,
58857 + .bind = dummyop,
58858 + .safelink = safelink_common,
58859 + .estimate = {
58860 + .create = estimate_create_common,
58861 + .update = estimate_update_common,
58862 + .unlink = estimate_unlink_common
58863 + },
58864 + .init_inode_data = init_inode_ordering,
58865 + .cut_tree_worker = cut_tree_worker_common,
58866 + .wire = {
58867 + .write = wire_write_common,
58868 + .read = wire_read_common,
58869 + .get = wire_get_common,
58870 + .size = wire_size_common,
58871 + .done = wire_done_common
58872 + }
58873 + },
58874 + [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
58875 + .h = {
58876 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58877 + .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
58878 + .groups = (1 << REISER4_REGULAR_FILE),
58879 + .pops = &file_plugin_ops,
58880 + .label = "cryptcompress",
58881 + .desc = "cryptcompress file",
58882 + .linkage = {NULL, NULL}
58883 + },
58884 + .inode_ops = {
58885 + .permission = reiser4_permission_common,
58886 + .setattr = prot_setattr_cryptcompress,
58887 + .getattr = reiser4_getattr_common
58888 + },
58889 + .file_ops = {
58890 + .llseek = generic_file_llseek,
58891 + .read = prot_read_cryptcompress,
58892 + .write = prot_write_cryptcompress,
58893 + .aio_read = generic_file_aio_read,
58894 + .mmap = prot_mmap_cryptcompress,
58895 + .release = prot_release_cryptcompress,
58896 + .fsync = reiser4_sync_common,
58897 + .sendfile = prot_sendfile_cryptcompress
58898 + },
58899 + .as_ops = {
58900 + .writepage = reiser4_writepage,
58901 + .readpage = readpage_cryptcompress,
58902 + .sync_page = block_sync_page,
58903 + .writepages = writepages_cryptcompress,
58904 + .set_page_dirty = reiser4_set_page_dirty,
58905 + .readpages = readpages_cryptcompress,
58906 + .prepare_write = prepare_write_common,
58907 + .invalidatepage = reiser4_invalidatepage,
58908 + .releasepage = reiser4_releasepage
58909 + },
58910 + .write_sd_by_inode = write_sd_by_inode_common,
58911 + .flow_by_inode = flow_by_inode_cryptcompress,
58912 + .key_by_inode = key_by_inode_cryptcompress,
58913 + .set_plug_in_inode = set_plug_in_inode_common,
58914 + .adjust_to_parent = adjust_to_parent_cryptcompress,
58915 + .create_object = create_cryptcompress,
58916 + .open_object = open_object_cryptcompress,
58917 + .delete_object = delete_object_cryptcompress,
58918 + .add_link = reiser4_add_link_common,
58919 + .rem_link = reiser4_rem_link_common,
58920 + .owns_item = owns_item_common,
58921 + .can_add_link = can_add_link_common,
58922 + .detach = dummyop,
58923 + .bind = dummyop,
58924 + .safelink = safelink_common,
58925 + .estimate = {
58926 + .create = estimate_create_common,
58927 + .update = estimate_update_common,
58928 + .unlink = estimate_unlink_common
58929 + },
58930 + .init_inode_data = init_inode_data_cryptcompress,
58931 + .cut_tree_worker = cut_tree_worker_cryptcompress,
58932 + .destroy_inode = destroy_inode_cryptcompress,
58933 + .wire = {
58934 + .write = wire_write_common,
58935 + .read = wire_read_common,
58936 + .get = wire_get_common,
58937 + .size = wire_size_common,
58938 + .done = wire_done_common
58939 + }
58940 + }
58941 +};
58942 +
58943 +static int change_dir(struct inode *inode,
58944 + reiser4_plugin * plugin,
58945 + pset_member memb)
58946 +{
58947 + /* cannot change dir plugin of already existing object */
58948 + return RETERR(-EINVAL);
58949 +}
58950 +
58951 +static reiser4_plugin_ops dir_plugin_ops = {
58952 + .change = change_dir
58953 +};
58954 +
58955 +/*
58956 + * definition of directory plugins
58957 + */
58958 +
58959 +dir_plugin dir_plugins[LAST_DIR_ID] = {
58960 + /* standard hashed directory plugin */
58961 + [HASHED_DIR_PLUGIN_ID] = {
58962 + .h = {
58963 + .type_id = REISER4_DIR_PLUGIN_TYPE,
58964 + .id = HASHED_DIR_PLUGIN_ID,
58965 + .pops = &dir_plugin_ops,
58966 + .label = "dir",
58967 + .desc = "hashed directory",
58968 + .linkage = {NULL, NULL}
58969 + },
58970 + .inode_ops = {
58971 + .create = reiser4_create_common,
58972 + .lookup = reiser4_lookup_common,
58973 + .link = reiser4_link_common,
58974 + .unlink = reiser4_unlink_common,
58975 + .symlink = reiser4_symlink_common,
58976 + .mkdir = reiser4_mkdir_common,
58977 + .rmdir = reiser4_unlink_common,
58978 + .mknod = reiser4_mknod_common,
58979 + .rename = reiser4_rename_common,
58980 + .permission = reiser4_permission_common,
58981 + .setattr = reiser4_setattr_common,
58982 + .getattr = reiser4_getattr_common
58983 + },
58984 + .file_ops = {
58985 + .llseek = reiser4_llseek_dir_common,
58986 + .read = generic_read_dir,
58987 + .readdir = reiser4_readdir_common,
58988 + .release = reiser4_release_dir_common,
58989 + .fsync = reiser4_sync_common
58990 + },
58991 + .as_ops = {
58992 + .writepage = bugop,
58993 + .sync_page = bugop,
58994 + .writepages = dummyop,
58995 + .set_page_dirty = bugop,
58996 + .readpages = bugop,
58997 + .prepare_write = bugop,
58998 + .commit_write = bugop,
58999 + .bmap = bugop,
59000 + .invalidatepage = bugop,
59001 + .releasepage = bugop
59002 + },
59003 + .get_parent = get_parent_common,
59004 + .is_name_acceptable = is_name_acceptable_common,
59005 + .build_entry_key = build_entry_key_hashed,
59006 + .build_readdir_key = build_readdir_key_common,
59007 + .add_entry = reiser4_add_entry_common,
59008 + .rem_entry = reiser4_rem_entry_common,
59009 + .init = reiser4_dir_init_common,
59010 + .done = reiser4_dir_done_common,
59011 + .attach = reiser4_attach_common,
59012 + .detach = reiser4_detach_common,
59013 + .estimate = {
59014 + .add_entry = estimate_add_entry_common,
59015 + .rem_entry = estimate_rem_entry_common,
59016 + .unlink = dir_estimate_unlink_common
59017 + }
59018 + },
59019 + /* hashed directory for which seekdir/telldir are guaranteed to
59020 + * work. Brain-damage. */
59021 + [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
59022 + .h = {
59023 + .type_id = REISER4_DIR_PLUGIN_TYPE,
59024 + .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
59025 + .pops = &dir_plugin_ops,
59026 + .label = "dir32",
59027 + .desc = "directory hashed with 31 bit hash",
59028 + .linkage = {NULL, NULL}
59029 + },
59030 + .inode_ops = {
59031 + .create = reiser4_create_common,
59032 + .lookup = reiser4_lookup_common,
59033 + .link = reiser4_link_common,
59034 + .unlink = reiser4_unlink_common,
59035 + .symlink = reiser4_symlink_common,
59036 + .mkdir = reiser4_mkdir_common,
59037 + .rmdir = reiser4_unlink_common,
59038 + .mknod = reiser4_mknod_common,
59039 + .rename = reiser4_rename_common,
59040 + .permission = reiser4_permission_common,
59041 + .setattr = reiser4_setattr_common,
59042 + .getattr = reiser4_getattr_common
59043 + },
59044 + .file_ops = {
59045 + .llseek = reiser4_llseek_dir_common,
59046 + .read = generic_read_dir,
59047 + .readdir = reiser4_readdir_common,
59048 + .release = reiser4_release_dir_common,
59049 + .fsync = reiser4_sync_common
59050 + },
59051 + .as_ops = {
59052 + .writepage = bugop,
59053 + .sync_page = bugop,
59054 + .writepages = dummyop,
59055 + .set_page_dirty = bugop,
59056 + .readpages = bugop,
59057 + .prepare_write = bugop,
59058 + .commit_write = bugop,
59059 + .bmap = bugop,
59060 + .invalidatepage = bugop,
59061 + .releasepage = bugop
59062 + },
59063 + .get_parent = get_parent_common,
59064 + .is_name_acceptable = is_name_acceptable_common,
59065 + .build_entry_key = build_entry_key_seekable,
59066 + .build_readdir_key = build_readdir_key_common,
59067 + .add_entry = reiser4_add_entry_common,
59068 + .rem_entry = reiser4_rem_entry_common,
59069 + .init = reiser4_dir_init_common,
59070 + .done = reiser4_dir_done_common,
59071 + .attach = reiser4_attach_common,
59072 + .detach = reiser4_detach_common,
59073 + .estimate = {
59074 + .add_entry = estimate_add_entry_common,
59075 + .rem_entry = estimate_rem_entry_common,
59076 + .unlink = dir_estimate_unlink_common
59077 + }
59078 + }
59079 +};
59080 +
59081 +/* Make Linus happy.
59082 + Local variables:
59083 + c-indentation-style: "K&R"
59084 + mode-name: "LC"
59085 + c-basic-offset: 8
59086 + tab-width: 8
59087 + fill-column: 120
59088 + End:
59089 +*/
59090 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/object.h linux-2.6.20/fs/reiser4/plugin/object.h
59091 --- linux-2.6.20.orig/fs/reiser4/plugin/object.h 1970-01-01 03:00:00.000000000 +0300
59092 +++ linux-2.6.20/fs/reiser4/plugin/object.h 2007-05-06 14:50:43.839019469 +0400
59093 @@ -0,0 +1,121 @@
59094 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
59095 + * reiser4/README */
59096 +
59097 +/* Declaration of object plugin functions. */
59098 +
59099 +#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
59100 +#define __FS_REISER4_PLUGIN_OBJECT_H__
59101 +
59102 +#include "../type_safe_hash.h"
59103 +
59104 +/* common implementations of inode operations */
59105 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
59106 + int mode, struct nameidata *);
59107 +struct dentry * reiser4_lookup_common(struct inode *parent,
59108 + struct dentry *dentry,
59109 + struct nameidata *nameidata);
59110 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
59111 + struct dentry *newname);
59112 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
59113 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
59114 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
59115 + const char *linkname);
59116 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
59117 + int mode, dev_t rdev);
59118 +int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
59119 + struct inode *new_dir, struct dentry *new_name);
59120 +void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
59121 +int reiser4_permission_common(struct inode *, int mask,
59122 + struct nameidata *nameidata);
59123 +int reiser4_setattr_common(struct dentry *, struct iattr *);
59124 +int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
59125 + struct kstat *);
59126 +
59127 +/* common implementations of file operations */
59128 +loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
59129 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
59130 +int reiser4_release_dir_common(struct inode *, struct file *);
59131 +int reiser4_sync_common(struct file *, struct dentry *, int datasync);
59132 +
59133 +/* common implementations of address space operations */
59134 +int prepare_write_common(struct file *, struct page *, unsigned from,
59135 + unsigned to);
59136 +
59137 +/* file plugin operations: common implementations */
59138 +int write_sd_by_inode_common(struct inode *);
59139 +int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
59140 +int set_plug_in_inode_common(struct inode *object, struct inode *parent,
59141 + reiser4_object_create_data *);
59142 +int adjust_to_parent_common(struct inode *object, struct inode *parent,
59143 + struct inode *root);
59144 +int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
59145 + struct inode *root);
59146 +int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
59147 + struct inode *root);
59148 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
59149 + reiser4_object_create_data *);
59150 +int reiser4_delete_object_common(struct inode *);
59151 +int reiser4_delete_dir_common(struct inode *);
59152 +int reiser4_add_link_common(struct inode *object, struct inode *parent);
59153 +int reiser4_rem_link_common(struct inode *object, struct inode *parent);
59154 +int rem_link_common_dir(struct inode *object, struct inode *parent);
59155 +int owns_item_common(const struct inode *, const coord_t *);
59156 +int owns_item_common_dir(const struct inode *, const coord_t *);
59157 +int can_add_link_common(const struct inode *);
59158 +int can_rem_link_common_dir(const struct inode *);
59159 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
59160 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
59161 +int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
59162 +reiser4_block_nr estimate_create_common(const struct inode *);
59163 +reiser4_block_nr estimate_create_common_dir(const struct inode *);
59164 +reiser4_block_nr estimate_update_common(const struct inode *);
59165 +reiser4_block_nr estimate_unlink_common(const struct inode *,
59166 + const struct inode *);
59167 +reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
59168 + const struct inode *);
59169 +char *wire_write_common(struct inode *, char *start);
59170 +char *wire_read_common(char *addr, reiser4_object_on_wire *);
59171 +struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
59172 +int wire_size_common(struct inode *);
59173 +void wire_done_common(reiser4_object_on_wire *);
59174 +
59175 +/* dir plugin operations: common implementations */
59176 +struct dentry *get_parent_common(struct inode *child);
59177 +int is_name_acceptable_common(const struct inode *, const char *name, int len);
59178 +void build_entry_key_common(const struct inode *,
59179 + const struct qstr *qname, reiser4_key *);
59180 +int build_readdir_key_common(struct file *dir, reiser4_key *);
59181 +int reiser4_add_entry_common(struct inode *object, struct dentry *where,
59182 + reiser4_object_create_data *, reiser4_dir_entry_desc *);
59183 +int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
59184 + reiser4_dir_entry_desc *);
59185 +int reiser4_dir_init_common(struct inode *object, struct inode *parent,
59186 + reiser4_object_create_data *);
59187 +int reiser4_dir_done_common(struct inode *);
59188 +int reiser4_attach_common(struct inode *child, struct inode *parent);
59189 +int reiser4_detach_common(struct inode *object, struct inode *parent);
59190 +reiser4_block_nr estimate_add_entry_common(const struct inode *);
59191 +reiser4_block_nr estimate_rem_entry_common(const struct inode *);
59192 +reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
59193 + const struct inode *);
59194 +
59195 +/* these are essential parts of common implementations, they are to make
59196 + customized implementations easier */
59197 +int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
59198 +
59199 +/* merely useful functions */
59200 +int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
59201 + const reiser4_key *, int silent);
59202 +
59203 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
59204 +#endif
59205 +
59206 +/* Make Linus happy.
59207 + Local variables:
59208 + c-indentation-style: "K&R"
59209 + mode-name: "LC"
59210 + c-basic-offset: 8
59211 + tab-width: 8
59212 + fill-column: 120
59213 + End:
59214 +*/
59215 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin.c linux-2.6.20/fs/reiser4/plugin/plugin.c
59216 --- linux-2.6.20.orig/fs/reiser4/plugin/plugin.c 1970-01-01 03:00:00.000000000 +0300
59217 +++ linux-2.6.20/fs/reiser4/plugin/plugin.c 2007-05-06 14:50:43.839019469 +0400
59218 @@ -0,0 +1,578 @@
59219 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59220 + * reiser4/README */
59221 +
59222 +/* Basic plugin infrastructure, lookup etc. */
59223 +
59224 +/* PLUGINS:
59225 +
59226 + Plugins are internal Reiser4 "modules" or "objects" used to increase
59227 + extensibility and allow external users to easily adapt reiser4 to
59228 + their needs.
59229 +
59230 + Plugins are classified into several disjoint "types". Plugins
59231 + belonging to the particular plugin type are termed "instances" of
59232 + this type. Currently the following types are present:
59233 +
59234 + . object plugin
59235 + . hash plugin
59236 + . tail plugin
59237 + . perm plugin
59238 + . item plugin
59239 + . node layout plugin
59240 +
59241 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
59242 +
59243 + Object (file) plugin determines how given file-system object serves
59244 + standard VFS requests for read, write, seek, mmap etc. Instances of
59245 + file plugins are: regular file, directory, symlink. Another example
59246 + of file plugin is audit plugin, that optionally records accesses to
59247 + underlying object and forwards requests to it.
59248 +
59249 + Hash plugins compute hashes used by reiser4 to store and locate
59250 + files within directories. Instances of hash plugin type are: r5,
59251 + tea, rupasov.
59252 +
59253 + Tail plugins (or, more precisely, tail policy plugins) determine
59254 + when last part of the file should be stored in a formatted item.
59255 +
59256 + Perm plugins control permissions granted for a process accessing a file.
59257 +
59258 + Scope and lookup:
59259 +
59260 + label such that pair ( type_label, plugin_label ) is unique. This
59261 + pair is a globally persistent and user-visible plugin
59262 + identifier. Internally kernel maintains plugins and plugin types in
59263 + arrays using an index into those arrays as plugin and plugin type
59264 + identifiers. File-system in turn, also maintains persistent
59265 + "dictionary" which is mapping from plugin label to numerical
59266 + identifier which is stored in file-system objects. That is, we
59267 + store the offset into the plugin array for that plugin type as the
59268 + plugin id in the stat data of the filesystem object.
59269 +
59270 + plugin_labels have meaning for the user interface that assigns
59271 + plugins to files, and may someday have meaning for dynamic loading of
59272 + plugins and for copying of plugins from one fs instance to
59273 + another by utilities like cp and tar.
59274 +
59275 + Internal kernel plugin type identifier (index in plugins[] array) is
59276 + of type reiser4_plugin_type. Set of available plugin types is
59277 + currently static, but dynamic loading doesn't seem to pose
59278 + insurmountable problems.
59279 +
59280 + Within each type plugins are addressed by the identifiers of type
59281 + reiser4_plugin_id (indices in
59282 + reiser4_plugin_type_data.builtin[]). Such identifiers are only
59283 + required to be unique within one type, not globally.
59284 +
59285 + Thus, plugin in memory is uniquely identified by the pair (type_id,
59286 + id).
59287 +
59288 + Usage:
59289 +
59290 + There exists only one instance of each plugin instance, but this
59291 + single instance can be associated with many entities (file-system
59292 + objects, items, nodes, transactions, file-descriptors etc.). Entity
59293 + to which plugin of given type is termed (due to the lack of
59294 + imagination) "subject" of this plugin type and, by abuse of
59295 + terminology, subject of particular instance of this type to which
59296 + it's attached currently. For example, inode is subject of object
59297 + plugin type. Inode representing directory is subject of directory
59298 + plugin, hash plugin type and some particular instance of hash plugin
59299 + type. Inode, representing regular file is subject of "regular file"
59300 + plugin, tail-policy plugin type etc.
59301 +
59302 + With each subject the plugin possibly stores some state. For example,
59303 + the state of a directory plugin (instance of object plugin type) is pointer
59304 + to hash plugin (if directories always use hashing that is). State of
59305 + audit plugin is file descriptor (struct file) of log file or some
59306 + magic value to do logging through printk().
59307 +
59308 + Interface:
59309 +
59310 + In addition to a scalar identifier, each plugin type and plugin
59311 + proper has a "label": short string and a "description"---longer
59312 + descriptive string. Labels and descriptions of plugin types are
59313 + hard-coded into plugins[] array, declared and defined in
59314 + plugin.c. Label and description of plugin are stored in .label and
59315 + .desc fields of reiser4_plugin_header respectively. It's possible to
59316 + locate plugin by the pair of labels.
59317 +
59318 + Features:
59319 +
59320 + . user-level plugin manipulations:
59321 + + reiser4("filename/..file_plugin<='audit'");
59322 + + write(open("filename/..file_plugin"), "audit", 8);
59323 +
59324 + . user level utilities lsplug and chplug to manipulate plugins.
59325 + Utilities are not of primary priority. Possibly they will not be
59326 + working on v4.0
59327 +
59328 +NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount option, do you agree? I don't think that specifying it at mount time, and then changing it with each mount, is a good model for usage.
59329 +
59330 + . mount option "plug" to set-up plugins of root-directory.
59331 + "plug=foo:bar" will set "bar" as default plugin of type "foo".
59332 +
59333 + Limitations:
59334 +
59335 + . each plugin type has to provide at least one builtin
59336 + plugin. This is technical limitation and it can be lifted in the
59337 + future.
59338 +
59339 + TODO:
59340 +
59341 + New plugin types/plugins:
59342 + Things we should be able to separately choose to inherit:
59343 +
59344 + security plugins
59345 +
59346 + stat data
59347 +
59348 + file bodies
59349 +
59350 + file plugins
59351 +
59352 + dir plugins
59353 +
59354 + . perm:acl
59355 +
59356 + d audi---audit plugin intercepting and possibly logging all
59357 + accesses to object. Requires to put stub functions in file_operations
59358 + instead of generic_file_*.
59359 +
59360 +NIKITA-FIXME-HANS: why make overflows a plugin?
59361 + . over---handle hash overflows
59362 +
59363 + . sqnt---handle different access patterns and instruments read-ahead
59364 +
59365 +NIKITA-FIXME-HANS: describe the line below in more detail.
59366 +
59367 + . hier---handle inheritance of plugins along file-system hierarchy
59368 +
59369 + Different kinds of inheritance: on creation vs. on access.
59370 + Compatible/incompatible plugins.
59371 + Inheritance for multi-linked files.
59372 + Layered plugins.
59373 + Notion of plugin context is abandoned.
59374 +
59375 +Each file is associated
59376 + with one plugin and dependent plugins (hash, etc.) are stored as
59377 + main plugin state. Now, if we have plugins used for regular files
59378 + but not for directories, how such plugins would be inherited?
59379 + . always store them with directories also
59380 +
59381 +NIKTIA-FIXME-HANS: Do the line above. It is not exclusive of doing the line below which is also useful.
59382 +
59383 + . use inheritance hierarchy, independent of file-system namespace
59384 +
59385 +*/
59386 +
59387 +#include "../debug.h"
59388 +#include "../dformat.h"
59389 +#include "plugin_header.h"
59390 +#include "item/static_stat.h"
59391 +#include "node/node.h"
59392 +#include "security/perm.h"
59393 +#include "space/space_allocator.h"
59394 +#include "disk_format/disk_format.h"
59395 +#include "plugin.h"
59396 +#include "../reiser4.h"
59397 +#include "../jnode.h"
59398 +#include "../inode.h"
59399 +
59400 +#include <linux/fs.h> /* for struct super_block */
59401 +
59402 +/* public interface */
59403 +
59404 +/* initialise plugin sub-system. Just call this once on reiser4 startup. */
59405 +int init_plugins(void);
59406 +int setup_plugins(struct super_block *super, reiser4_plugin ** area);
59407 +int locate_plugin(struct inode *inode, plugin_locator * loc);
59408 +
59409 +/**
59410 + * init_plugins - initialize plugins
59411 + *
59412 + * Initializes plugin sub-system. It is part of reiser4 module
59413 + * initialization. For each plugin of each type init method is called and each
59414 + * plugin is put into list of plugins.
59415 + */
59416 +int init_plugins(void)
59417 +{
59418 + reiser4_plugin_type type_id;
59419 +
59420 + for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
59421 + reiser4_plugin_type_data *ptype;
59422 + int i;
59423 +
59424 + ptype = &plugins[type_id];
59425 + assert("nikita-3508", ptype->label != NULL);
59426 + assert("nikita-3509", ptype->type_id == type_id);
59427 +
59428 + INIT_LIST_HEAD(&ptype->plugins_list);
59429 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
59430 + for (i = 0; i < ptype->builtin_num; ++i) {
59431 + reiser4_plugin *plugin;
59432 +
59433 + plugin = plugin_at(ptype, i);
59434 +
59435 + if (plugin->h.label == NULL)
59436 + /* uninitialized slot encountered */
59437 + continue;
59438 + assert("nikita-3445", plugin->h.type_id == type_id);
59439 + plugin->h.id = i;
59440 + if (plugin->h.pops != NULL &&
59441 + plugin->h.pops->init != NULL) {
59442 + int result;
59443 +
59444 + result = plugin->h.pops->init(plugin);
59445 + if (result != 0)
59446 + return result;
59447 + }
59448 + INIT_LIST_HEAD(&plugin->h.linkage);
59449 + list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
59450 + }
59451 + }
59452 + return 0;
59453 +}
59454 +
59455 +/* true if plugin type id is valid */
59456 +int is_plugin_type_valid(reiser4_plugin_type type)
59457 +{
59458 + /* "type" is unsigned, so no comparison with 0 is
59459 + necessary */
59460 + return (type < REISER4_PLUGIN_TYPES);
59461 +}
59462 +
59463 +/* true if plugin id is valid */
59464 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
59465 +{
59466 + assert("nikita-1653", is_plugin_type_valid(type));
59467 + return id < plugins[type].builtin_num;
59468 +}
59469 +
59470 +/* return plugin by its @type and @id.
59471 +
59472 + Both arguments are checked for validness: this is supposed to be called
59473 + from user-level.
59474 +
59475 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
59476 +user space, and passed to the filesystem by use of method files? Your
59477 +comment really confused me on the first reading....
59478 +
59479 +*/
59480 +reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
59481 + * unchecked */,
59482 + reiser4_plugin_id id /* plugin id,
59483 + * unchecked */)
59484 +{
59485 + if (is_plugin_type_valid(type)) {
59486 + if (is_plugin_id_valid(type, id))
59487 + return plugin_at(&plugins[type], id);
59488 + else
59489 + /* id out of bounds */
59490 + warning("nikita-2913",
59491 + "Invalid plugin id: [%i:%i]", type, id);
59492 + } else
59493 + /* type_id out of bounds */
59494 + warning("nikita-2914", "Invalid type_id: %i", type);
59495 + return NULL;
59496 +}
59497 +
59498 +/**
59499 + * save_plugin_id - store plugin id in disk format
59500 + * @plugin: plugin to convert
59501 + * @area: where to store result
59502 + *
59503 + * Puts id of @plugin in little endian format to address @area.
59504 + */
59505 +int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
59506 + d16 *area /* where to store result */ )
59507 +{
59508 + assert("nikita-1261", plugin != NULL);
59509 + assert("nikita-1262", area != NULL);
59510 +
59511 + put_unaligned(cpu_to_le16(plugin->h.id), area);
59512 + return 0;
59513 +}
59514 +
59515 +/* list of all plugins of given type */
59516 +struct list_head *get_plugin_list(reiser4_plugin_type type)
59517 +{
59518 + assert("nikita-1056", is_plugin_type_valid(type));
59519 + return &plugins[type].plugins_list;
59520 +}
59521 +
59522 +static void update_pset_mask(reiser4_inode * info, pset_member memb)
59523 +{
59524 + struct dentry *rootdir;
59525 + reiser4_inode *root;
59526 +
59527 + assert("edward-1443", memb != PSET_FILE);
59528 +
59529 + rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
59530 + if (rootdir != NULL) {
59531 + root = reiser4_inode_data(rootdir->d_inode);
59532 + /*
59533 + * if inode is different from the default one, or we are
59534 + * changing plugin of root directory, update plugin_mask
59535 + */
59536 + if (aset_get(info->pset, memb) !=
59537 + aset_get(root->pset, memb) ||
59538 + info == root)
59539 + info->plugin_mask |= (1 << memb);
59540 + else
59541 + info->plugin_mask &= ~(1 << memb);
59542 + }
59543 +}
59544 +
59545 +/* Get specified plugin set member from parent,
59546 + or from fs-defaults (if no parent is given) and
59547 + install the result to pset of @self */
59548 +int grab_plugin_pset(struct inode *self,
59549 + struct inode *ancestor,
59550 + pset_member memb)
59551 +{
59552 + reiser4_plugin *plug;
59553 + reiser4_inode *info;
59554 + int result = 0;
59555 +
59556 + /* Do not grab if initialised already. */
59557 + info = reiser4_inode_data(self);
59558 + if (aset_get(info->pset, memb) != NULL)
59559 + return 0;
59560 + if (ancestor) {
59561 + reiser4_inode *parent;
59562 +
59563 + parent = reiser4_inode_data(ancestor);
59564 + plug = aset_get(parent->hset, memb) ? :
59565 + aset_get(parent->pset, memb);
59566 + }
59567 + else
59568 + plug = get_default_plugin(memb);
59569 +
59570 + result = set_plugin(&info->pset, memb, plug);
59571 + if (result == 0) {
59572 + if (!ancestor || self->i_sb->s_root->d_inode != self)
59573 + update_pset_mask(info, memb);
59574 + }
59575 + return result;
59576 +}
59577 +
59578 +/* Take missing pset members from root inode */
59579 +int finish_pset(struct inode *inode)
59580 +{
59581 + reiser4_plugin *plug;
59582 + reiser4_inode *root;
59583 + reiser4_inode *info;
59584 + pset_member memb;
59585 + int result = 0;
59586 +
59587 + root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
59588 + info = reiser4_inode_data(inode);
59589 +
59590 + assert("edward-1455", root != NULL);
59591 + assert("edward-1456", info != NULL);
59592 +
59593 + /* file and directory plugins are already initialized. */
59594 + for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
59595 +
59596 + /* Do not grab if initialised already. */
59597 + if (aset_get(info->pset, memb) != NULL)
59598 + continue;
59599 +
59600 + plug = aset_get(root->pset, memb);
59601 + result = set_plugin(&info->pset, memb, plug);
59602 + if (result != 0)
59603 + break;
59604 + }
59605 + if (result != 0) {
59606 + warning("nikita-3447",
59607 + "Cannot set up plugins for %lli",
59608 + (unsigned long long)
59609 + get_inode_oid(inode));
59610 + }
59611 + return result;
59612 +}
59613 +
59614 +int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug)
59615 +{
59616 + reiser4_inode *info;
59617 + int result = 0;
59618 +
59619 + if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
59620 + /* Changing pset in the root object. */
59621 + return RETERR(-EINVAL);
59622 + }
59623 +
59624 + info = reiser4_inode_data(self);
59625 + if (plug->h.pops != NULL && plug->h.pops->change != NULL)
59626 + result = plug->h.pops->change(self, plug, memb);
59627 + else
59628 + result = aset_set_unsafe(&info->pset, memb, plug);
59629 + if (result == 0) {
59630 + __u16 oldmask = info->plugin_mask;
59631 +
59632 + update_pset_mask(info, memb);
59633 + if (oldmask != info->plugin_mask)
59634 + reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
59635 + }
59636 + return result;
59637 +}
59638 +
59639 +reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
59640 + /* C90 initializers */
59641 + [REISER4_FILE_PLUGIN_TYPE] = {
59642 + .type_id = REISER4_FILE_PLUGIN_TYPE,
59643 + .label = "file",
59644 + .desc = "Object plugins",
59645 + .builtin_num = sizeof_array(file_plugins),
59646 + .builtin = file_plugins,
59647 + .plugins_list = {NULL, NULL},
59648 + .size = sizeof(file_plugin)
59649 + },
59650 + [REISER4_DIR_PLUGIN_TYPE] = {
59651 + .type_id = REISER4_DIR_PLUGIN_TYPE,
59652 + .label = "dir",
59653 + .desc = "Directory plugins",
59654 + .builtin_num = sizeof_array(dir_plugins),
59655 + .builtin = dir_plugins,
59656 + .plugins_list = {NULL, NULL},
59657 + .size = sizeof(dir_plugin)
59658 + },
59659 + [REISER4_HASH_PLUGIN_TYPE] = {
59660 + .type_id = REISER4_HASH_PLUGIN_TYPE,
59661 + .label = "hash",
59662 + .desc = "Directory hashes",
59663 + .builtin_num = sizeof_array(hash_plugins),
59664 + .builtin = hash_plugins,
59665 + .plugins_list = {NULL, NULL},
59666 + .size = sizeof(hash_plugin)
59667 + },
59668 + [REISER4_FIBRATION_PLUGIN_TYPE] = {
59669 + .type_id =
59670 + REISER4_FIBRATION_PLUGIN_TYPE,
59671 + .label = "fibration",
59672 + .desc = "Directory fibrations",
59673 + .builtin_num = sizeof_array(fibration_plugins),
59674 + .builtin = fibration_plugins,
59675 + .plugins_list = {NULL, NULL},
59676 + .size = sizeof(fibration_plugin)
59677 + },
59678 + [REISER4_CIPHER_PLUGIN_TYPE] = {
59679 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
59680 + .label = "cipher",
59681 + .desc = "Cipher plugins",
59682 + .builtin_num = sizeof_array(cipher_plugins),
59683 + .builtin = cipher_plugins,
59684 + .plugins_list = {NULL, NULL},
59685 + .size = sizeof(cipher_plugin)
59686 + },
59687 + [REISER4_DIGEST_PLUGIN_TYPE] = {
59688 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
59689 + .label = "digest",
59690 + .desc = "Digest plugins",
59691 + .builtin_num = sizeof_array(digest_plugins),
59692 + .builtin = digest_plugins,
59693 + .plugins_list = {NULL, NULL},
59694 + .size = sizeof(digest_plugin)
59695 + },
59696 + [REISER4_COMPRESSION_PLUGIN_TYPE] = {
59697 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
59698 + .label = "compression",
59699 + .desc = "Compression plugins",
59700 + .builtin_num = sizeof_array(compression_plugins),
59701 + .builtin = compression_plugins,
59702 + .plugins_list = {NULL, NULL},
59703 + .size = sizeof(compression_plugin)
59704 + },
59705 + [REISER4_FORMATTING_PLUGIN_TYPE] = {
59706 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
59707 + .label = "formatting",
59708 + .desc = "Tail inlining policies",
59709 + .builtin_num = sizeof_array(formatting_plugins),
59710 + .builtin = formatting_plugins,
59711 + .plugins_list = {NULL, NULL},
59712 + .size = sizeof(formatting_plugin)
59713 + },
59714 + [REISER4_PERM_PLUGIN_TYPE] = {
59715 + .type_id = REISER4_PERM_PLUGIN_TYPE,
59716 + .label = "perm",
59717 + .desc = "Permission checks",
59718 + .builtin_num = sizeof_array(perm_plugins),
59719 + .builtin = perm_plugins,
59720 + .plugins_list = {NULL, NULL},
59721 + .size = sizeof(perm_plugin)
59722 + },
59723 + [REISER4_ITEM_PLUGIN_TYPE] = {
59724 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
59725 + .label = "item",
59726 + .desc = "Item handlers",
59727 + .builtin_num = sizeof_array(item_plugins),
59728 + .builtin = item_plugins,
59729 + .plugins_list = {NULL, NULL},
59730 + .size = sizeof(item_plugin)
59731 + },
59732 + [REISER4_NODE_PLUGIN_TYPE] = {
59733 + .type_id = REISER4_NODE_PLUGIN_TYPE,
59734 + .label = "node",
59735 + .desc = "node layout handlers",
59736 + .builtin_num = sizeof_array(node_plugins),
59737 + .builtin = node_plugins,
59738 + .plugins_list = {NULL, NULL},
59739 + .size = sizeof(node_plugin)
59740 + },
59741 + [REISER4_SD_EXT_PLUGIN_TYPE] = {
59742 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
59743 + .label = "sd_ext",
59744 + .desc = "Parts of stat-data",
59745 + .builtin_num = sizeof_array(sd_ext_plugins),
59746 + .builtin = sd_ext_plugins,
59747 + .plugins_list = {NULL, NULL},
59748 + .size = sizeof(sd_ext_plugin)
59749 + },
59750 + [REISER4_FORMAT_PLUGIN_TYPE] = {
59751 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
59752 + .label = "disk_layout",
59753 + .desc = "defines filesystem on disk layout",
59754 + .builtin_num = sizeof_array(format_plugins),
59755 + .builtin = format_plugins,
59756 + .plugins_list = {NULL, NULL},
59757 + .size = sizeof(disk_format_plugin)
59758 + },
59759 + [REISER4_JNODE_PLUGIN_TYPE] = {
59760 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
59761 + .label = "jnode",
59762 + .desc = "defines kind of jnode",
59763 + .builtin_num = sizeof_array(jnode_plugins),
59764 + .builtin = jnode_plugins,
59765 + .plugins_list = {NULL, NULL},
59766 + .size = sizeof(jnode_plugin)
59767 + },
59768 + [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
59769 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
59770 + .label = "compression_mode",
59771 + .desc = "Defines compression mode",
59772 + .builtin_num = sizeof_array(compression_mode_plugins),
59773 + .builtin = compression_mode_plugins,
59774 + .plugins_list = {NULL, NULL},
59775 + .size = sizeof(compression_mode_plugin)
59776 + },
59777 + [REISER4_CLUSTER_PLUGIN_TYPE] = {
59778 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
59779 + .label = "cluster",
59780 + .desc = "Defines cluster size",
59781 + .builtin_num = sizeof_array(cluster_plugins),
59782 + .builtin = cluster_plugins,
59783 + .plugins_list = {NULL, NULL},
59784 + .size = sizeof(cluster_plugin)
59785 + }
59786 +};
59787 +
59788 +/*
59789 + * Local variables:
59790 + * c-indentation-style: "K&R"
59791 + * mode-name: "LC"
59792 + * c-basic-offset: 8
59793 + * tab-width: 8
59794 + * fill-column: 120
59795 + * End:
59796 + */
59797 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin.h linux-2.6.20/fs/reiser4/plugin/plugin.h
59798 --- linux-2.6.20.orig/fs/reiser4/plugin/plugin.h 1970-01-01 03:00:00.000000000 +0300
59799 +++ linux-2.6.20/fs/reiser4/plugin/plugin.h 2007-05-06 14:50:43.855024468 +0400
59800 @@ -0,0 +1,920 @@
59801 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59802 +
59803 +/* Basic plugin data-types.
59804 + see fs/reiser4/plugin/plugin.c for details */
59805 +
59806 +#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
59807 +#define __FS_REISER4_PLUGIN_TYPES_H__
59808 +
59809 +#include "../forward.h"
59810 +#include "../debug.h"
59811 +#include "../dformat.h"
59812 +#include "../key.h"
59813 +#include "compress/compress.h"
59814 +#include "crypto/cipher.h"
59815 +#include "plugin_header.h"
59816 +#include "item/static_stat.h"
59817 +#include "item/internal.h"
59818 +#include "item/sde.h"
59819 +#include "item/cde.h"
59820 +#include "item/item.h"
59821 +#include "node/node.h"
59822 +#include "node/node40.h"
59823 +#include "security/perm.h"
59824 +#include "fibration.h"
59825 +
59826 +#include "space/bitmap.h"
59827 +#include "space/space_allocator.h"
59828 +
59829 +#include "disk_format/disk_format40.h"
59830 +#include "disk_format/disk_format.h"
59831 +
59832 +#include <linux/fs.h> /* for struct super_block, address_space */
59833 +#include <linux/mm.h> /* for struct page */
59834 +#include <linux/buffer_head.h> /* for struct buffer_head */
59835 +#include <linux/dcache.h> /* for struct dentry */
59836 +#include <linux/types.h>
59837 +#include <linux/crypto.h>
59838 +
59839 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;
59840 +
59841 +/*
59842 + * File plugin. Defines the set of methods that file plugins implement, some
59843 + * of which are optional.
59844 + *
59845 + * A file plugin offers to the caller an interface for IO ( writing to and/or
59846 + * reading from) to what the caller sees as one sequence of bytes. An IO to it
59847 + * may affect more than one physical sequence of bytes, or no physical sequence
59848 + * of bytes, it may affect sequences of bytes offered by other file plugins to
59849 + * the semantic layer, and the file plugin may invoke other plugins and
59850 + * delegate work to them, but its interface is structured for offering the
59851 + * caller the ability to read and/or write what the caller sees as being a
59852 + * single sequence of bytes.
59853 + *
59854 + * The file plugin must present a sequence of bytes to the caller, but it does
59855 + * not necessarily have to store a sequence of bytes, it does not necessarily
59856 + * have to support efficient tree traversal to any offset in the sequence of
59857 + * bytes (tail and extent items, whose keys contain offsets, do however provide
59858 + * efficient non-sequential lookup of any offset in the sequence of bytes).
59859 + *
59860 + * Directory plugins provide methods for selecting file plugins by resolving a
59861 + * name for them.
59862 + *
59863 + * The functionality other filesystems call an attribute, and rigidly tie
59864 + * together, we decompose into orthogonal selectable features of files. Using
59865 + * the terminology we will define next, an attribute is a perhaps constrained,
59866 + * perhaps static length, file whose parent has a uni-count-intra-link to it,
59867 + * which might be grandparent-major-packed, and whose parent has a deletion
59868 + * method that deletes it.
59869 + *
59870 + * File plugins can implement constraints.
59871 + *
59872 + * Files can be of variable length (e.g. regular unix files), or of static
59873 + * length (e.g. static sized attributes).
59874 + *
59875 + * An object may have many sequences of bytes, and many file plugins, but, it
59876 + * has exactly one objectid. It is usually desirable that an object has a
59877 + * deletion method which deletes every item with that objectid. Items cannot
59878 + * in general be found by just their objectids. This means that an object must
59879 + * have either a method built into its deletion plugin method for knowing what
59880 + * items need to be deleted, or links stored with the object that provide the
59881 + * plugin with a method for finding those items. Deleting a file within an
59882 + * object may or may not have the effect of deleting the entire object,
59883 + * depending on the file plugin's deletion method.
59884 + *
59885 + * LINK TAXONOMY:
59886 + *
59887 + * Many objects have a reference count, and when the reference count reaches 0
59888 + * the object's deletion method is invoked. Some links embody a reference
59889 + * count increase ("countlinks"), and others do not ("nocountlinks").
59890 + *
59891 + * Some links are bi-directional links ("bilinks"), and some are
59892 + * uni-directional("unilinks").
59893 + *
59894 + * Some links are between parts of the same object ("intralinks"), and some are
59895 + * between different objects ("interlinks").
59896 + *
59897 + * PACKING TAXONOMY:
59898 + *
59899 + * Some items of an object are stored with a major packing locality based on
59900 + * their object's objectid (e.g. unix directory items in plan A), and these are
59901 + * called "self-major-packed".
59902 + *
59903 + * Some items of an object are stored with a major packing locality based on
59904 + * their semantic parent object's objectid (e.g. unix file bodies in plan A),
59905 + * and these are called "parent-major-packed".
59906 + *
59907 + * Some items of an object are stored with a major packing locality based on
59908 + * their semantic grandparent, and these are called "grandparent-major-packed".
59909 + * Now carefully notice that we run into trouble with key length if we have to
59910 + * store a 8 byte major+minor grandparent based packing locality, an 8 byte
59911 + * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
59912 + * a 24 byte key. One of these fields must be sacrificed if an item is to be
59913 + * grandparent-major-packed, and which to sacrifice is left to the item author
59914 + * choosing to make the item grandparent-major-packed. You cannot make tail
59915 + * items and extent items grandparent-major-packed, though you could make them
59916 + * self-major-packed (usually they are parent-major-packed).
59917 + *
59918 + * In the case of ACLs (which are composed of fixed length ACEs which consist
59919 + * of {subject-type, subject, and permission bitmask} triples), it makes sense
59920 + * to not have an offset field in the ACE item key, and to allow duplicate keys
59921 + * for ACEs. Thus, the set of ACES for a given file is found by looking for a
59922 + * key consisting of the objectid of the grandparent (thus grouping all ACLs in
59923 + * a directory together), the minor packing locality of ACE, the objectid of
59924 + * the file, and 0.
59925 + *
59926 + * IO involves moving data from one location to another, which means that two
59927 + * locations must be specified, source and destination.
59928 + *
59929 + * This source and destination can be in the filesystem, or they can be a
59930 + * pointer in the user process address space plus a byte count.
59931 + *
59932 + * If both source and destination are in the filesystem, then at least one of
59933 + * them must be representable as a pure stream of bytes (which we call a flow,
59934 + * and define as a struct containing a key, a data pointer, and a length).
59935 + * This may mean converting one of them into a flow. We provide a generic
59936 + * cast_into_flow() method, which will work for any plugin supporting
59937 + * read_flow(), though it is inefficiently implemented in that it temporarily
59938 + * stores the flow in a buffer (Question: what to do with huge flows that
59939 + * cannot fit into memory? Answer: we must not convert them all at once. )
59940 + *
59941 + * Performing a write requires resolving the write request into a flow defining
59942 + * the source, and a method that performs the write, and a key that defines
59943 + * where in the tree the write is to go.
59944 + *
59945 + * Performing a read requires resolving the read request into a flow defining
59946 + * the target, and a method that performs the read, and a key that defines
59947 + * where in the tree the read is to come from.
59948 + *
59949 + * There will exist file plugins which have no pluginid stored on the disk for
59950 + * them, and which are only invoked by other plugins.
59951 + */
59952 +
59953 +/* This should be incremented with each new contributed
59954 + pair (plugin type, plugin id).
59955 + NOTE: Make sure there is a release of reiser4progs
59956 + with the corresponding version number */
59957 +#define PLUGIN_LIBRARY_VERSION 0
59958 +
59959 + /* enumeration of fields within plugin_set */
59960 +typedef enum {
59961 + PSET_FILE,
59962 + PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
59963 + * inode.c:read_inode() depends on this. */
59964 + PSET_PERM,
59965 + PSET_FORMATTING,
59966 + PSET_HASH,
59967 + PSET_FIBRATION,
59968 + PSET_SD,
59969 + PSET_DIR_ITEM,
59970 + PSET_CIPHER,
59971 + PSET_DIGEST,
59972 + PSET_COMPRESSION,
59973 + PSET_COMPRESSION_MODE,
59974 + PSET_CLUSTER,
59975 + PSET_CREATE,
59976 + PSET_LAST
59977 +} pset_member;
59978 +
59979 +/* builtin file-plugins */
59980 +typedef enum {
59981 + /* regular file */
59982 + UNIX_FILE_PLUGIN_ID,
59983 + /* directory */
59984 + DIRECTORY_FILE_PLUGIN_ID,
59985 + /* symlink */
59986 + SYMLINK_FILE_PLUGIN_ID,
59987 + /* for objects completely handled by the VFS: fifos, devices,
59988 + sockets */
59989 + SPECIAL_FILE_PLUGIN_ID,
59990 + /* regular cryptcompress file */
59991 + CRYPTCOMPRESS_FILE_PLUGIN_ID,
59992 + /* number of file plugins. Used as size of arrays to hold
59993 + file plugins. */
59994 + LAST_FILE_PLUGIN_ID
59995 +} reiser4_file_id;
59996 +
59997 +typedef struct file_plugin {
59998 +
59999 + /* generic fields */
60000 + plugin_header h;
60001 +
60002 + struct inode_operations inode_ops;
60003 + struct file_operations file_ops;
60004 + struct address_space_operations as_ops;
60005 +
60006 + /* save inode cached stat-data onto disk. It was called
60007 + reiserfs_update_sd() in 3.x */
60008 + int (*write_sd_by_inode) (struct inode *);
60009 +
60010 + /*
60011 + * private methods: These are optional. If used they will allow you to
60012 + * minimize the amount of code needed to implement a deviation from
60013 + * some other method that also uses them.
60014 + */
60015 +
60016 + /*
60017 + * Construct flow into @flow according to user-supplied data.
60018 + *
60019 + * This is used by read/write methods to construct a flow to
60020 + * write/read. ->flow_by_inode() is plugin method, rather than single
60021 + * global implementation, because key in a flow used by plugin may
60022 + * depend on data in a @buf.
60023 + *
60024 + * NIKITA-FIXME-HANS: please create statistics on what functions are
60025 + * dereferenced how often for the mongo benchmark. You can supervise
60026 + * Elena doing this for you if that helps. Email me the list of the
60027 + * top 10, with their counts, and an estimate of the total number of
60028 + * CPU cycles spent dereferencing as a percentage of CPU cycles spent
60029 + * processing (non-idle processing). If the total percent is, say,
60030 + * less than 1%, it will make our coding discussions much easier, and
60031 + * keep me from questioning whether functions like the below are too
60032 + * frequently called to be dereferenced. If the total percent is more
60033 + * than 1%, perhaps private methods should be listed in a "required"
60034 + * comment at the top of each plugin (with stern language about how if
60035 + * the comment is missing it will not be accepted by the maintainer),
60036 + * and implemented using macros not dereferenced functions. How about
60037 + * replacing this whole private methods part of the struct with a
60038 + * thorough documentation of what the standard helper functions are for
60039 + * use in constructing plugins? I think users have been asking for
60040 + * that, though not in so many words.
60041 + */
60042 + int (*flow_by_inode) (struct inode *, const char __user *buf,
60043 + int user, loff_t size,
60044 + loff_t off, rw_op op, flow_t *);
60045 +
60046 + /*
60047 + * Return the key used to retrieve an offset of a file. It is used by
60048 + * default implementation of ->flow_by_inode() method
60049 + * (common_build_flow()) and, among other things, to get to the extent
60050 + * from jnode of unformatted node.
60051 + */
60052 + int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
60053 +
60054 + /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
60055 + /*
60056 + * set the plugin for a file. Called during file creation in creat()
60057 + * but not reiser4() unless an inode already exists for the file.
60058 + */
60059 + int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
60060 + reiser4_object_create_data *);
60061 +
60062 + /* NIKITA-FIXME-HANS: comment and name seem to say different things,
60063 + * are you setting up the object itself also or just adjusting the
60064 + * parent?.... */
60065 + /* set up plugins for new @object created in @parent. @root is root
60066 + directory. */
60067 + int (*adjust_to_parent) (struct inode *object, struct inode *parent,
60068 + struct inode *root);
60069 + /*
60070 + * this does whatever is necessary to do when object is created. For
60071 + * instance, for unix files stat data is inserted. It is supposed to be
60072 + * called by create of struct inode_operations.
60073 + */
60074 + int (*create_object) (struct inode *object, struct inode *parent,
60075 + reiser4_object_create_data *);
60076 +
60077 + /* this does whatever is necessary to do when object is opened */
60078 + int (*open_object) (struct inode * inode, struct file * file);
60079 + /*
60080 + * this method should check REISER4_NO_SD and set REISER4_NO_SD on
60081 + * success. Deletion of an object usually includes removal of items
60082 + * building file body (for directories this is removal of "." and "..")
60083 + * and removal of stat-data item.
60084 + */
60085 + int (*delete_object) (struct inode *);
60086 +
60087 + /* add link from @parent to @object */
60088 + int (*add_link) (struct inode *object, struct inode *parent);
60089 +
60090 + /* remove link from @parent to @object */
60091 + int (*rem_link) (struct inode *object, struct inode *parent);
60092 +
60093 + /*
60094 + * return true if item addressed by @coord belongs to @inode. This is
60095 + * used by read/write to properly slice flow into items in presence of
60096 + * multiple key assignment policies, because items of a file are not
60097 + * necessarily contiguous in a key space, for example, in a plan-b.
60098 + */
60099 + int (*owns_item) (const struct inode *, const coord_t *);
60100 +
60101 + /* checks whether yet another hard links to this object can be
60102 + added */
60103 + int (*can_add_link) (const struct inode *);
60104 +
60105 + /* checks whether hard links to this object can be removed */
60106 + int (*can_rem_link) (const struct inode *);
60107 +
60108 + /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls
60109 + detach of directory plugin to remove ".." */
60110 + int (*detach) (struct inode * child, struct inode * parent);
60111 +
60112 + /* called when @child was just looked up in the @parent. It is not
60113 + empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of
60114 + directory plugin */
60115 + int (*bind) (struct inode * child, struct inode * parent);
60116 +
60117 + /* process safe-link during mount */
60118 + int (*safelink) (struct inode * object, reiser4_safe_link_t link,
60119 + __u64 value);
60120 +
60121 + /* The couple of estimate methods for all file operations */
60122 + struct {
60123 + reiser4_block_nr(*create) (const struct inode *);
60124 + reiser4_block_nr(*update) (const struct inode *);
60125 + reiser4_block_nr(*unlink) (const struct inode *,
60126 + const struct inode *);
60127 + } estimate;
60128 +
60129 + /*
60130 + * reiser4 specific part of inode has a union of structures which are
60131 + * specific to a plugin. This method is called when inode is read
60132 + * (read_inode) and when file is created (common_create_child) so that
60133 + * file plugin could initialize its inode data
60134 + */
60135 + void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
60136 + int);
60137 +
60138 + /*
60139 + * This method performs progressive deletion of items and whole nodes
60140 + * from right to left.
60141 + *
60142 + * @tap: the point deletion process begins from,
60143 + * @from_key: the beginning of the deleted key range,
60144 + * @to_key: the end of the deleted key range,
60145 + * @smallest_removed: the smallest removed key,
60146 + *
60147 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
60148 + * operation was interrupted for allowing atom commit .
60149 + */
60150 + int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
60151 + const reiser4_key * to_key,
60152 + reiser4_key * smallest_removed, struct inode *,
60153 + int, int *);
60154 +
60155 + /* called from ->destroy_inode() */
60156 + void (*destroy_inode) (struct inode *);
60157 +
60158 + /*
60159 + * methods to serialize object identify. This is used, for example, by
60160 + * reiser4_{en,de}code_fh().
60161 + */
60162 + struct {
60163 + /* store object's identity at @area */
60164 + char *(*write) (struct inode * inode, char *area);
60165 + /* parse object from wire to the @obj */
60166 + char *(*read) (char *area, reiser4_object_on_wire * obj);
60167 + /* given object identity in @obj, find or create its dentry */
60168 + struct dentry *(*get) (struct super_block * s,
60169 + reiser4_object_on_wire * obj);
60170 + /* how many bytes ->wire.write() consumes */
60171 + int (*size) (struct inode * inode);
60172 + /* finish with object identify */
60173 + void (*done) (reiser4_object_on_wire * obj);
60174 + } wire;
60175 +} file_plugin;
60176 +
60177 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60178 +
60179 +struct reiser4_object_on_wire {
60180 + file_plugin *plugin;
60181 + union {
60182 + struct {
60183 + obj_key_id key_id;
60184 + } std;
60185 + void *generic;
60186 + } u;
60187 +};
60188 +
60189 +/* builtin dir-plugins */
60190 +typedef enum {
60191 + HASHED_DIR_PLUGIN_ID,
60192 + SEEKABLE_HASHED_DIR_PLUGIN_ID,
60193 + LAST_DIR_ID
60194 +} reiser4_dir_id;
60195 +
60196 +typedef struct dir_plugin {
60197 + /* generic fields */
60198 + plugin_header h;
60199 +
60200 + struct inode_operations inode_ops;
60201 + struct file_operations file_ops;
60202 + struct address_space_operations as_ops;
60203 +
60204 + /*
60205 + * private methods: These are optional. If used they will allow you to
60206 + * minimize the amount of code needed to implement a deviation from
60207 + * some other method that uses them. You could logically argue that
60208 + * they should be a separate type of plugin.
60209 + */
60210 +
60211 + struct dentry *(*get_parent) (struct inode * childdir);
60212 +
60213 + /*
60214 + * check whether "name" is acceptable name to be inserted into this
60215 + * object. Optionally implemented by directory-like objects. Can check
60216 + * for maximal length, reserved symbols etc
60217 + */
60218 + int (*is_name_acceptable) (const struct inode * inode, const char *name,
60219 + int len);
60220 +
60221 + void (*build_entry_key) (const struct inode * dir /* directory where
60222 + * entry is (or will
60223 + * be) in.*/ ,
60224 + const struct qstr * name /* name of file
60225 + * referenced by this
60226 + * entry */ ,
60227 + reiser4_key * result /* resulting key of
60228 + * directory entry */ );
60229 + int (*build_readdir_key) (struct file * dir, reiser4_key * result);
60230 + int (*add_entry) (struct inode * object, struct dentry * where,
60231 + reiser4_object_create_data * data,
60232 + reiser4_dir_entry_desc * entry);
60233 + int (*rem_entry) (struct inode * object, struct dentry * where,
60234 + reiser4_dir_entry_desc * entry);
60235 +
60236 + /*
60237 + * initialize directory structure for newly created object. For normal
60238 + * unix directories, insert dot and dotdot.
60239 + */
60240 + int (*init) (struct inode * object, struct inode * parent,
60241 + reiser4_object_create_data * data);
60242 +
60243 + /* destroy directory */
60244 + int (*done) (struct inode * child);
60245 +
60246 + /* called when @subdir was just looked up in the @dir */
60247 + int (*attach) (struct inode * subdir, struct inode * dir);
60248 + int (*detach) (struct inode * subdir, struct inode * dir);
60249 +
60250 + struct {
60251 + reiser4_block_nr(*add_entry) (const struct inode *);
60252 + reiser4_block_nr(*rem_entry) (const struct inode *);
60253 + reiser4_block_nr(*unlink) (const struct inode *,
60254 + const struct inode *);
60255 + } estimate;
60256 +} dir_plugin;
60257 +
60258 +extern dir_plugin dir_plugins[LAST_DIR_ID];
60259 +
60260 +typedef struct formatting_plugin {
60261 + /* generic fields */
60262 + plugin_header h;
60263 + /* returns non-zero iff file's tail has to be stored
60264 + in a direct item. */
60265 + int (*have_tail) (const struct inode * inode, loff_t size);
60266 +} formatting_plugin;
60267 +
60268 +typedef struct hash_plugin {
60269 + /* generic fields */
60270 + plugin_header h;
60271 + /* computes hash of the given name */
60272 + __u64(*hash) (const unsigned char *name, int len);
60273 +} hash_plugin;
60274 +
60275 +typedef struct cipher_plugin {
60276 + /* generic fields */
60277 + plugin_header h;
60278 + struct crypto_blkcipher * (*alloc) (void);
60279 + void (*free) (struct crypto_blkcipher * tfm);
60280 + /* Offset translator. For each offset this returns (k * offset), where
60281 + k (k >= 1) is an expansion factor of the cipher algorithm.
60282 + For all symmetric algorithms k == 1. For asymmetric algorithms (which
60283 + inflate data) offset translation guarantees that all disk cluster's
60284 + units will have keys smaller then next cluster's one.
60285 + */
60286 + loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
60287 + /* Cipher algorithms can accept data only by chunks of cipher block
60288 + size. This method is to align any flow up to cipher block size when
60289 + we pass it to cipher algorithm. To align means to append padding of
60290 + special format specific to the cipher algorithm */
60291 + int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
60292 + /* low-level key manager (check, install, etc..) */
60293 + int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
60294 + unsigned int keylen);
60295 + /* main text processing procedures */
60296 + void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60297 + void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60298 +} cipher_plugin;
60299 +
60300 +typedef struct digest_plugin {
60301 + /* generic fields */
60302 + plugin_header h;
60303 + /* fingerprint size in bytes */
60304 + int fipsize;
60305 + struct crypto_hash * (*alloc) (void);
60306 + void (*free) (struct crypto_hash * tfm);
60307 +} digest_plugin;
60308 +
60309 +typedef struct compression_plugin {
60310 + /* generic fields */
60311 + plugin_header h;
60312 + int (*init) (void);
60313 + /* the maximum number of bytes the size of the "compressed" data can
60314 + * exceed the uncompressed data. */
60315 + int (*overrun) (unsigned src_len);
60316 + coa_t(*alloc) (tfm_action act);
60317 + void (*free) (coa_t coa, tfm_action act);
60318 + /* minimal size of the flow we still try to compress */
60319 + int (*min_size_deflate) (void);
60320 + __u32(*checksum) (char *data, __u32 length);
60321 + /* main transform procedures */
60322 + void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
60323 + __u8 * dst_first, unsigned *dst_len);
60324 + void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
60325 + __u8 * dst_first, unsigned *dst_len);
60326 +} compression_plugin;
60327 +
60328 +typedef struct compression_mode_plugin {
60329 + /* generic fields */
60330 + plugin_header h;
60331 + /* this is called when estimating compressibility
60332 + of a logical cluster by its content */
60333 + int (*should_deflate) (struct inode * inode, cloff_t index);
60334 + /* this is called when results of compression should be saved */
60335 + int (*accept_hook) (struct inode * inode, cloff_t index);
60336 + /* this is called when results of compression should be discarded */
60337 + int (*discard_hook) (struct inode * inode, cloff_t index);
60338 +} compression_mode_plugin;
60339 +
60340 +typedef struct cluster_plugin {
60341 + /* generic fields */
60342 + plugin_header h;
60343 + int shift;
60344 +} cluster_plugin;
60345 +
60346 +typedef struct sd_ext_plugin {
60347 + /* generic fields */
60348 + plugin_header h;
60349 + int (*present) (struct inode * inode, char **area, int *len);
60350 + int (*absent) (struct inode * inode);
60351 + int (*save_len) (struct inode * inode);
60352 + int (*save) (struct inode * inode, char **area);
60353 + /* alignment requirement for this stat-data part */
60354 + int alignment;
60355 +} sd_ext_plugin;
60356 +
60357 +/* this plugin contains methods to allocate objectid for newly created files,
60358 + to deallocate objectid when file gets removed, to report number of used and
60359 + free objectids */
60360 +typedef struct oid_allocator_plugin {
60361 + /* generic fields */
60362 + plugin_header h;
60363 + int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
60364 + __u64 oids);
60365 + /* used to report statfs->f_files */
60366 + __u64(*oids_used) (reiser4_oid_allocator * map);
60367 + /* get next oid to use */
60368 + __u64(*next_oid) (reiser4_oid_allocator * map);
60369 + /* used to report statfs->f_ffree */
60370 + __u64(*oids_free) (reiser4_oid_allocator * map);
60371 + /* allocate new objectid */
60372 + int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
60373 + /* release objectid */
60374 + int (*release_oid) (reiser4_oid_allocator * map, oid_t);
60375 + /* how many pages to reserve in transaction for allocation of new
60376 + objectid */
60377 + int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
60378 + /* how many pages to reserve in transaction for freeing of an
60379 + objectid */
60380 + int (*oid_reserve_release) (reiser4_oid_allocator * map);
60381 + void (*print_info) (const char *, reiser4_oid_allocator *);
60382 +} oid_allocator_plugin;
60383 +
60384 +/* disk layout plugin: this specifies super block, journal, bitmap (if there
60385 + are any) locations, etc */
60386 +typedef struct disk_format_plugin {
60387 + /* generic fields */
60388 + plugin_header h;
60389 + /* replay journal, initialize super_info_data, etc */
60390 + int (*init_format) (struct super_block *, void *data);
60391 +
60392 + /* key of root directory stat data */
60393 + const reiser4_key *(*root_dir_key) (const struct super_block *);
60394 +
60395 + int (*release) (struct super_block *);
60396 + jnode *(*log_super) (struct super_block *);
60397 + int (*check_open) (const struct inode * object);
60398 + int (*version_update) (struct super_block *);
60399 +} disk_format_plugin;
60400 +
60401 +struct jnode_plugin {
60402 + /* generic fields */
60403 + plugin_header h;
60404 + int (*init) (jnode * node);
60405 + int (*parse) (jnode * node);
60406 + struct address_space *(*mapping) (const jnode * node);
60407 + unsigned long (*index) (const jnode * node);
60408 + jnode *(*clone) (jnode * node);
60409 +};
60410 +
60411 +/* plugin instance. */
60412 +/* */
60413 +/* This is "wrapper" union for all types of plugins. Most of the code uses */
60414 +/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */
60415 +/* operates with pointers to reiser4_plugin. This union is only used in */
60416 +/* some generic code in plugin/plugin.c that operates on all */
60417 +/* plugins. Technically speaking purpose of this union is to add type */
60418 +/* safety to said generic code: each plugin type (file_plugin, for */
60419 +/* example), contains plugin_header as its first memeber. This first member */
60420 +/* is located at the same place in memory as .h member of */
60421 +/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and */
60422 +/* looks in the .h which is header of plugin type located in union. This */
60423 +/* allows to avoid type-casts. */
60424 +union reiser4_plugin {
60425 + /* generic fields */
60426 + plugin_header h;
60427 + /* file plugin */
60428 + file_plugin file;
60429 + /* directory plugin */
60430 + dir_plugin dir;
60431 + /* hash plugin, used by directory plugin */
60432 + hash_plugin hash;
60433 + /* fibration plugin used by directory plugin */
60434 + fibration_plugin fibration;
60435 + /* cipher transform plugin, used by file plugin */
60436 + cipher_plugin cipher;
60437 + /* digest transform plugin, used by file plugin */
60438 + digest_plugin digest;
60439 + /* compression transform plugin, used by file plugin */
60440 + compression_plugin compression;
60441 + /* tail plugin, used by file plugin */
60442 + formatting_plugin formatting;
60443 + /* permission plugin */
60444 + perm_plugin perm;
60445 + /* node plugin */
60446 + node_plugin node;
60447 + /* item plugin */
60448 + item_plugin item;
60449 + /* stat-data extension plugin */
60450 + sd_ext_plugin sd_ext;
60451 + /* disk layout plugin */
60452 + disk_format_plugin format;
60453 + /* object id allocator plugin */
60454 + oid_allocator_plugin oid_allocator;
60455 + /* plugin for different jnode types */
60456 + jnode_plugin jnode;
60457 + /* compression mode plugin, used by object plugin */
60458 + compression_mode_plugin compression_mode;
60459 + /* cluster plugin, used by object plugin */
60460 + cluster_plugin clust;
60461 + /* place-holder for new plugin types that can be registered
60462 + dynamically, and used by other dynamically loaded plugins. */
60463 + void *generic;
60464 +};
60465 +
60466 +struct reiser4_plugin_ops {
60467 + /* called when plugin is initialized */
60468 + int (*init) (reiser4_plugin * plugin);
60469 + /* called when plugin is unloaded */
60470 + int (*done) (reiser4_plugin * plugin);
60471 + /* load given plugin from disk */
60472 + int (*load) (struct inode * inode,
60473 + reiser4_plugin * plugin, char **area, int *len);
60474 + /* how many space is required to store this plugin's state
60475 + in stat-data */
60476 + int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
60477 + /* save persistent plugin-data to disk */
60478 + int (*save) (struct inode * inode, reiser4_plugin * plugin,
60479 + char **area);
60480 + /* alignment requirement for on-disk state of this plugin
60481 + in number of bytes */
60482 + int alignment;
60483 + /* install itself into given inode. This can return error
60484 + (e.g., you cannot change hash of non-empty directory). */
60485 + int (*change) (struct inode * inode, reiser4_plugin * plugin,
60486 + pset_member memb);
60487 + /* install itself into given inode. This can return error
60488 + (e.g., you cannot change hash of non-empty directory). */
60489 + int (*inherit) (struct inode * inode, struct inode * parent,
60490 + reiser4_plugin * plugin);
60491 +};
60492 +
60493 +/* functions implemented in fs/reiser4/plugin/plugin.c */
60494 +
60495 +/* stores plugin reference in reiser4-specific part of inode */
60496 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
60497 +extern int setup_plugins(struct super_block *super, reiser4_plugin ** area);
60498 +extern int init_plugins(void);
60499 +
60500 +/* builtin plugins */
60501 +
60502 +/* builtin hash-plugins */
60503 +
60504 +typedef enum {
60505 + RUPASOV_HASH_ID,
60506 + R5_HASH_ID,
60507 + TEA_HASH_ID,
60508 + FNV1_HASH_ID,
60509 + DEGENERATE_HASH_ID,
60510 + LAST_HASH_ID
60511 +} reiser4_hash_id;
60512 +
60513 +/* builtin cipher plugins */
60514 +
60515 +typedef enum {
60516 + NONE_CIPHER_ID,
60517 + LAST_CIPHER_ID
60518 +} reiser4_cipher_id;
60519 +
60520 +/* builtin digest plugins */
60521 +
60522 +typedef enum {
60523 + SHA256_32_DIGEST_ID,
60524 + LAST_DIGEST_ID
60525 +} reiser4_digest_id;
60526 +
60527 +/* builtin compression mode plugins */
60528 +typedef enum {
60529 + NONE_COMPRESSION_MODE_ID,
60530 + LATTD_COMPRESSION_MODE_ID,
60531 + ULTIM_COMPRESSION_MODE_ID,
60532 + FORCE_COMPRESSION_MODE_ID,
60533 + CONVX_COMPRESSION_MODE_ID,
60534 + LAST_COMPRESSION_MODE_ID
60535 +} reiser4_compression_mode_id;
60536 +
60537 +/* builtin cluster plugins */
60538 +typedef enum {
60539 + CLUSTER_64K_ID,
60540 + CLUSTER_32K_ID,
60541 + CLUSTER_16K_ID,
60542 + CLUSTER_8K_ID,
60543 + CLUSTER_4K_ID,
60544 + LAST_CLUSTER_ID
60545 +} reiser4_cluster_id;
60546 +
60547 +/* builtin tail-plugins */
60548 +
60549 +typedef enum {
60550 + NEVER_TAILS_FORMATTING_ID,
60551 + ALWAYS_TAILS_FORMATTING_ID,
60552 + SMALL_FILE_FORMATTING_ID,
60553 + LAST_TAIL_FORMATTING_ID
60554 +} reiser4_formatting_id;
60555 +
60556 +/* compression/clustering specific data */
60557 +typedef struct compression_data {
60558 + reiser4_compression_id coa; /* id of the compression algorithm */
60559 +} compression_data_t;
60560 +
60561 +typedef __u8 cluster_data_t; /* cluster info */
60562 +
60563 +/* data type used to pack parameters that we pass to vfs object creation
60564 + function create_object() */
60565 +struct reiser4_object_create_data {
60566 + /* plugin to control created object */
60567 + reiser4_file_id id;
60568 + /* mode of regular file, directory or special file */
60569 +/* what happens if some other sort of perm plugin is in use? */
60570 + int mode;
60571 + /* rdev of special file */
60572 + dev_t rdev;
60573 + /* symlink target */
60574 + const char *name;
60575 + /* add here something for non-standard objects you invent, like
60576 + query for interpolation file etc. */
60577 +
60578 + crypto_stat_t * crypto;
60579 + compression_data_t *compression;
60580 + cluster_data_t *cluster;
60581 +
60582 + struct inode *parent;
60583 + struct dentry *dentry;
60584 +};
60585 +
60586 +/* description of directory entry being created/destroyed/sought for
60587 +
60588 + It is passed down to the directory plugin and farther to the
60589 + directory item plugin methods. Creation of new directory is done in
60590 + several stages: first we search for an entry with the same name, then
60591 + create new one. reiser4_dir_entry_desc is used to store some information
60592 + collected at some stage of this process and required later: key of
60593 + item that we want to insert/delete and pointer to an object that will
60594 + be bound by the new directory entry. Probably some more fields will
60595 + be added there.
60596 +
60597 +*/
60598 +struct reiser4_dir_entry_desc {
60599 + /* key of directory entry */
60600 + reiser4_key key;
60601 + /* object bound by this entry. */
60602 + struct inode *obj;
60603 +};
60604 +
60605 +#define MAX_PLUGIN_TYPE_LABEL_LEN 32
60606 +#define MAX_PLUGIN_PLUG_LABEL_LEN 32
60607 +
60608 +/* used for interface with user-land: table-driven parsing in
60609 + reiser4(). */
60610 +typedef struct plugin_locator {
60611 + reiser4_plugin_type type_id;
60612 + reiser4_plugin_id id;
60613 + char type_label[MAX_PLUGIN_TYPE_LABEL_LEN];
60614 + char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN];
60615 +} plugin_locator;
60616 +
60617 +extern int locate_plugin(struct inode *inode, plugin_locator * loc);
60618 +
60619 +#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
60620 +static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
60621 +{ \
60622 + reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
60623 + return plugin ? & plugin -> FIELD : NULL; \
60624 +} \
60625 +static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
60626 +{ \
60627 + reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
60628 + return plugin ? & plugin -> FIELD : NULL; \
60629 +} \
60630 +static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
60631 +{ \
60632 + reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
60633 + return plugin ? & plugin -> FIELD : NULL; \
60634 +} \
60635 +static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
60636 +{ \
60637 + return ( reiser4_plugin * ) plugin; \
60638 +} \
60639 +static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
60640 +{ \
60641 + return TYPE ## _to_plugin (plugin) -> h.id; \
60642 +} \
60643 +typedef struct { int foo; } TYPE ## _plugin_dummy
60644 +
60645 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
60646 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
60647 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
60648 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
60649 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
60650 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
60651 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
60652 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
60653 +PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
60654 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
60655 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
60656 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
60657 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
60658 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
60659 +PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60660 + compression_mode);
60661 +PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
60662 +
60663 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
60664 +
60665 +extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
60666 +
60667 +#define for_all_plugins(ptype, plugin) \
60668 +for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
60669 + get_plugin_list(ptype) != &plugin->h.linkage; \
60670 + plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
60671 +
60672 +
60673 +extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb);
60674 +extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug);
60675 +extern int finish_pset(struct inode *inode);
60676 +
60677 +/* defined in fs/reiser4/plugin/object.c */
60678 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60679 +/* defined in fs/reiser4/plugin/object.c */
60680 +extern dir_plugin dir_plugins[LAST_DIR_ID];
60681 +/* defined in fs/reiser4/plugin/item/static_stat.c */
60682 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
60683 +/* defined in fs/reiser4/plugin/hash.c */
60684 +extern hash_plugin hash_plugins[LAST_HASH_ID];
60685 +/* defined in fs/reiser4/plugin/fibration.c */
60686 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
60687 +/* defined in fs/reiser4/plugin/crypt.c */
60688 +extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
60689 +/* defined in fs/reiser4/plugin/digest.c */
60690 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
60691 +/* defined in fs/reiser4/plugin/compress/compress.c */
60692 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
60693 +/* defined in fs/reiser4/plugin/compress/compression_mode.c */
60694 +extern compression_mode_plugin
60695 +compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
60696 +/* defined in fs/reiser4/plugin/cluster.c */
60697 +extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
60698 +/* defined in fs/reiser4/plugin/tail.c */
60699 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
60700 +/* defined in fs/reiser4/plugin/security/security.c */
60701 +extern perm_plugin perm_plugins[LAST_PERM_ID];
60702 +/* defined in fs/reiser4/plugin/item/item.c */
60703 +extern item_plugin item_plugins[LAST_ITEM_ID];
60704 +/* defined in fs/reiser4/plugin/node/node.c */
60705 +extern node_plugin node_plugins[LAST_NODE_ID];
60706 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
60707 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
60708 +
60709 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
60710 +#endif
60711 +
60712 +/* Make Linus happy.
60713 + Local variables:
60714 + c-indentation-style: "K&R"
60715 + mode-name: "LC"
60716 + c-basic-offset: 8
60717 + tab-width: 8
60718 + fill-column: 120
60719 + End:
60720 +*/
60721 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.20/fs/reiser4/plugin/plugin_header.h
60722 --- linux-2.6.20.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 03:00:00.000000000 +0300
60723 +++ linux-2.6.20/fs/reiser4/plugin/plugin_header.h 2007-05-06 14:50:43.855024468 +0400
60724 @@ -0,0 +1,144 @@
60725 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60726 +
60727 +/* plugin header. Data structures required by all plugin types. */
60728 +
60729 +#if !defined( __PLUGIN_HEADER_H__ )
60730 +#define __PLUGIN_HEADER_H__
60731 +
60732 +/* plugin data-types and constants */
60733 +
60734 +#include "../debug.h"
60735 +#include "../dformat.h"
60736 +
60737 +typedef enum {
60738 + REISER4_FILE_PLUGIN_TYPE,
60739 + REISER4_DIR_PLUGIN_TYPE,
60740 + REISER4_ITEM_PLUGIN_TYPE,
60741 + REISER4_NODE_PLUGIN_TYPE,
60742 + REISER4_HASH_PLUGIN_TYPE,
60743 + REISER4_FIBRATION_PLUGIN_TYPE,
60744 + REISER4_FORMATTING_PLUGIN_TYPE,
60745 + REISER4_PERM_PLUGIN_TYPE,
60746 + REISER4_SD_EXT_PLUGIN_TYPE,
60747 + REISER4_FORMAT_PLUGIN_TYPE,
60748 + REISER4_JNODE_PLUGIN_TYPE,
60749 + REISER4_CIPHER_PLUGIN_TYPE,
60750 + REISER4_DIGEST_PLUGIN_TYPE,
60751 + REISER4_COMPRESSION_PLUGIN_TYPE,
60752 + REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60753 + REISER4_CLUSTER_PLUGIN_TYPE,
60754 + REISER4_PLUGIN_TYPES
60755 +} reiser4_plugin_type;
60756 +
60757 +typedef enum {
60758 + REISER4_DIRECTORY_FILE,
60759 + REISER4_REGULAR_FILE,
60760 + REISER4_SYMLINK_FILE,
60761 + REISER4_SPECIAL_FILE,
60762 +} reiser4_plugin_group;
60763 +
60764 +struct reiser4_plugin_ops;
60765 +/* generic plugin operations, supported by each
60766 + plugin type. */
60767 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
60768 +
60769 +/* the common part of all plugin instances. */
60770 +typedef struct plugin_header {
60771 + /* plugin type */
60772 + reiser4_plugin_type type_id;
60773 + /* id of this plugin */
60774 + reiser4_plugin_id id;
60775 + /* bitmask of groups the plugin belongs to. */
60776 + reiser4_plugin_groups groups;
60777 + /* plugin operations */
60778 + reiser4_plugin_ops *pops;
60779 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
60780 + /* short label of this plugin */
60781 + const char *label;
60782 + /* descriptive string.. */
60783 + const char *desc;
60784 + /* list linkage */
60785 + struct list_head linkage;
60786 +} plugin_header;
60787 +
60788 +#define plugin_of_group(plug, group) (plug->h.groups & (1 << group))
60789 +
60790 +/* PRIVATE INTERFACES */
60791 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
60792 +/* plugin type representation. */
60793 +typedef struct reiser4_plugin_type_data {
60794 + /* internal plugin type identifier. Should coincide with
60795 + index of this item in plugins[] array. */
60796 + reiser4_plugin_type type_id;
60797 + /* short symbolic label of this plugin type. Should be no longer
60798 + than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
60799 + const char *label;
60800 + /* plugin type description longer than .label */
60801 + const char *desc;
60802 +
60803 +/* NIKITA-FIXME-HANS: define built-in */
60804 + /* number of built-in plugin instances of this type */
60805 + int builtin_num;
60806 + /* array of built-in plugins */
60807 + void *builtin;
60808 + struct list_head plugins_list;
60809 + size_t size;
60810 +} reiser4_plugin_type_data;
60811 +
60812 +extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
60813 +
60814 +int is_plugin_type_valid(reiser4_plugin_type type);
60815 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
60816 +
60817 +static inline reiser4_plugin *plugin_at(reiser4_plugin_type_data * ptype, int i)
60818 +{
60819 + char *builtin;
60820 +
60821 + builtin = ptype->builtin;
60822 + return (reiser4_plugin *) (builtin + i * ptype->size);
60823 +}
60824 +
60825 +/* return plugin by its @type_id and @id */
60826 +static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
60827 + reiser4_plugin_id id)
60828 +{
60829 + assert("nikita-1651", is_plugin_type_valid(type));
60830 + assert("nikita-1652", is_plugin_id_valid(type, id));
60831 + return plugin_at(&plugins[type], id);
60832 +}
60833 +
60834 +extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
60835 + reiser4_plugin_id id);
60836 +
60837 +/**
60838 + * plugin_by_disk_id - get reiser4_plugin
60839 + * @type_id: plugin type id
60840 + * @did: plugin id in disk format
60841 + *
60842 + * Returns reiser4_plugin by plugin type id an dplugin_id.
60843 + */
60844 +static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
60845 + reiser4_plugin_type type_id,
60846 + __le16 *plugin_id)
60847 +{
60848 + /*
60849 + * what we should do properly is to maintain within each file-system a
60850 + * dictionary that maps on-disk plugin ids to "universal" ids. This
60851 + * dictionary will be resolved on mount time, so that this function
60852 + * will perform just one additional array lookup.
60853 + */
60854 + return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
60855 +}
60856 +
60857 +/* __PLUGIN_HEADER_H__ */
60858 +#endif
60859 +
60860 +/*
60861 + * Local variables:
60862 + * c-indentation-style: "K&R"
60863 + * mode-name: "LC"
60864 + * c-basic-offset: 8
60865 + * tab-width: 8
60866 + * fill-column: 79
60867 + * End:
60868 + */
60869 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.20/fs/reiser4/plugin/plugin_set.c
60870 --- linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 03:00:00.000000000 +0300
60871 +++ linux-2.6.20/fs/reiser4/plugin/plugin_set.c 2007-05-06 14:50:43.855024468 +0400
60872 @@ -0,0 +1,379 @@
60873 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60874 + * reiser4/README */
60875 +/* This file contains Reiser4 plugin set operations */
60876 +
60877 +/* plugin sets
60878 + *
60879 + * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
60880 + * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
60881 + * assigned (inherited, deduced from mode bits, etc.) at creation time. This
60882 + * set of plugins (so called pset) is described by structure plugin_set (see
60883 + * plugin/plugin_set.h), which contains pointers to all required plugins.
60884 + *
60885 + * Children can inherit some pset members from their parent, however sometimes
60886 + * it is useful to specify members different from parent ones. Since object's
60887 + * pset can not be easily changed without fatal consequences, we use for this
60888 + * purpose another special plugin table (so called hset, or heir set) described
60889 + * by the same structure.
60890 + *
60891 + * Inode only stores a pointers to pset and hset. Different inodes with the
60892 + * same set of pset (hset) members point to the same pset (hset). This is
60893 + * archived by storing psets and hsets in global hash table. Races are avoided
60894 + * by simple (and efficient so far) solution of never recycling psets, even
60895 + * when last inode pointing to it is destroyed.
60896 + */
60897 +
60898 +#include "../debug.h"
60899 +#include "../super.h"
60900 +#include "plugin_set.h"
60901 +
60902 +#include <linux/slab.h>
60903 +#include <linux/stddef.h>
60904 +
60905 +/* slab for plugin sets */
60906 +static struct kmem_cache *plugin_set_slab;
60907 +
60908 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
60909 + [0 ... 7] = SPIN_LOCK_UNLOCKED
60910 +};
60911 +
60912 +/* hash table support */
60913 +
60914 +#define PS_TABLE_SIZE (32)
60915 +
60916 +static inline plugin_set *cast_to(const unsigned long *a)
60917 +{
60918 + return container_of(a, plugin_set, hashval);
60919 +}
60920 +
60921 +static inline int pseq(const unsigned long *a1, const unsigned long *a2)
60922 +{
60923 + plugin_set *set1;
60924 + plugin_set *set2;
60925 +
60926 + /* make sure fields are not missed in the code below */
60927 + cassert(sizeof *set1 ==
60928 + sizeof set1->hashval +
60929 + sizeof set1->link +
60930 + sizeof set1->file +
60931 + sizeof set1->dir +
60932 + sizeof set1->perm +
60933 + sizeof set1->formatting +
60934 + sizeof set1->hash +
60935 + sizeof set1->fibration +
60936 + sizeof set1->sd +
60937 + sizeof set1->dir_item +
60938 + sizeof set1->cipher +
60939 + sizeof set1->digest +
60940 + sizeof set1->compression +
60941 + sizeof set1->compression_mode +
60942 + sizeof set1->cluster +
60943 + sizeof set1->create);
60944 +
60945 + set1 = cast_to(a1);
60946 + set2 = cast_to(a2);
60947 + return
60948 + set1->hashval == set2->hashval &&
60949 + set1->file == set2->file &&
60950 + set1->dir == set2->dir &&
60951 + set1->perm == set2->perm &&
60952 + set1->formatting == set2->formatting &&
60953 + set1->hash == set2->hash &&
60954 + set1->fibration == set2->fibration &&
60955 + set1->sd == set2->sd &&
60956 + set1->dir_item == set2->dir_item &&
60957 + set1->cipher == set2->cipher &&
60958 + set1->digest == set2->digest &&
60959 + set1->compression == set2->compression &&
60960 + set1->compression_mode == set2->compression_mode &&
60961 + set1->cluster == set2->cluster &&
60962 + set1->create == set2->create;
60963 +}
60964 +
60965 +#define HASH_FIELD(hash, set, field) \
60966 +({ \
60967 + (hash) += (unsigned long)(set)->field >> 2; \
60968 +})
60969 +
60970 +static inline unsigned long calculate_hash(const plugin_set * set)
60971 +{
60972 + unsigned long result;
60973 +
60974 + result = 0;
60975 + HASH_FIELD(result, set, file);
60976 + HASH_FIELD(result, set, dir);
60977 + HASH_FIELD(result, set, perm);
60978 + HASH_FIELD(result, set, formatting);
60979 + HASH_FIELD(result, set, hash);
60980 + HASH_FIELD(result, set, fibration);
60981 + HASH_FIELD(result, set, sd);
60982 + HASH_FIELD(result, set, dir_item);
60983 + HASH_FIELD(result, set, cipher);
60984 + HASH_FIELD(result, set, digest);
60985 + HASH_FIELD(result, set, compression);
60986 + HASH_FIELD(result, set, compression_mode);
60987 + HASH_FIELD(result, set, cluster);
60988 + HASH_FIELD(result, set, create);
60989 + return result & (PS_TABLE_SIZE - 1);
60990 +}
60991 +
60992 +static inline unsigned long
60993 +pshash(ps_hash_table * table, const unsigned long *a)
60994 +{
60995 + return *a;
60996 +}
60997 +
60998 +/* The hash table definition */
60999 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
61000 +#define KFREE(ptr, size) kfree(ptr)
61001 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
61002 + pseq);
61003 +#undef KFREE
61004 +#undef KMALLOC
61005 +
61006 +static ps_hash_table ps_table;
61007 +static plugin_set empty_set = {
61008 + .hashval = 0,
61009 + .file = NULL,
61010 + .dir = NULL,
61011 + .perm = NULL,
61012 + .formatting = NULL,
61013 + .hash = NULL,
61014 + .fibration = NULL,
61015 + .sd = NULL,
61016 + .dir_item = NULL,
61017 + .cipher = NULL,
61018 + .digest = NULL,
61019 + .compression = NULL,
61020 + .compression_mode = NULL,
61021 + .cluster = NULL,
61022 + .create = NULL,
61023 + .link = {NULL}
61024 +};
61025 +
61026 +plugin_set *plugin_set_get_empty(void)
61027 +{
61028 + return &empty_set;
61029 +}
61030 +
61031 +void plugin_set_put(plugin_set * set)
61032 +{
61033 +}
61034 +
61035 +static inline unsigned long *pset_field(plugin_set * set, int offset)
61036 +{
61037 + return (unsigned long *)(((char *)set) + offset);
61038 +}
61039 +
61040 +static int plugin_set_field(plugin_set ** set, const unsigned long val,
61041 + const int offset)
61042 +{
61043 + unsigned long *spot;
61044 + spinlock_t *lock;
61045 + plugin_set replica;
61046 + plugin_set *twin;
61047 + plugin_set *psal;
61048 + plugin_set *orig;
61049 +
61050 + assert("nikita-2902", set != NULL);
61051 + assert("nikita-2904", *set != NULL);
61052 +
61053 + spot = pset_field(*set, offset);
61054 + if (unlikely(*spot == val))
61055 + return 0;
61056 +
61057 + replica = *(orig = *set);
61058 + *pset_field(&replica, offset) = val;
61059 + replica.hashval = calculate_hash(&replica);
61060 + rcu_read_lock();
61061 + twin = ps_hash_find(&ps_table, &replica.hashval);
61062 + if (unlikely(twin == NULL)) {
61063 + rcu_read_unlock();
61064 + psal = kmem_cache_alloc(plugin_set_slab,
61065 + reiser4_ctx_gfp_mask_get());
61066 + if (psal == NULL)
61067 + return RETERR(-ENOMEM);
61068 + *psal = replica;
61069 + lock = &plugin_set_lock[replica.hashval & 7];
61070 + spin_lock(lock);
61071 + twin = ps_hash_find(&ps_table, &replica.hashval);
61072 + if (likely(twin == NULL)) {
61073 + *set = psal;
61074 + ps_hash_insert_rcu(&ps_table, psal);
61075 + } else {
61076 + *set = twin;
61077 + kmem_cache_free(plugin_set_slab, psal);
61078 + }
61079 + spin_unlock(lock);
61080 + } else {
61081 + rcu_read_unlock();
61082 + *set = twin;
61083 + }
61084 + return 0;
61085 +}
61086 +
61087 +static struct {
61088 + int offset;
61089 + reiser4_plugin_groups groups;
61090 + reiser4_plugin_type type;
61091 +} pset_descr[PSET_LAST] = {
61092 + [PSET_FILE] = {
61093 + .offset = offsetof(plugin_set, file),
61094 + .type = REISER4_FILE_PLUGIN_TYPE,
61095 + .groups = 0
61096 + },
61097 + [PSET_DIR] = {
61098 + .offset = offsetof(plugin_set, dir),
61099 + .type = REISER4_DIR_PLUGIN_TYPE,
61100 + .groups = 0
61101 + },
61102 + [PSET_PERM] = {
61103 + .offset = offsetof(plugin_set, perm),
61104 + .type = REISER4_PERM_PLUGIN_TYPE,
61105 + .groups = 0
61106 + },
61107 + [PSET_FORMATTING] = {
61108 + .offset = offsetof(plugin_set, formatting),
61109 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
61110 + .groups = 0
61111 + },
61112 + [PSET_HASH] = {
61113 + .offset = offsetof(plugin_set, hash),
61114 + .type = REISER4_HASH_PLUGIN_TYPE,
61115 + .groups = 0
61116 + },
61117 + [PSET_FIBRATION] = {
61118 + .offset = offsetof(plugin_set, fibration),
61119 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
61120 + .groups = 0
61121 + },
61122 + [PSET_SD] = {
61123 + .offset = offsetof(plugin_set, sd),
61124 + .type = REISER4_ITEM_PLUGIN_TYPE,
61125 + .groups = (1 << STAT_DATA_ITEM_TYPE)
61126 + },
61127 + [PSET_DIR_ITEM] = {
61128 + .offset = offsetof(plugin_set, dir_item),
61129 + .type = REISER4_ITEM_PLUGIN_TYPE,
61130 + .groups = (1 << DIR_ENTRY_ITEM_TYPE)
61131 + },
61132 + [PSET_CIPHER] = {
61133 + .offset = offsetof(plugin_set, cipher),
61134 + .type = REISER4_CIPHER_PLUGIN_TYPE,
61135 + .groups = 0
61136 + },
61137 + [PSET_DIGEST] = {
61138 + .offset = offsetof(plugin_set, digest),
61139 + .type = REISER4_DIGEST_PLUGIN_TYPE,
61140 + .groups = 0
61141 + },
61142 + [PSET_COMPRESSION] = {
61143 + .offset = offsetof(plugin_set, compression),
61144 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
61145 + .groups = 0
61146 + },
61147 + [PSET_COMPRESSION_MODE] = {
61148 + .offset = offsetof(plugin_set, compression_mode),
61149 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
61150 + .groups = 0
61151 + },
61152 + [PSET_CLUSTER] = {
61153 + .offset = offsetof(plugin_set, cluster),
61154 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
61155 + .groups = 0
61156 + },
61157 + [PSET_CREATE] = {
61158 + .offset = offsetof(plugin_set, create),
61159 + .type = REISER4_FILE_PLUGIN_TYPE,
61160 + .groups = (1 << REISER4_REGULAR_FILE)
61161 + }
61162 +};
61163 +
61164 +#define DEFINE_PSET_OPS(PREFIX) \
61165 + reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
61166 +{ \
61167 + if (memb > PSET_LAST) \
61168 + return REISER4_PLUGIN_TYPES; \
61169 + return pset_descr[memb].type; \
61170 +} \
61171 + \
61172 +int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
61173 + reiser4_plugin * plugin) \
61174 +{ \
61175 + assert("nikita-3492", set != NULL); \
61176 + assert("nikita-3493", *set != NULL); \
61177 + assert("nikita-3494", plugin != NULL); \
61178 + assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
61179 + assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
61180 + \
61181 + if (pset_descr[memb].groups) \
61182 + if (!(pset_descr[memb].groups & plugin->h.groups)) \
61183 + return -EINVAL; \
61184 + \
61185 + return plugin_set_field(set, \
61186 + (unsigned long)plugin, pset_descr[memb].offset); \
61187 +} \
61188 + \
61189 +reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
61190 +{ \
61191 + assert("nikita-3497", set != NULL); \
61192 + assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
61193 + \
61194 + return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
61195 +}
61196 +
61197 +DEFINE_PSET_OPS(aset);
61198 +
61199 +int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) {
61200 + return plugin_set_field(set,
61201 + (unsigned long)plugin, pset_descr[memb].offset);
61202 +}
61203 +
61204 +/**
61205 + * init_plugin_set - create plugin set cache and hash table
61206 + *
61207 + * Initializes slab cache of plugin_set-s and their hash table. It is part of
61208 + * reiser4 module initialization.
61209 + */
61210 +int init_plugin_set(void)
61211 +{
61212 + int result;
61213 +
61214 + result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
61215 + if (result == 0) {
61216 + plugin_set_slab = kmem_cache_create("plugin_set",
61217 + sizeof(plugin_set), 0,
61218 + SLAB_HWCACHE_ALIGN,
61219 + NULL, NULL);
61220 + if (plugin_set_slab == NULL)
61221 + result = RETERR(-ENOMEM);
61222 + }
61223 + return result;
61224 +}
61225 +
61226 +/**
61227 + * done_plugin_set - delete plugin_set cache and plugin_set hash table
61228 + *
61229 + * This is called on reiser4 module unloading or system shutdown.
61230 + */
61231 +void done_plugin_set(void)
61232 +{
61233 + plugin_set *cur, *next;
61234 +
61235 + for_all_in_htable(&ps_table, ps, cur, next) {
61236 + ps_hash_remove(&ps_table, cur);
61237 + kmem_cache_free(plugin_set_slab, cur);
61238 + }
61239 + destroy_reiser4_cache(&plugin_set_slab);
61240 + ps_hash_done(&ps_table);
61241 +}
61242 +
61243 +/*
61244 + * Local variables:
61245 + * c-indentation-style: "K&R"
61246 + * mode-name: "LC"
61247 + * c-basic-offset: 8
61248 + * tab-width: 8
61249 + * fill-column: 120
61250 + * End:
61251 + */
61252 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.20/fs/reiser4/plugin/plugin_set.h
61253 --- linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 03:00:00.000000000 +0300
61254 +++ linux-2.6.20/fs/reiser4/plugin/plugin_set.h 2007-05-06 14:50:43.855024468 +0400
61255 @@ -0,0 +1,77 @@
61256 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61257 +
61258 +/* Reiser4 plugin set definition.
61259 + See fs/reiser4/plugin/plugin_set.c for details */
61260 +
61261 +#if !defined( __PLUGIN_SET_H__ )
61262 +#define __PLUGIN_SET_H__
61263 +
61264 +#include "../type_safe_hash.h"
61265 +#include "plugin.h"
61266 +
61267 +#include <linux/rcupdate.h>
61268 +
61269 +struct plugin_set;
61270 +typedef struct plugin_set plugin_set;
61271 +
61272 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
61273 +
61274 +struct plugin_set {
61275 + unsigned long hashval;
61276 + /* plugin of file */
61277 + file_plugin *file;
61278 + /* plugin of dir */
61279 + dir_plugin *dir;
61280 + /* perm plugin for this file */
61281 + perm_plugin *perm;
61282 + /* tail policy plugin. Only meaningful for regular files */
61283 + formatting_plugin *formatting;
61284 + /* hash plugin. Only meaningful for directories. */
61285 + hash_plugin *hash;
61286 + /* fibration plugin. Only meaningful for directories. */
61287 + fibration_plugin *fibration;
61288 + /* plugin of stat-data */
61289 + item_plugin *sd;
61290 + /* plugin of items a directory is built of */
61291 + item_plugin *dir_item;
61292 + /* cipher plugin */
61293 + cipher_plugin *cipher;
61294 + /* digest plugin */
61295 + digest_plugin *digest;
61296 + /* compression plugin */
61297 + compression_plugin *compression;
61298 + /* compression mode plugin */
61299 + compression_mode_plugin *compression_mode;
61300 + /* cluster plugin */
61301 + cluster_plugin *cluster;
61302 + /* this specifies file plugin of regular children.
61303 + only meaningful for directories */
61304 + file_plugin *create;
61305 + ps_hash_link link;
61306 +};
61307 +
61308 +extern plugin_set *plugin_set_get_empty(void);
61309 +extern void plugin_set_put(plugin_set * set);
61310 +
61311 +extern int init_plugin_set(void);
61312 +extern void done_plugin_set(void);
61313 +
61314 +extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
61315 +extern int set_plugin(plugin_set ** set, pset_member memb,
61316 + reiser4_plugin * plugin);
61317 +extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
61318 + reiser4_plugin * plugin);
61319 +extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
61320 +
61321 +/* __PLUGIN_SET_H__ */
61322 +#endif
61323 +
61324 +/* Make Linus happy.
61325 + Local variables:
61326 + c-indentation-style: "K&R"
61327 + mode-name: "LC"
61328 + c-basic-offset: 8
61329 + tab-width: 8
61330 + fill-column: 120
61331 + End:
61332 +*/
61333 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/security/Makefile linux-2.6.20/fs/reiser4/plugin/security/Makefile
61334 --- linux-2.6.20.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 03:00:00.000000000 +0300
61335 +++ linux-2.6.20/fs/reiser4/plugin/security/Makefile 2007-05-06 14:50:43.855024468 +0400
61336 @@ -0,0 +1,4 @@
61337 +obj-$(CONFIG_REISER4_FS) += security_plugins.o
61338 +
61339 +security_plugins-objs := \
61340 + perm.o
61341 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/security/perm.c linux-2.6.20/fs/reiser4/plugin/security/perm.c
61342 --- linux-2.6.20.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 03:00:00.000000000 +0300
61343 +++ linux-2.6.20/fs/reiser4/plugin/security/perm.c 2007-05-06 14:50:43.859025718 +0400
61344 @@ -0,0 +1,44 @@
61345 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61346 +
61347 +/*
61348 + * this file contains implementation of permission plugins. Currently, only
61349 + * RWX_PERM_ID is implemented
61350 + */
61351 +
61352 +#include "../plugin.h"
61353 +#include "../plugin_header.h"
61354 +#include "../../debug.h"
61355 +
61356 +perm_plugin perm_plugins[LAST_PERM_ID] = {
61357 + [NULL_PERM_ID] = {
61358 + .h = {
61359 + .type_id = REISER4_PERM_PLUGIN_TYPE,
61360 + .id = NULL_PERM_ID,
61361 + .pops = NULL,
61362 + .label = "null",
61363 + .desc = "stub permission plugin",
61364 + .linkage = {NULL, NULL}
61365 + },
61366 + .read_ok = NULL,
61367 + .write_ok = NULL,
61368 + .lookup_ok = NULL,
61369 + .create_ok = NULL,
61370 + .link_ok = NULL,
61371 + .unlink_ok = NULL,
61372 + .delete_ok = NULL,
61373 + .mask_ok = NULL,
61374 + .setattr_ok = NULL,
61375 + .getattr_ok = NULL,
61376 + .rename_ok = NULL,
61377 + }
61378 +};
61379 +
61380 +/*
61381 + * Local variables:
61382 + * c-indentation-style: "K&R"
61383 + * mode-name: "LC"
61384 + * c-basic-offset: 8
61385 + * tab-width: 8
61386 + * fill-column: 79
61387 + * End:
61388 + */
61389 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/security/perm.h linux-2.6.20/fs/reiser4/plugin/security/perm.h
61390 --- linux-2.6.20.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 03:00:00.000000000 +0300
61391 +++ linux-2.6.20/fs/reiser4/plugin/security/perm.h 2007-05-06 14:50:43.859025718 +0400
61392 @@ -0,0 +1,82 @@
61393 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61394 +
61395 +/* Perm (short for "permissions") plugins common stuff. */
61396 +
61397 +#if !defined( __REISER4_PERM_H__ )
61398 +#define __REISER4_PERM_H__
61399 +
61400 +#include "../../forward.h"
61401 +#include "../plugin_header.h"
61402 +
61403 +#include <linux/types.h>
61404 +#include <linux/fs.h> /* for struct file */
61405 +#include <linux/dcache.h> /* for struct dentry */
61406 +
61407 +/* interface for perm plugin.
61408 +
61409 + Perm plugin method can be implemented through:
61410 +
61411 + 1. consulting ->i_mode bits in stat data
61412 +
61413 + 2. obtaining acl from the tree and inspecting it
61414 +
61415 + 3. asking some kernel module or user-level program to authorize access.
61416 +
61417 + This allows for integration with things like capabilities, SELinux-style
61418 +   security contexts, etc.
61419 +
61420 +*/
61421 +/* NIKITA-FIXME-HANS: define what this is targeted for. It does not seem to be intended for use with sys_reiser4. Explain. */
61422 +typedef struct perm_plugin {
61423 + /* generic plugin fields */
61424 + plugin_header h;
61425 +
61426 + /* check permissions for read/write */
61427 + int (*read_ok) (struct file *file, const char __user *buf,
61428 + size_t size, loff_t *off);
61429 + int (*write_ok) (struct file *file, const char __user *buf,
61430 + size_t size, loff_t *off);
61431 +
61432 + /* check permissions for lookup */
61433 + int (*lookup_ok) (struct inode * parent, struct dentry * dentry);
61434 +
61435 + /* check permissions for create */
61436 + int (*create_ok) (struct inode * parent, struct dentry * dentry,
61437 + reiser4_object_create_data * data);
61438 +
61439 + /* check permissions for linking @where to @existing */
61440 + int (*link_ok) (struct dentry * existing, struct inode * parent,
61441 + struct dentry * where);
61442 +
61443 + /* check permissions for unlinking @victim from @parent */
61444 + int (*unlink_ok) (struct inode * parent, struct dentry * victim);
61445 +
61446 + /* check permissions for deletion of @object whose last reference is
61447 + by @parent */
61448 + int (*delete_ok) (struct inode * parent, struct dentry * victim);
61449 + int (*mask_ok) (struct inode * inode, int mask);
61450 + /* check whether attribute change is acceptable */
61451 + int (*setattr_ok) (struct dentry * dentry, struct iattr * attr);
61452 +
61453 + /* check whether stat(2) is allowed */
61454 + int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG,
61455 + struct dentry * dentry, struct kstat * stat);
61456 + /* check whether rename(2) is allowed */
61457 + int (*rename_ok) (struct inode * old_dir, struct dentry * old,
61458 + struct inode * new_dir, struct dentry * new);
61459 +} perm_plugin;
61460 +
61461 +typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
61462 +
61463 +/* __REISER4_PERM_H__ */
61464 +#endif
61465 +
61466 +/* Make Linus happy.
61467 + Local variables:
61468 + c-indentation-style: "K&R"
61469 + mode-name: "LC"
61470 + c-basic-offset: 8
61471 + tab-width: 8
61472 + fill-column: 120
61473 + End:
61474 +*/
61475 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.20/fs/reiser4/plugin/space/bitmap.c
61476 --- linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 03:00:00.000000000 +0300
61477 +++ linux-2.6.20/fs/reiser4/plugin/space/bitmap.c 2007-05-06 14:50:43.859025718 +0400
61478 @@ -0,0 +1,1585 @@
61479 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61480 +
61481 +#include "../../debug.h"
61482 +#include "../../dformat.h"
61483 +#include "../../txnmgr.h"
61484 +#include "../../jnode.h"
61485 +#include "../../block_alloc.h"
61486 +#include "../../tree.h"
61487 +#include "../../super.h"
61488 +#include "../plugin.h"
61489 +#include "space_allocator.h"
61490 +#include "bitmap.h"
61491 +
61492 +#include <linux/types.h>
61493 +#include <linux/fs.h> /* for struct super_block */
61494 +#include <linux/mutex.h>
61495 +#include <asm/div64.h>
61496 +
61497 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
61498 + * blocks
61499 +
61500 + A useful optimization of reiser4 bitmap handling would be dynamic bitmap
61501 + blocks loading/unloading which is different from v3.x where all bitmap
61502 + blocks are loaded at mount time.
61503 +
61504 + To implement bitmap blocks unloading we need to count bitmap block usage
61505 + and detect currently unused blocks allowing them to be unloaded. It is not
61506 + a simple task since we allow several threads to modify one bitmap block
61507 + simultaneously.
61508 +
61509 + Briefly speaking, the following schema is proposed: we count in special
61510 + variable associated with each bitmap block. That is for counting of block
61511 + alloc/dealloc operations on that bitmap block. With a deferred block
61512 + deallocation feature of reiser4 all those operation will be represented in
61513 + atom dirty/deleted lists as jnodes for freshly allocated or deleted
61514 + nodes.
61515 +
61516 + So, we increment usage counter for each new node allocated or deleted, and
61517 + decrement it at atom commit one time for each node from the dirty/deleted
61518 + atom's list. Of course, freshly allocated node deletion and node reusing
61519 + from atom deleted (if we do so) list should decrement bitmap usage counter
61520 + also.
61521 +
61522 + This schema seems to be working but that reference counting is
61523 + not easy to debug. I think we should agree with Hans and do not implement
61524 + it in v4.0. Current code implements "on-demand" bitmap blocks loading only.
61525 +
61526 + For simplicity all bitmap nodes (both commit and working bitmap blocks) are
61527 + loaded into memory on fs mount time or each bitmap nodes are loaded at the
61528 + first access to it, the "dont_load_bitmap" mount option controls whether
61529 +   bitmap nodes should be loaded at mount time. Dynamic unloading of bitmap
61530 + nodes currently is not supported. */
61531 +
61532 +#define CHECKSUM_SIZE 4
61533 +
61534 +#define BYTES_PER_LONG (sizeof(long))
61535 +
61536 +#if BITS_PER_LONG == 64
61537 +# define LONG_INT_SHIFT (6)
61538 +#else
61539 +# define LONG_INT_SHIFT (5)
61540 +#endif
61541 +
61542 +#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
61543 +
61544 +typedef unsigned long ulong_t;
61545 +
61546 +#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
61547 +#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
61548 +
61549 +/* Block allocation/deallocation are done through special bitmap objects which
61550 + are allocated in an array at fs mount. */
61551 +struct bitmap_node {
61552 + struct mutex mutex; /* long term lock object */
61553 +
61554 + jnode *wjnode; /* j-nodes for WORKING ... */
61555 + jnode *cjnode; /* ... and COMMIT bitmap blocks */
61556 +
61557 + bmap_off_t first_zero_bit; /* for skip_busy option implementation */
61558 +
61559 + atomic_t loaded; /* a flag which shows that bnode is loaded
61560 + * already */
61561 +};
61562 +
61563 +static inline char *bnode_working_data(struct bitmap_node *bnode)
61564 +{
61565 + char *data;
61566 +
61567 + data = jdata(bnode->wjnode);
61568 + assert("zam-429", data != NULL);
61569 +
61570 + return data + CHECKSUM_SIZE;
61571 +}
61572 +
61573 +static inline char *bnode_commit_data(const struct bitmap_node *bnode)
61574 +{
61575 + char *data;
61576 +
61577 + data = jdata(bnode->cjnode);
61578 + assert("zam-430", data != NULL);
61579 +
61580 + return data + CHECKSUM_SIZE;
61581 +}
61582 +
61583 +static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
61584 +{
61585 + char *data;
61586 +
61587 + data = jdata(bnode->cjnode);
61588 + assert("vpf-261", data != NULL);
61589 +
61590 + return le32_to_cpu(get_unaligned((d32 *)data));
61591 +}
61592 +
61593 +static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
61594 +{
61595 + char *data;
61596 +
61597 + data = jdata(bnode->cjnode);
61598 + assert("vpf-261", data != NULL);
61599 +
61600 + put_unaligned(cpu_to_le32(crc), (d32 *)data);
61601 +}
61602 +
61603 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
61604 + * written the code, does this added abstraction still have */
61605 +/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the
61606 + * reiser4_space_allocator structure) */
61607 +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
61608 +/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
61609 + * someday?". What they about? If there is a reason to have a union, it should
61610 + * be a union, if not, it should not be a union. "..might be someday" means no
61611 + * reason. */
61612 +struct bitmap_allocator_data {
61613 + /* an array for bitmap blocks direct access */
61614 + struct bitmap_node *bitmap;
61615 +};
61616 +
61617 +#define get_barray(super) \
61618 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
61619 +
61620 +#define get_bnode(super, i) (get_barray(super) + i)
61621 +
61622 +/* allocate and initialize jnode with JNODE_BITMAP type */
61623 +static jnode *bnew(void)
61624 +{
61625 + jnode *jal = jalloc();
61626 +
61627 + if (jal)
61628 + jnode_init(jal, current_tree, JNODE_BITMAP);
61629 +
61630 + return jal;
61631 +}
61632 +
61633 +/* this file contains:
61634 + - bitmap based implementation of space allocation plugin
61635 + - all the helper functions like set bit, find_first_zero_bit, etc */
61636 +
61637 +/* Audited by: green(2002.06.12) */
61638 +static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
61639 +{
61640 + ulong_t mask = 1UL << start_bit;
61641 + int i = start_bit;
61642 +
61643 + while ((word & mask) != 0) {
61644 + mask <<= 1;
61645 + if (++i >= BITS_PER_LONG)
61646 + break;
61647 + }
61648 +
61649 + return i;
61650 +}
61651 +
61652 +#include <asm/bitops.h>
61653 +
61654 +#if BITS_PER_LONG == 64
61655 +
61656 +#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
61657 +#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
61658 +
61659 +static inline void reiser4_set_bit(int nr, void *addr)
61660 +{
61661 + ext2_set_bit(nr + OFF(addr), BASE(addr));
61662 +}
61663 +
61664 +static inline void reiser4_clear_bit(int nr, void *addr)
61665 +{
61666 + ext2_clear_bit(nr + OFF(addr), BASE(addr));
61667 +}
61668 +
61669 +static inline int reiser4_test_bit(int nr, void *addr)
61670 +{
61671 + return ext2_test_bit(nr + OFF(addr), BASE(addr));
61672 +}
61673 +static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
61674 + int offset)
61675 +{
61676 + int off = OFF(addr);
61677 +
61678 + return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
61679 + offset + off) - off;
61680 +}
61681 +
61682 +#else
61683 +
61684 +#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
61685 +#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
61686 +#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
61687 +
61688 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
61689 +ext2_find_next_zero_bit(addr, maxoffset, offset)
61690 +#endif
61691 +
61692 +/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets
61693 + * are counted from @addr, return the offset of the first bit if it is found,
61694 + * @maxoffset otherwise. */
61695 +static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61696 + bmap_off_t start_offset)
61697 +{
61698 + ulong_t *base = addr;
61699 + /* start_offset is in bits, convert it to byte offset within bitmap. */
61700 + int word_nr = start_offset >> LONG_INT_SHIFT;
61701 + /* bit number within the byte. */
61702 + int bit_nr = start_offset & LONG_INT_MASK;
61703 + int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
61704 +
61705 + assert("zam-387", max_offset != 0);
61706 +
61707 + /* Unaligned @start_offset case. */
61708 + if (bit_nr != 0) {
61709 + bmap_nr_t nr;
61710 +
61711 + nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
61712 +
61713 + if (nr < BITS_PER_LONG)
61714 + return (word_nr << LONG_INT_SHIFT) + nr;
61715 +
61716 + ++word_nr;
61717 + }
61718 +
61719 +	/* Fast scan through aligned words. */
61720 + while (word_nr <= max_word_nr) {
61721 + if (base[word_nr] != 0) {
61722 + return (word_nr << LONG_INT_SHIFT)
61723 + + find_next_zero_bit_in_word(~(base[word_nr]), 0);
61724 + }
61725 +
61726 + ++word_nr;
61727 + }
61728 +
61729 + return max_offset;
61730 +}
61731 +
61732 +#if BITS_PER_LONG == 64
61733 +
61734 +static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61735 + bmap_off_t start_offset)
61736 +{
61737 + bmap_off_t off = OFF(addr);
61738 +
61739 + return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
61740 + start_offset + off) - off;
61741 +}
61742 +
61743 +#else
61744 +#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
61745 + __reiser4_find_next_set_bit(addr, max_offset, start_offset)
61746 +#endif
61747 +
61748 +/* search for the first set bit in single word. */
61749 +static int find_last_set_bit_in_word(ulong_t word, int start_bit)
61750 +{
61751 + ulong_t bit_mask;
61752 + int nr = start_bit;
61753 +
61754 + assert("zam-965", start_bit < BITS_PER_LONG);
61755 + assert("zam-966", start_bit >= 0);
61756 +
61757 + bit_mask = (1UL << nr);
61758 +
61759 + while (bit_mask != 0) {
61760 + if (bit_mask & word)
61761 + return nr;
61762 + bit_mask >>= 1;
61763 + nr--;
61764 + }
61765 + return BITS_PER_LONG;
61766 +}
61767 +
61768 +/* Search bitmap for a set bit in backward direction from the end to the
61769 + * beginning of given region
61770 + *
61771 + * @result: result offset of the last set bit
61772 + * @addr: base memory address,
61773 + * @low_off: low end of the search region, edge bit included into the region,
61774 + * @high_off: high end of the search region, edge bit included into the region,
61775 + *
61776 + * @return: 0 - set bit was found, -1 otherwise.
61777 + */
61778 +static int
61779 +reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61780 + bmap_off_t high_off)
61781 +{
61782 + ulong_t *base = addr;
61783 + int last_word;
61784 + int first_word;
61785 + int last_bit;
61786 + int nr;
61787 +
61788 + assert("zam-962", high_off >= low_off);
61789 +
61790 + last_word = high_off >> LONG_INT_SHIFT;
61791 + last_bit = high_off & LONG_INT_MASK;
61792 + first_word = low_off >> LONG_INT_SHIFT;
61793 +
61794 + if (last_bit < BITS_PER_LONG) {
61795 + nr = find_last_set_bit_in_word(base[last_word], last_bit);
61796 + if (nr < BITS_PER_LONG) {
61797 + *result = (last_word << LONG_INT_SHIFT) + nr;
61798 + return 0;
61799 + }
61800 + --last_word;
61801 + }
61802 + while (last_word >= first_word) {
61803 + if (base[last_word] != 0x0) {
61804 + last_bit =
61805 + find_last_set_bit_in_word(base[last_word],
61806 + BITS_PER_LONG - 1);
61807 + assert("zam-972", last_bit < BITS_PER_LONG);
61808 + *result = (last_word << LONG_INT_SHIFT) + last_bit;
61809 + return 0;
61810 + }
61811 + --last_word;
61812 + }
61813 +
61814 + return -1; /* set bit not found */
61815 +}
61816 +
61817 +/* Search bitmap for a clear bit in backward direction from the end to the
61818 + * beginning of given region */
61819 +static int
61820 +reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61821 + bmap_off_t high_off)
61822 +{
61823 + ulong_t *base = addr;
61824 + int last_word;
61825 + int first_word;
61826 + int last_bit;
61827 + int nr;
61828 +
61829 + last_word = high_off >> LONG_INT_SHIFT;
61830 + last_bit = high_off & LONG_INT_MASK;
61831 + first_word = low_off >> LONG_INT_SHIFT;
61832 +
61833 + if (last_bit < BITS_PER_LONG) {
61834 + nr = find_last_set_bit_in_word(~base[last_word], last_bit);
61835 + if (nr < BITS_PER_LONG) {
61836 + *result = (last_word << LONG_INT_SHIFT) + nr;
61837 + return 0;
61838 + }
61839 + --last_word;
61840 + }
61841 + while (last_word >= first_word) {
61842 + if (base[last_word] != (ulong_t) (-1)) {
61843 + *result = (last_word << LONG_INT_SHIFT) +
61844 + find_last_set_bit_in_word(~base[last_word],
61845 + BITS_PER_LONG - 1);
61846 + return 0;
61847 + }
61848 + --last_word;
61849 + }
61850 +
61851 + return -1; /* zero bit not found */
61852 +}
61853 +
61854 +/* Audited by: green(2002.06.12) */
61855 +static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
61856 +{
61857 + int first_byte;
61858 + int last_byte;
61859 +
61860 + unsigned char first_byte_mask = 0xFF;
61861 + unsigned char last_byte_mask = 0xFF;
61862 +
61863 + assert("zam-410", start < end);
61864 +
61865 + first_byte = start >> 3;
61866 + last_byte = (end - 1) >> 3;
61867 +
61868 + if (last_byte > first_byte + 1)
61869 + memset(addr + first_byte + 1, 0,
61870 + (size_t) (last_byte - first_byte - 1));
61871 +
61872 + first_byte_mask >>= 8 - (start & 0x7);
61873 + last_byte_mask <<= ((end - 1) & 0x7) + 1;
61874 +
61875 + if (first_byte == last_byte) {
61876 + addr[first_byte] &= (first_byte_mask | last_byte_mask);
61877 + } else {
61878 + addr[first_byte] &= first_byte_mask;
61879 + addr[last_byte] &= last_byte_mask;
61880 + }
61881 +}
61882 +
61883 +/* Audited by: green(2002.06.12) */
61884 +/* ZAM-FIXME-HANS: comment this */
61885 +static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
61886 +{
61887 + int first_byte;
61888 + int last_byte;
61889 +
61890 + unsigned char first_byte_mask = 0xFF;
61891 + unsigned char last_byte_mask = 0xFF;
61892 +
61893 + assert("zam-386", start < end);
61894 +
61895 + first_byte = start >> 3;
61896 + last_byte = (end - 1) >> 3;
61897 +
61898 + if (last_byte > first_byte + 1)
61899 + memset(addr + first_byte + 1, 0xFF,
61900 + (size_t) (last_byte - first_byte - 1));
61901 +
61902 + first_byte_mask <<= start & 0x7;
61903 + last_byte_mask >>= 7 - ((end - 1) & 0x7);
61904 +
61905 + if (first_byte == last_byte) {
61906 + addr[first_byte] |= (first_byte_mask & last_byte_mask);
61907 + } else {
61908 + addr[first_byte] |= first_byte_mask;
61909 + addr[last_byte] |= last_byte_mask;
61910 + }
61911 +}
61912 +
61913 +#define ADLER_BASE 65521
61914 +#define ADLER_NMAX 5552
61915 +
61916 +/* Calculates the adler32 checksum for the data pointed by `data` of the
61917 + length `len`. This function was originally taken from zlib, version 1.1.3,
61918 + July 9th, 1998.
61919 +
61920 + Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
61921 +
61922 + This software is provided 'as-is', without any express or implied
61923 + warranty. In no event will the authors be held liable for any damages
61924 + arising from the use of this software.
61925 +
61926 + Permission is granted to anyone to use this software for any purpose,
61927 + including commercial applications, and to alter it and redistribute it
61928 + freely, subject to the following restrictions:
61929 +
61930 + 1. The origin of this software must not be misrepresented; you must not
61931 + claim that you wrote the original software. If you use this software
61932 + in a product, an acknowledgment in the product documentation would be
61933 + appreciated but is not required.
61934 + 2. Altered source versions must be plainly marked as such, and must not be
61935 + misrepresented as being the original software.
61936 + 3. This notice may not be removed or altered from any source distribution.
61937 +
61938 + Jean-loup Gailly Mark Adler
61939 + jloup@gzip.org madler@alumni.caltech.edu
61940 +
61941 + The above comment applies only to the reiser4_adler32 function.
61942 +*/
61943 +
61944 +__u32 reiser4_adler32(char *data, __u32 len)
61945 +{
61946 + unsigned char *t = data;
61947 + __u32 s1 = 1;
61948 + __u32 s2 = 0;
61949 + int k;
61950 +
61951 + while (len > 0) {
61952 + k = len < ADLER_NMAX ? len : ADLER_NMAX;
61953 + len -= k;
61954 +
61955 + while (k--) {
61956 + s1 += *t++;
61957 + s2 += s1;
61958 + }
61959 +
61960 + s1 %= ADLER_BASE;
61961 + s2 %= ADLER_BASE;
61962 + }
61963 + return (s2 << 16) | s1;
61964 +}
61965 +
61966 +#define sb_by_bnode(bnode) \
61967 + ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
61968 +
61969 +static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
61970 +{
61971 + return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
61972 +}
61973 +
61974 +static int
61975 +bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
61976 +{
61977 + if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
61978 + bmap_nr_t bmap;
61979 +
61980 + bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
61981 +
61982 + warning("vpf-263",
61983 + "Checksum for the bitmap block %llu is incorrect",
61984 + bmap);
61985 +
61986 + return RETERR(-EIO);
61987 + }
61988 +
61989 + return 0;
61990 +}
61991 +
61992 +#define REISER4_CHECK_BMAP_CRC (0)
61993 +
61994 +#if REISER4_CHECK_BMAP_CRC
61995 +static int bnode_check_crc(const struct bitmap_node *bnode)
61996 +{
61997 + return bnode_check_adler32(bnode,
61998 + bmap_size(sb_by_bnode(bnode)->s_blocksize));
61999 +}
62000 +
62001 +/* REISER4_CHECK_BMAP_CRC */
62002 +#else
62003 +
62004 +#define bnode_check_crc(bnode) (0)
62005 +
62006 +/* REISER4_CHECK_BMAP_CRC */
62007 +#endif
62008 +
62009 +/* Recalculates the adler32 checksum for only 1 byte change.
62010 + adler - previous adler checksum
62011 + old_data, data - old, new byte values.
62012 + tail == (chunk - offset) : length, checksum was calculated for, - offset of
62013 + the changed byte within this chunk.
62014 + This function can be used for checksum calculation optimisation.
62015 +*/
62016 +
62017 +static __u32
62018 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
62019 + __u32 tail)
62020 +{
62021 + __u32 delta = data - old_data + 2 * ADLER_BASE;
62022 + __u32 s1 = adler & 0xffff;
62023 + __u32 s2 = (adler >> 16) & 0xffff;
62024 +
62025 + s1 = (delta + s1) % ADLER_BASE;
62026 + s2 = (delta * tail + s2) % ADLER_BASE;
62027 +
62028 + return (s2 << 16) | s1;
62029 +}
62030 +
62031 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
62032 +
62033 +/**
62034 + * get_nr_bitmap - calculate number of bitmap blocks
62035 + * @super: super block with initialized blocksize and block count
62036 + *
62037 + * Calculates number of bitmap blocks of a filesystem which uses bitmaps to
62038 + * maintain free disk space. It assumes that each bitmap addresses the same
62039 + * number of blocks which is calculated by bmap_block_count macro defined in
62040 + * above. Number of blocks in the filesystem has to be initialized in reiser4
62041 + * private data of super block already so that it can be obtained via
62042 + * reiser4_block_count(). Unfortunately, number of blocks addressed by a bitmap
62043 + * is not power of 2 because 4 bytes are used for checksum. Therefore, we have
62044 + * to use special function to divide and modulo 64bits filesystem block
62045 + * counters.
62046 + *
62047 + * Example: suppose filesystem have 32768 blocks. Blocksize is 4096. Each bitmap
62048 + * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
62049 + * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
62050 + */
62051 +static bmap_nr_t get_nr_bmap(const struct super_block *super)
62052 +{
62053 + u64 quotient;
62054 +
62055 + assert("zam-393", reiser4_block_count(super) != 0);
62056 +
62057 + quotient = reiser4_block_count(super) - 1;
62058 + do_div(quotient, bmap_bit_count(super->s_blocksize));
62059 + return quotient + 1;
62060 +}
62061 +
62062 +/**
62063 + * parse_blocknr - calculate bitmap number and offset in it by block number
62064 + * @block: pointer to block number to calculate location in bitmap of
62065 + * @bmap: pointer where to store bitmap block number
62066 + * @offset: pointer where to store offset within bitmap block
62067 + *
62068 + * Calculates location of bit which is responsible for allocation/freeing of
62069 + * block @*block. That location is represented by bitmap block number and offset
62070 + * within that bitmap block.
62071 + */
62072 +static void
62073 +parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
62074 + bmap_off_t *offset)
62075 +{
62076 + struct super_block *super = get_current_context()->super;
62077 + u64 quotient = *block;
62078 +
62079 + *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
62080 + *bmap = quotient;
62081 +
62082 + assert("zam-433", *bmap < get_nr_bmap(super));
62083 + assert("", *offset < bmap_bit_count(super->s_blocksize));
62084 +}
62085 +
62086 +#if REISER4_DEBUG
62087 +/* Audited by: green(2002.06.12) */
62088 +static void
62089 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
62090 +{
62091 + struct super_block *sb = reiser4_get_current_sb();
62092 +
62093 + assert("zam-436", sb != NULL);
62094 +
62095 + assert("zam-455", start != NULL);
62096 + assert("zam-437", *start != 0);
62097 + assert("zam-541", !reiser4_blocknr_is_fake(start));
62098 + assert("zam-441", *start < reiser4_block_count(sb));
62099 +
62100 + if (len != NULL) {
62101 + assert("zam-438", *len != 0);
62102 + assert("zam-442", *start + *len <= reiser4_block_count(sb));
62103 + }
62104 +}
62105 +
62106 +static void check_bnode_loaded(const struct bitmap_node *bnode)
62107 +{
62108 + assert("zam-485", bnode != NULL);
62109 + assert("zam-483", jnode_page(bnode->wjnode) != NULL);
62110 + assert("zam-484", jnode_page(bnode->cjnode) != NULL);
62111 + assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
62112 + assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
62113 +}
62114 +
62115 +#else
62116 +
62117 +# define check_block_range(start, len) do { /* nothing */} while(0)
62118 +# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
62119 +
62120 +#endif
62121 +
62122 +/* modify bnode->first_zero_bit (if we free bits before); bnode should be
62123 + spin-locked */
62124 +static inline void
62125 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
62126 +{
62127 + if (offset < bnode->first_zero_bit)
62128 + bnode->first_zero_bit = offset;
62129 +}
62130 +
62131 +/* return a physical disk address for logical bitmap number @bmap */
62132 +/* FIXME-VS: this is somehow related to disk layout? */
62133 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
62134 + * per block allocation so that performance is not affected. Probably this
62135 + * whole file should be considered part of the disk layout plugin, and other
62136 + * disk layouts can use other defines and efficiency will not be significantly
62137 + * affected. */
62138 +
62139 +#define REISER4_FIRST_BITMAP_BLOCK \
62140 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
62141 +
62142 +/* Audited by: green(2002.06.12) */
62143 +static void
62144 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
62145 + reiser4_block_nr * bnr)
62146 +{
62147 +
62148 + assert("zam-390", bmap < get_nr_bmap(super));
62149 +
62150 +#ifdef CONFIG_REISER4_BADBLOCKS
62151 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
62152 + /* Check if the diskmap have this already, first. */
62153 + if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
62154 + return; /* Found it in diskmap */
62155 +#endif
62156 + /* FIXME_ZAM: before discussing of disk layouts and disk format
62157 + plugins I implement bitmap location scheme which is close to scheme
62158 + used in reiser 3.6 */
62159 + if (bmap == 0) {
62160 + *bnr = REISER4_FIRST_BITMAP_BLOCK;
62161 + } else {
62162 + *bnr = bmap * bmap_bit_count(super->s_blocksize);
62163 + }
62164 +}
62165 +
62166 +/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
62167 +/* Audited by: green(2002.06.12) */
62168 +static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
62169 +{
62170 + *bnr =
62171 + (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
62172 + REISER4_BITMAP_BLOCKS_STATUS_VALUE);
62173 +}
62174 +
62175 +/* bnode structure initialization */
62176 +static void
62177 +init_bnode(struct bitmap_node *bnode,
62178 + struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
62179 +{
62180 + memset(bnode, 0, sizeof(struct bitmap_node));
62181 +
62182 + mutex_init(&bnode->mutex);
62183 + atomic_set(&bnode->loaded, 0);
62184 +}
62185 +
62186 +static void release(jnode * node)
62187 +{
62188 + jrelse(node);
62189 + JF_SET(node, JNODE_HEARD_BANSHEE);
62190 + jput(node);
62191 +}
62192 +
62193 +/* This function is for internal bitmap.c use because it assumes that jnode is
62194 + in under full control of this thread */
62195 +static void done_bnode(struct bitmap_node *bnode)
62196 +{
62197 + if (bnode) {
62198 + atomic_set(&bnode->loaded, 0);
62199 + if (bnode->wjnode != NULL)
62200 + release(bnode->wjnode);
62201 + if (bnode->cjnode != NULL)
62202 + release(bnode->cjnode);
62203 + bnode->wjnode = bnode->cjnode = NULL;
62204 + }
62205 +}
62206 +
62207 +/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/
62208 +static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
62209 + jnode **wjnode_ret)
62210 +{
62211 + struct super_block *super;
62212 + jnode *cjnode;
62213 + jnode *wjnode;
62214 + bmap_nr_t bmap;
62215 + int ret;
62216 +
62217 + super = reiser4_get_current_sb();
62218 +
62219 + *wjnode_ret = wjnode = bnew();
62220 + if (wjnode == NULL) {
62221 + *cjnode_ret = NULL;
62222 + return RETERR(-ENOMEM);
62223 + }
62224 +
62225 + *cjnode_ret = cjnode = bnew();
62226 + if (cjnode == NULL)
62227 + return RETERR(-ENOMEM);
62228 +
62229 + bmap = bnode - get_bnode(super, 0);
62230 +
62231 + get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
62232 + get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
62233 +
62234 + jref(cjnode);
62235 + jref(wjnode);
62236 +
62237 + /* load commit bitmap */
62238 + ret = jload_gfp(cjnode, GFP_NOFS, 1);
62239 +
62240 + if (ret)
62241 + goto error;
62242 +
62243 + /* allocate memory for working bitmap block. Note that for
62244 + * bitmaps jinit_new() doesn't actually modifies node content,
62245 + * so parallel calls to this are ok. */
62246 + ret = jinit_new(wjnode, GFP_NOFS);
62247 +
62248 + if (ret != 0) {
62249 + jrelse(cjnode);
62250 + goto error;
62251 + }
62252 +
62253 + return 0;
62254 +
62255 + error:
62256 + jput(cjnode);
62257 + jput(wjnode);
62258 + *wjnode_ret = *cjnode_ret = NULL;
62259 + return ret;
62260 +
62261 +}
62262 +
62263 +/* Check the bnode data on read. */
62264 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
62265 +{
62266 + void *data;
62267 + int ret;
62268 +
62269 + /* Check CRC */
62270 + ret = bnode_check_adler32(bnode, blksize);
62271 +
62272 + if (ret) {
62273 + return ret;
62274 + }
62275 +
62276 + data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
62277 +
62278 + /* Check the very first bit -- it must be busy. */
62279 + if (!reiser4_test_bit(0, data)) {
62280 + warning("vpf-1362", "The allocator block %llu is not marked "
62281 + "as used.", (unsigned long long)bnode->cjnode->blocknr);
62282 +
62283 + return -EINVAL;
62284 + }
62285 +
62286 + return 0;
62287 +}
62288 +
62289 +/* load bitmap blocks "on-demand" */
62290 +static int load_and_lock_bnode(struct bitmap_node *bnode)
62291 +{
62292 + int ret;
62293 +
62294 + jnode *cjnode;
62295 + jnode *wjnode;
62296 +
62297 + assert("nikita-3040", reiser4_schedulable());
62298 +
62299 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
62300 + * need to be atomic, right? Just leave a comment that if bitmaps were
62301 + * unloadable, this would need to be atomic. */
62302 + if (atomic_read(&bnode->loaded)) {
62303 + /* bitmap is already loaded, nothing to do */
62304 + check_bnode_loaded(bnode);
62305 + mutex_lock(&bnode->mutex);
62306 + assert("nikita-2827", atomic_read(&bnode->loaded));
62307 + return 0;
62308 + }
62309 +
62310 + ret = prepare_bnode(bnode, &cjnode, &wjnode);
62311 + if (ret == 0) {
62312 + mutex_lock(&bnode->mutex);
62313 +
62314 + if (!atomic_read(&bnode->loaded)) {
62315 + assert("nikita-2822", cjnode != NULL);
62316 + assert("nikita-2823", wjnode != NULL);
62317 + assert("nikita-2824", jnode_is_loaded(cjnode));
62318 + assert("nikita-2825", jnode_is_loaded(wjnode));
62319 +
62320 + bnode->wjnode = wjnode;
62321 + bnode->cjnode = cjnode;
62322 +
62323 + ret = check_struct_bnode(bnode, current_blocksize);
62324 + if (!ret) {
62325 + cjnode = wjnode = NULL;
62326 + atomic_set(&bnode->loaded, 1);
62327 + /* working bitmap is initialized by on-disk
62328 + * commit bitmap. This should be performed
62329 + * under mutex. */
62330 + memcpy(bnode_working_data(bnode),
62331 + bnode_commit_data(bnode),
62332 + bmap_size(current_blocksize));
62333 + } else
62334 + mutex_unlock(&bnode->mutex);
62335 + } else
62336 + /* race: someone already loaded bitmap while we were
62337 + * busy initializing data. */
62338 + check_bnode_loaded(bnode);
62339 + }
62340 +
62341 + if (wjnode != NULL) {
62342 + release(wjnode);
62343 + bnode->wjnode = NULL;
62344 + }
62345 + if (cjnode != NULL) {
62346 + release(cjnode);
62347 + bnode->cjnode = NULL;
62348 + }
62349 +
62350 + return ret;
62351 +}
62352 +
62353 +static void release_and_unlock_bnode(struct bitmap_node *bnode)
62354 +{
62355 + check_bnode_loaded(bnode);
62356 + mutex_unlock(&bnode->mutex);
62357 +}
62358 +
62359 +/* This function does all block allocation work but only for one bitmap
62360 + block.*/
62361 +/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
62362 + block responsibility zone boundaries. This had no sense in v3.6 but may
62363 + have it in v4.x */
62364 +/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
62365 +static int
62366 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
62367 + bmap_off_t max_offset, int min_len, int max_len)
62368 +{
62369 + struct super_block *super = get_current_context()->super;
62370 + struct bitmap_node *bnode = get_bnode(super, bmap);
62371 +
62372 + char *data;
62373 +
62374 + bmap_off_t search_end;
62375 + bmap_off_t start;
62376 + bmap_off_t end;
62377 +
62378 + int set_first_zero_bit = 0;
62379 +
62380 + int ret;
62381 +
62382 + assert("zam-364", min_len > 0);
62383 + assert("zam-365", max_len >= min_len);
62384 + assert("zam-366", *offset <= max_offset);
62385 +
62386 + ret = load_and_lock_bnode(bnode);
62387 +
62388 + if (ret)
62389 + return ret;
62390 +
62391 + data = bnode_working_data(bnode);
62392 +
62393 + start = *offset;
62394 +
62395 + if (bnode->first_zero_bit >= start) {
62396 + start = bnode->first_zero_bit;
62397 + set_first_zero_bit = 1;
62398 + }
62399 +
62400 + while (start + min_len < max_offset) {
62401 +
62402 + start =
62403 + reiser4_find_next_zero_bit((long *)data, max_offset, start);
62404 + if (set_first_zero_bit) {
62405 + bnode->first_zero_bit = start;
62406 + set_first_zero_bit = 0;
62407 + }
62408 + if (start >= max_offset)
62409 + break;
62410 +
62411 + search_end = LIMIT(start + max_len, max_offset);
62412 + end =
62413 + reiser4_find_next_set_bit((long *)data, search_end, start);
62414 + if (end >= start + min_len) {
62415 + /* we can't trust find_next_set_bit result if set bit
62416 + was not fount, result may be bigger than
62417 + max_offset */
62418 + if (end > search_end)
62419 + end = search_end;
62420 +
62421 + ret = end - start;
62422 + *offset = start;
62423 +
62424 + reiser4_set_bits(data, start, end);
62425 +
62426 + /* FIXME: we may advance first_zero_bit if [start,
62427 + end] region overlaps the first_zero_bit point */
62428 +
62429 + break;
62430 + }
62431 +
62432 + start = end + 1;
62433 + }
62434 +
62435 + release_and_unlock_bnode(bnode);
62436 +
62437 + return ret;
62438 +}
62439 +
62440 +static int
62441 +search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
62442 + bmap_off_t end_offset, int min_len, int max_len)
62443 +{
62444 + struct super_block *super = get_current_context()->super;
62445 + struct bitmap_node *bnode = get_bnode(super, bmap);
62446 + char *data;
62447 + bmap_off_t start;
62448 + int ret;
62449 +
62450 + assert("zam-958", min_len > 0);
62451 + assert("zam-959", max_len >= min_len);
62452 + assert("zam-960", *start_offset >= end_offset);
62453 +
62454 + ret = load_and_lock_bnode(bnode);
62455 + if (ret)
62456 + return ret;
62457 +
62458 + data = bnode_working_data(bnode);
62459 + start = *start_offset;
62460 +
62461 + while (1) {
62462 + bmap_off_t end, search_end;
62463 +
62464 + /* Find the beginning of the zero filled region */
62465 + if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
62466 + break;
62467 + /* Is there more than `min_len' bits from `start' to
62468 + * `end_offset'? */
62469 + if (start < end_offset + min_len - 1)
62470 + break;
62471 +
62472 + /* Do not search to `end_offset' if we need to find less than
62473 + * `max_len' zero bits. */
62474 + if (end_offset + max_len - 1 < start)
62475 + search_end = start - max_len + 1;
62476 + else
62477 + search_end = end_offset;
62478 +
62479 + if (reiser4_find_last_set_bit(&end, data, search_end, start))
62480 + end = search_end;
62481 + else
62482 + end++;
62483 +
62484 + if (end + min_len <= start + 1) {
62485 + if (end < search_end)
62486 + end = search_end;
62487 + ret = start - end + 1;
62488 + *start_offset = end; /* `end' is lowest offset */
62489 + assert("zam-987",
62490 + reiser4_find_next_set_bit(data, start + 1,
62491 + end) >= start + 1);
62492 + reiser4_set_bits(data, end, start + 1);
62493 + break;
62494 + }
62495 +
62496 + if (end <= end_offset)
62497 + /* left search boundary reached. */
62498 + break;
62499 + start = end - 1;
62500 + }
62501 +
62502 + release_and_unlock_bnode(bnode);
62503 + return ret;
62504 +}
62505 +
62506 +/* allocate contiguous range of blocks in bitmap */
62507 +static int bitmap_alloc_forward(reiser4_block_nr * start,
62508 + const reiser4_block_nr * end, int min_len,
62509 + int max_len)
62510 +{
62511 + bmap_nr_t bmap, end_bmap;
62512 + bmap_off_t offset, end_offset;
62513 + int len;
62514 +
62515 + reiser4_block_nr tmp;
62516 +
62517 + struct super_block *super = get_current_context()->super;
62518 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62519 +
62520 + parse_blocknr(start, &bmap, &offset);
62521 +
62522 + tmp = *end - 1;
62523 + parse_blocknr(&tmp, &end_bmap, &end_offset);
62524 + ++end_offset;
62525 +
62526 + assert("zam-358", end_bmap >= bmap);
62527 + assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
62528 +
62529 + for (; bmap < end_bmap; bmap++, offset = 0) {
62530 + len =
62531 + search_one_bitmap_forward(bmap, &offset, max_offset,
62532 + min_len, max_len);
62533 + if (len != 0)
62534 + goto out;
62535 + }
62536 +
62537 + len =
62538 + search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
62539 + max_len);
62540 + out:
62541 + *start = bmap * max_offset + offset;
62542 + return len;
62543 +}
62544 +
62545 +/* allocate contiguous range of blocks in bitmap (from @start to @end in
62546 + * backward direction) */
62547 +static int bitmap_alloc_backward(reiser4_block_nr * start,
62548 + const reiser4_block_nr * end, int min_len,
62549 + int max_len)
62550 +{
62551 + bmap_nr_t bmap, end_bmap;
62552 + bmap_off_t offset, end_offset;
62553 + int len;
62554 + struct super_block *super = get_current_context()->super;
62555 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62556 +
62557 + parse_blocknr(start, &bmap, &offset);
62558 + parse_blocknr(end, &end_bmap, &end_offset);
62559 +
62560 + assert("zam-961", end_bmap <= bmap);
62561 + assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
62562 +
62563 + for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
62564 + len =
62565 + search_one_bitmap_backward(bmap, &offset, 0, min_len,
62566 + max_len);
62567 + if (len != 0)
62568 + goto out;
62569 + }
62570 +
62571 + len =
62572 + search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
62573 + max_len);
62574 + out:
62575 + *start = bmap * max_offset + offset;
62576 + return len;
62577 +}
62578 +
62579 +/* plugin->u.space_allocator.alloc_blocks() */
62580 +static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
62581 + reiser4_block_nr *start, reiser4_block_nr *len)
62582 +{
62583 + struct super_block *super = get_current_context()->super;
62584 + int actual_len;
62585 +
62586 + reiser4_block_nr search_start;
62587 + reiser4_block_nr search_end;
62588 +
62589 + assert("zam-398", super != NULL);
62590 + assert("zam-412", hint != NULL);
62591 + assert("zam-397", hint->blk <= reiser4_block_count(super));
62592 +
62593 + if (hint->max_dist == 0)
62594 + search_end = reiser4_block_count(super);
62595 + else
62596 + search_end =
62597 + LIMIT(hint->blk + hint->max_dist,
62598 + reiser4_block_count(super));
62599 +
62600 + /* We use @hint -> blk as a search start and search from it to the end
62601 + of the disk or in given region if @hint -> max_dist is not zero */
62602 + search_start = hint->blk;
62603 +
62604 + actual_len =
62605 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62606 +
62607 + /* There is only one bitmap search if max_dist was specified or first
62608 + pass was from the beginning of the bitmap. We also do one pass for
62609 + scanning bitmap in backward direction. */
62610 + if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
62611 + /* next step is a scanning from 0 to search_start */
62612 + search_end = search_start;
62613 + search_start = 0;
62614 + actual_len =
62615 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62616 + }
62617 + if (actual_len == 0)
62618 + return RETERR(-ENOSPC);
62619 + if (actual_len < 0)
62620 + return RETERR(actual_len);
62621 + *len = actual_len;
62622 + *start = search_start;
62623 + return 0;
62624 +}
62625 +
62626 +static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
62627 + reiser4_block_nr * start,
62628 + reiser4_block_nr * len)
62629 +{
62630 + reiser4_block_nr search_start;
62631 + reiser4_block_nr search_end;
62632 + int actual_len;
62633 +
62634 + ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
62635 +
62636 + assert("zam-969", super != NULL);
62637 + assert("zam-970", hint != NULL);
62638 + assert("zam-971", hint->blk <= reiser4_block_count(super));
62639 +
62640 + search_start = hint->blk;
62641 + if (hint->max_dist == 0 || search_start <= hint->max_dist)
62642 + search_end = 0;
62643 + else
62644 + search_end = search_start - hint->max_dist;
62645 +
62646 + actual_len =
62647 + bitmap_alloc_backward(&search_start, &search_end, 1, needed);
62648 + if (actual_len == 0)
62649 + return RETERR(-ENOSPC);
62650 + if (actual_len < 0)
62651 + return RETERR(actual_len);
62652 + *len = actual_len;
62653 + *start = search_start;
62654 + return 0;
62655 +}
62656 +
62657 +/* plugin->u.space_allocator.alloc_blocks() */
62658 +int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
62659 + reiser4_blocknr_hint * hint, int needed,
62660 + reiser4_block_nr * start, reiser4_block_nr * len)
62661 +{
62662 + if (hint->backward)
62663 + return alloc_blocks_backward(hint, needed, start, len);
62664 + return alloc_blocks_forward(hint, needed, start, len);
62665 +}
62666 +
62667 +/* plugin->u.space_allocator.dealloc_blocks(). */
62668 +/* It just frees blocks in WORKING BITMAP. Usually formatted an unformatted
62669 + nodes deletion is deferred until transaction commit. However, deallocation
62670 + of temporary objects like wandered blocks and transaction commit records
62671 + requires immediate node deletion from WORKING BITMAP.*/
62672 +void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
62673 + reiser4_block_nr start, reiser4_block_nr len)
62674 +{
62675 + struct super_block *super = reiser4_get_current_sb();
62676 +
62677 + bmap_nr_t bmap;
62678 + bmap_off_t offset;
62679 +
62680 + struct bitmap_node *bnode;
62681 + int ret;
62682 +
62683 + assert("zam-468", len != 0);
62684 + check_block_range(&start, &len);
62685 +
62686 + parse_blocknr(&start, &bmap, &offset);
62687 +
62688 + assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
62689 +
62690 + bnode = get_bnode(super, bmap);
62691 +
62692 + assert("zam-470", bnode != NULL);
62693 +
62694 + ret = load_and_lock_bnode(bnode);
62695 + assert("zam-481", ret == 0);
62696 +
62697 + reiser4_clear_bits(bnode_working_data(bnode), offset,
62698 + (bmap_off_t) (offset + len));
62699 +
62700 + adjust_first_zero_bit(bnode, offset);
62701 +
62702 + release_and_unlock_bnode(bnode);
62703 +}
62704 +
62705 +/* plugin->u.space_allocator.check_blocks(). */
62706 +void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
62707 + const reiser4_block_nr * len, int desired)
62708 +{
62709 +#if REISER4_DEBUG
62710 + struct super_block *super = reiser4_get_current_sb();
62711 +
62712 + bmap_nr_t bmap;
62713 + bmap_off_t start_offset;
62714 + bmap_off_t end_offset;
62715 +
62716 + struct bitmap_node *bnode;
62717 + int ret;
62718 +
62719 + assert("zam-622", len != NULL);
62720 + check_block_range(start, len);
62721 + parse_blocknr(start, &bmap, &start_offset);
62722 +
62723 + end_offset = start_offset + *len;
62724 + assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
62725 +
62726 + bnode = get_bnode(super, bmap);
62727 +
62728 + assert("nikita-2215", bnode != NULL);
62729 +
62730 + ret = load_and_lock_bnode(bnode);
62731 + assert("zam-626", ret == 0);
62732 +
62733 + assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
62734 +
62735 + if (desired) {
62736 + assert("zam-623",
62737 + reiser4_find_next_zero_bit(bnode_working_data(bnode),
62738 + end_offset, start_offset)
62739 + >= end_offset);
62740 + } else {
62741 + assert("zam-624",
62742 + reiser4_find_next_set_bit(bnode_working_data(bnode),
62743 + end_offset, start_offset)
62744 + >= end_offset);
62745 + }
62746 +
62747 + release_and_unlock_bnode(bnode);
62748 +#endif
62749 +}
62750 +
62751 +/* conditional insertion of @node into atom's overwrite set if it was not there */
62752 +static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
62753 +{
62754 + assert("zam-546", atom != NULL);
62755 + assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
62756 + assert("zam-548", node != NULL);
62757 +
62758 + spin_lock_atom(atom);
62759 + spin_lock_jnode(node);
62760 +
62761 + if (node->atom == NULL) {
62762 + JF_SET(node, JNODE_OVRWR);
62763 + insert_into_atom_ovrwr_list(atom, node);
62764 + } else {
62765 + assert("zam-549", node->atom == atom);
62766 + }
62767 +
62768 + spin_unlock_jnode(node);
62769 + spin_unlock_atom(atom);
62770 +}
62771 +
62772 +/* an actor which applies delete set to COMMIT bitmap pages and link modified
62773 + pages in a single-linked list */
62774 +static int
62775 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
62776 + const reiser4_block_nr * len, void *data)
62777 +{
62778 +
62779 + bmap_nr_t bmap;
62780 + bmap_off_t offset;
62781 + int ret;
62782 +
62783 + long long *blocks_freed_p = data;
62784 +
62785 + struct bitmap_node *bnode;
62786 +
62787 + struct super_block *sb = reiser4_get_current_sb();
62788 +
62789 + check_block_range(start, len);
62790 +
62791 + parse_blocknr(start, &bmap, &offset);
62792 +
62793 + /* FIXME-ZAM: we assume that all block ranges are allocated by this
62794 + bitmap-based allocator and each block range can't go over a zone of
62795 + responsibility of one bitmap block; same assumption is used in
62796 + other journal hooks in bitmap code. */
62797 + bnode = get_bnode(sb, bmap);
62798 + assert("zam-448", bnode != NULL);
62799 +
62800 + /* it is safe to unlock atom with is in ASTAGE_PRE_COMMIT */
62801 + assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
62802 + ret = load_and_lock_bnode(bnode);
62803 + if (ret)
62804 + return ret;
62805 +
62806 + /* put bnode into atom's overwrite set */
62807 + cond_add_to_overwrite_set(atom, bnode->cjnode);
62808 +
62809 + data = bnode_commit_data(bnode);
62810 +
62811 + ret = bnode_check_crc(bnode);
62812 + if (ret != 0)
62813 + return ret;
62814 +
62815 + if (len != NULL) {
62816 + /* FIXME-ZAM: a check that all bits are set should be there */
62817 + assert("zam-443",
62818 + offset + *len <= bmap_bit_count(sb->s_blocksize));
62819 + reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
62820 +
62821 + (*blocks_freed_p) += *len;
62822 + } else {
62823 + reiser4_clear_bit(offset, data);
62824 + (*blocks_freed_p)++;
62825 + }
62826 +
62827 + bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
62828 +
62829 + release_and_unlock_bnode(bnode);
62830 +
62831 + return 0;
62832 +}
62833 +
62834 +/* plugin->u.space_allocator.pre_commit_hook(). */
62835 +/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
62836 + rest is done by transaction manager (allocate wandered locations for COMMIT
62837 + BITMAP blocks, copy COMMIT BITMAP blocks data). */
62838 +/* Only one instance of this function can be running at one given time, because
62839 + only one transaction can be committed a time, therefore it is safe to access
62840 + some global variables without any locking */
62841 +
62842 +int reiser4_pre_commit_hook_bitmap(void)
62843 +{
62844 + struct super_block *super = reiser4_get_current_sb();
62845 + txn_atom *atom;
62846 +
62847 + long long blocks_freed = 0;
62848 +
62849 + atom = get_current_atom_locked();
62850 + assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
62851 + spin_unlock_atom(atom);
62852 +
62853 + { /* scan atom's captured list and find all freshly allocated nodes,
62854 + * mark corresponded bits in COMMIT BITMAP as used */
62855 + struct list_head *head = ATOM_CLEAN_LIST(atom);
62856 + jnode *node = list_entry(head->next, jnode, capture_link);
62857 +
62858 + while (head != &node->capture_link) {
62859 + /* we detect freshly allocated jnodes */
62860 + if (JF_ISSET(node, JNODE_RELOC)) {
62861 + int ret;
62862 + bmap_nr_t bmap;
62863 +
62864 + bmap_off_t offset;
62865 + bmap_off_t index;
62866 + struct bitmap_node *bn;
62867 + __u32 size = bmap_size(super->s_blocksize);
62868 + __u32 crc;
62869 + char byte;
62870 +
62871 + assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
62872 + assert("zam-460",
62873 + !reiser4_blocknr_is_fake(&node->blocknr));
62874 +
62875 + parse_blocknr(&node->blocknr, &bmap, &offset);
62876 + bn = get_bnode(super, bmap);
62877 +
62878 + index = offset >> 3;
62879 + assert("vpf-276", index < size);
62880 +
62881 + ret = bnode_check_crc(bn);
62882 + if (ret != 0)
62883 + return ret;
62884 +
62885 + check_bnode_loaded(bn);
62886 + load_and_lock_bnode(bn);
62887 +
62888 + byte = *(bnode_commit_data(bn) + index);
62889 + reiser4_set_bit(offset, bnode_commit_data(bn));
62890 +
62891 + crc = adler32_recalc(bnode_commit_crc(bn), byte,
62892 + *(bnode_commit_data(bn) +
62893 + index),
62894 + size - index);
62895 + bnode_set_commit_crc(bn, crc);
62896 +
62897 + release_and_unlock_bnode(bn);
62898 +
62899 + ret = bnode_check_crc(bn);
62900 + if (ret != 0)
62901 + return ret;
62902 +
62903 + /* working of this depends on how it inserts
62904 + new j-node into clean list, because we are
62905 + scanning the same list now. It is OK, if
62906 + insertion is done to the list front */
62907 + cond_add_to_overwrite_set(atom, bn->cjnode);
62908 + }
62909 +
62910 + node = list_entry(node->capture_link.next, jnode, capture_link);
62911 + }
62912 + }
62913 +
62914 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
62915 + &blocks_freed, 0);
62916 +
62917 + blocks_freed -= atom->nr_blocks_allocated;
62918 +
62919 + {
62920 + reiser4_super_info_data *sbinfo;
62921 +
62922 + sbinfo = get_super_private(super);
62923 +
62924 + spin_lock_reiser4_super(sbinfo);
62925 + sbinfo->blocks_free_committed += blocks_freed;
62926 + spin_unlock_reiser4_super(sbinfo);
62927 + }
62928 +
62929 + return 0;
62930 +}
62931 +
62932 +/* plugin->u.space_allocator.init_allocator
62933 + constructor of reiser4_space_allocator object. It is called on fs mount */
62934 +int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
62935 + struct super_block *super, void *arg)
62936 +{
62937 + struct bitmap_allocator_data *data = NULL;
62938 + bmap_nr_t bitmap_blocks_nr;
62939 + bmap_nr_t i;
62940 +
62941 + assert("nikita-3039", reiser4_schedulable());
62942 +
62943 + /* getting memory for bitmap allocator private data holder */
62944 + data =
62945 + kmalloc(sizeof(struct bitmap_allocator_data),
62946 + reiser4_ctx_gfp_mask_get());
62947 +
62948 + if (data == NULL)
62949 + return RETERR(-ENOMEM);
62950 +
62951 + /* allocation and initialization for the array of bnodes */
62952 + bitmap_blocks_nr = get_nr_bmap(super);
62953 +
62954 + /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
62955 + which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
62956 + may I never meet someone who still uses the ia32 architecture when
62957 + storage devices of that size enter the market, and wants to use ia32
62958 + with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
62959 + probably, another dynamic data structure should replace a static
62960 + array of bnodes. */
62961 + /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
62962 + data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
62963 + if (data->bitmap == NULL) {
62964 + kfree(data);
62965 + return RETERR(-ENOMEM);
62966 + }
62967 +
62968 + for (i = 0; i < bitmap_blocks_nr; i++)
62969 + init_bnode(data->bitmap + i, super, i);
62970 +
62971 + allocator->u.generic = data;
62972 +
62973 +#if REISER4_DEBUG
62974 + get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
62975 +#endif
62976 +
62977 + /* Load all bitmap blocks at mount time. */
62978 + if (!test_bit
62979 + (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
62980 + __u64 start_time, elapsed_time;
62981 + struct bitmap_node *bnode;
62982 + int ret;
62983 +
62984 + if (REISER4_DEBUG)
62985 + printk(KERN_INFO "loading reiser4 bitmap...");
62986 + start_time = jiffies;
62987 +
62988 + for (i = 0; i < bitmap_blocks_nr; i++) {
62989 + bnode = data->bitmap + i;
62990 + ret = load_and_lock_bnode(bnode);
62991 + if (ret) {
62992 + reiser4_destroy_allocator_bitmap(allocator,
62993 + super);
62994 + return ret;
62995 + }
62996 + release_and_unlock_bnode(bnode);
62997 + }
62998 +
62999 + elapsed_time = jiffies - start_time;
63000 + if (REISER4_DEBUG)
63001 + printk("...done (%llu jiffies)\n",
63002 + (unsigned long long)elapsed_time);
63003 + }
63004 +
63005 + return 0;
63006 +}
63007 +
63008 +/* plugin->u.space_allocator.destroy_allocator
63009 + destructor. It is called on fs unmount */
63010 +int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
63011 + struct super_block *super)
63012 +{
63013 + bmap_nr_t bitmap_blocks_nr;
63014 + bmap_nr_t i;
63015 +
63016 + struct bitmap_allocator_data *data = allocator->u.generic;
63017 +
63018 + assert("zam-414", data != NULL);
63019 + assert("zam-376", data->bitmap != NULL);
63020 +
63021 + bitmap_blocks_nr = get_nr_bmap(super);
63022 +
63023 + for (i = 0; i < bitmap_blocks_nr; i++) {
63024 + struct bitmap_node *bnode = data->bitmap + i;
63025 +
63026 + mutex_lock(&bnode->mutex);
63027 +
63028 +#if REISER4_DEBUG
63029 + if (atomic_read(&bnode->loaded)) {
63030 + jnode *wj = bnode->wjnode;
63031 + jnode *cj = bnode->cjnode;
63032 +
63033 + assert("zam-480", jnode_page(cj) != NULL);
63034 + assert("zam-633", jnode_page(wj) != NULL);
63035 +
63036 + assert("zam-634",
63037 + memcmp(jdata(wj), jdata(cj),
63038 + bmap_size(super->s_blocksize)) == 0);
63039 +
63040 + }
63041 +#endif
63042 + done_bnode(bnode);
63043 + mutex_unlock(&bnode->mutex);
63044 + }
63045 +
63046 + vfree(data->bitmap);
63047 + kfree(data);
63048 +
63049 + allocator->u.generic = NULL;
63050 +
63051 + return 0;
63052 +}
63053 +
63054 +/*
63055 + * Local variables:
63056 + * c-indentation-style: "K&R"
63057 + * mode-name: "LC"
63058 + * c-basic-offset: 8
63059 + * tab-width: 8
63060 + * fill-column: 79
63061 + * scroll-step: 1
63062 + * End:
63063 + */
63064 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.20/fs/reiser4/plugin/space/bitmap.h
63065 --- linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 03:00:00.000000000 +0300
63066 +++ linux-2.6.20/fs/reiser4/plugin/space/bitmap.h 2007-05-06 14:50:43.863026968 +0400
63067 @@ -0,0 +1,47 @@
63068 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63069 +
63070 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
63071 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
63072 +
63073 +#include "../../dformat.h"
63074 +#include "../../block_alloc.h"
63075 +
63076 +#include <linux/types.h> /* for __u?? */
63077 +#include <linux/fs.h> /* for struct super_block */
63078 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
63079 +/* declarations of functions implementing methods of space allocator plugin for
63080 + bitmap based allocator. The functions themselves are in bitmap.c */
63081 +extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
63082 + struct super_block *, void *);
63083 +extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
63084 + struct super_block *);
63085 +extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
63086 + reiser4_blocknr_hint *, int needed,
63087 + reiser4_block_nr * start,
63088 + reiser4_block_nr * len);
63089 +extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
63090 + const reiser4_block_nr *, int);
63091 +extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
63092 + reiser4_block_nr,
63093 + reiser4_block_nr);
63094 +extern int reiser4_pre_commit_hook_bitmap(void);
63095 +
63096 +#define reiser4_post_commit_hook_bitmap() do{}while(0)
63097 +#define reiser4_post_write_back_hook_bitmap() do{}while(0)
63098 +#define reiser4_print_info_bitmap(pref, al) do{}while(0)
63099 +
63100 +typedef __u64 bmap_nr_t;
63101 +typedef __u32 bmap_off_t;
63102 +
63103 +#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
63104 +
63105 +/* Make Linus happy.
63106 + Local variables:
63107 + c-indentation-style: "K&R"
63108 + mode-name: "LC"
63109 + c-basic-offset: 8
63110 + tab-width: 8
63111 + fill-column: 120
63112 + scroll-step: 1
63113 + End:
63114 +*/
63115 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/Makefile linux-2.6.20/fs/reiser4/plugin/space/Makefile
63116 --- linux-2.6.20.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 03:00:00.000000000 +0300
63117 +++ linux-2.6.20/fs/reiser4/plugin/space/Makefile 2007-05-06 14:50:43.863026968 +0400
63118 @@ -0,0 +1,4 @@
63119 +obj-$(CONFIG_REISER4_FS) += space_plugins.o
63120 +
63121 +space_plugins-objs := \
63122 + bitmap.o
63123 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.20/fs/reiser4/plugin/space/space_allocator.h
63124 --- linux-2.6.20.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 03:00:00.000000000 +0300
63125 +++ linux-2.6.20/fs/reiser4/plugin/space/space_allocator.h 2007-05-06 14:50:43.863026968 +0400
63126 @@ -0,0 +1,80 @@
63127 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63128 +
63129 +#ifndef __SPACE_ALLOCATOR_H__
63130 +#define __SPACE_ALLOCATOR_H__
63131 +
63132 +#include "../../forward.h"
63133 +#include "bitmap.h"
63134 +/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
63135 + * but... */
63136 +#define DEF_SPACE_ALLOCATOR(allocator) \
63137 + \
63138 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
63139 +{ \
63140 + return reiser4_init_allocator_##allocator (al, s, opaque); \
63141 +} \
63142 + \
63143 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
63144 +{ \
63145 + reiser4_destroy_allocator_##allocator (al, s); \
63146 +} \
63147 + \
63148 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
63149 + int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
63150 +{ \
63151 + return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \
63152 +} \
63153 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
63154 +{ \
63155 + reiser4_dealloc_blocks_##allocator (al, start, len); \
63156 +} \
63157 + \
63158 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
63159 +{ \
63160 + reiser4_check_blocks_##allocator (start, end, desired); \
63161 +} \
63162 + \
63163 +static inline void sa_pre_commit_hook (void) \
63164 +{ \
63165 + reiser4_pre_commit_hook_##allocator (); \
63166 +} \
63167 + \
63168 +static inline void sa_post_commit_hook (void) \
63169 +{ \
63170 + reiser4_post_commit_hook_##allocator (); \
63171 +} \
63172 + \
63173 +static inline void sa_post_write_back_hook (void) \
63174 +{ \
63175 + reiser4_post_write_back_hook_##allocator(); \
63176 +} \
63177 + \
63178 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
63179 +{ \
63180 + reiser4_print_info_##allocator (prefix, al); \
63181 +}
63182 +
63183 +DEF_SPACE_ALLOCATOR(bitmap)
63184 +
63185 +/* this object is part of reiser4 private in-core super block */
63186 +struct reiser4_space_allocator {
63187 + union {
63188 + /* space allocators might use this pointer to reference their
63189 + * data. */
63190 + void *generic;
63191 + } u;
63192 +};
63193 +
63194 +/* __SPACE_ALLOCATOR_H__ */
63195 +#endif
63196 +
63197 +/* Make Linus happy.
63198 + Local variables:
63199 + c-indentation-style: "K&R"
63200 + mode-name: "LC"
63201 + c-basic-offset: 8
63202 + tab-width: 8
63203 + fill-column: 120
63204 + scroll-step: 1
63205 + End:
63206 +*/
63207 diff -urN linux-2.6.20.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.20/fs/reiser4/plugin/tail_policy.c
63208 --- linux-2.6.20.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 03:00:00.000000000 +0300
63209 +++ linux-2.6.20/fs/reiser4/plugin/tail_policy.c 2007-05-06 14:50:43.863026968 +0400
63210 @@ -0,0 +1,113 @@
63211 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63212 + * reiser4/README */
63213 +
63214 +/* Formatting policy plugins */
63215 +
63216 +/*
63217 + * Formatting policy plugin is used by object plugin (of regular file) to
63218 + * convert file between two representations.
63219 + *
63220 + * Currently following policies are implemented:
63221 + * never store file in formatted nodes
63222 + * always store file in formatted nodes
63223 + * store file in formatted nodes if file is smaller than 4 blocks (default)
63224 + */
63225 +
63226 +#include "../tree.h"
63227 +#include "../inode.h"
63228 +#include "../super.h"
63229 +#include "object.h"
63230 +#include "plugin.h"
63231 +#include "node/node.h"
63232 +#include "plugin_header.h"
63233 +
63234 +#include <linux/pagemap.h>
63235 +#include <linux/fs.h> /* For struct inode */
63236 +
63237 +/**
63238 + * have_formatting_never -
63239 + * @inode:
63240 + * @size:
63241 + *
63242 + *
63243 + */
63244 +/* Never store file's tail as direct item */
63245 +/* Audited by: green(2002.06.12) */
63246 +static int have_formatting_never(const struct inode *inode UNUSED_ARG
63247 + /* inode to operate on */ ,
63248 + loff_t size UNUSED_ARG /* new object size */ )
63249 +{
63250 + return 0;
63251 +}
63252 +
63253 +/* Always store file's tail as direct item */
63254 +/* Audited by: green(2002.06.12) */
63255 +static int
63256 +have_formatting_always(const struct inode *inode UNUSED_ARG
63257 + /* inode to operate on */ ,
63258 + loff_t size UNUSED_ARG /* new object size */ )
63259 +{
63260 + return 1;
63261 +}
63262 +
63263 +/* This function tests whether the file denoted by @inode should be stored as tails only
63264 + or as extents only. */
63265 +static int
63266 +have_formatting_default(const struct inode *inode UNUSED_ARG
63267 + /* inode to operate on */ ,
63268 + loff_t size /* new object size */ )
63269 +{
63270 + assert("umka-1253", inode != NULL);
63271 +
63272 + if (size > inode->i_sb->s_blocksize * 4)
63273 + return 0;
63274 +
63275 + return 1;
63276 +}
63277 +
63278 +/* tail plugins */
63279 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
63280 + [NEVER_TAILS_FORMATTING_ID] = {
63281 + .h = {
63282 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63283 + .id = NEVER_TAILS_FORMATTING_ID,
63284 + .pops = NULL,
63285 + .label = "never",
63286 + .desc = "Never store file's tail",
63287 + .linkage = {NULL, NULL}
63288 + },
63289 + .have_tail = have_formatting_never
63290 + },
63291 + [ALWAYS_TAILS_FORMATTING_ID] = {
63292 + .h = {
63293 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63294 + .id = ALWAYS_TAILS_FORMATTING_ID,
63295 + .pops = NULL,
63296 + .label = "always",
63297 + .desc = "Always store file's tail",
63298 + .linkage = {NULL, NULL}
63299 + },
63300 + .have_tail = have_formatting_always
63301 + },
63302 + [SMALL_FILE_FORMATTING_ID] = {
63303 + .h = {
63304 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63305 + .id = SMALL_FILE_FORMATTING_ID,
63306 + .pops = NULL,
63307 + .label = "4blocks",
63308 + .desc = "store files shorter than 4 blocks in tail items",
63309 + .linkage = {NULL, NULL}
63310 + },
63311 + .have_tail = have_formatting_default
63312 + }
63313 +};
63314 +
63315 +/*
63316 + * Local variables:
63317 + * c-indentation-style: "K&R"
63318 + * mode-name: "LC"
63319 + * c-basic-offset: 8
63320 + * tab-width: 8
63321 + * fill-column: 79
63322 + * End:
63323 + */
63324 diff -urN linux-2.6.20.orig/fs/reiser4/pool.c linux-2.6.20/fs/reiser4/pool.c
63325 --- linux-2.6.20.orig/fs/reiser4/pool.c 1970-01-01 03:00:00.000000000 +0300
63326 +++ linux-2.6.20/fs/reiser4/pool.c 2007-05-06 14:50:43.863026968 +0400
63327 @@ -0,0 +1,234 @@
63328 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63329 + * reiser4/README */
63330 +
63331 +/* Fast pool allocation.
63332 +
63333 + There are situations when some sub-system normally asks memory allocator
63334 + for only few objects, but under some circumstances could require much
63335 + more. Typical and actually motivating example is tree balancing. It needs
63336 + to keep track of nodes that were involved into it, and it is well-known
63337 + that in reasonable packed balanced tree most (92.938121%) percent of all
63338 + balancings end up after working with only few nodes (3.141592 on
63339 + average). But in rare cases balancing can involve much more nodes
63340 + (3*tree_height+1 in extremal situation).
63341 +
63342 + On the one hand, we don't want to resort to dynamic allocation (slab,
63343 + malloc(), etc.) to allocate data structures required to keep track of
63344 + nodes during balancing. On the other hand, we cannot statically allocate
63345 + required amount of space on the stack, because first: it is useless wastage
63346 + of precious resource, and second: this amount is unknown in advance (tree
63347 + height can change).
63348 +
63349 + Pools, implemented in this file are solution for this problem:
63350 +
63351 + - some configurable amount of objects is statically preallocated on the
63352 + stack
63353 +
63354 + - if this preallocated pool is exhausted and more objects are requested
63355 + they are allocated dynamically.
63356 +
63357 + Pools encapsulate distinction between statically and dynamically allocated
63358 + objects. Both allocation and recycling look exactly the same.
63359 +
63360 + To keep track of dynamically allocated objects, pool adds its own linkage
63361 + to each object.
63362 +
63363 + NOTE-NIKITA This linkage also contains some balancing-specific data. This
63364 + is not perfect. On the other hand, balancing is currently the only client
63365 + of pool code.
63366 +
63367 + NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
63368 + functions in the style of tslist/tshash, i.e., make them unreadable, but
63369 + type-safe.
63370 +
63371 +*/
63372 +
63373 +#include "debug.h"
63374 +#include "pool.h"
63375 +#include "super.h"
63376 +
63377 +#include <linux/types.h>
63378 +#include <linux/err.h>
63379 +
63380 +/* initialize new pool object */
63381 +static void reiser4_init_pool_obj(reiser4_pool_header * h /* pool object to
63382 + * initialize */ )
63383 +{
63384 + INIT_LIST_HEAD(&h->usage_linkage);
63385 + INIT_LIST_HEAD(&h->level_linkage);
63386 + INIT_LIST_HEAD(&h->extra_linkage);
63387 +}
63388 +
63389 +/* initialize new pool */
63390 +void reiser4_init_pool(reiser4_pool * pool /* pool to initialize */ ,
63391 + size_t obj_size /* size of objects in @pool */ ,
63392 + int num_of_objs /* number of preallocated objects */ ,
63393 + char *data /* area for preallocated objects */ )
63394 +{
63395 + reiser4_pool_header *h;
63396 + int i;
63397 +
63398 + assert("nikita-955", pool != NULL);
63399 + assert("nikita-1044", obj_size > 0);
63400 + assert("nikita-956", num_of_objs >= 0);
63401 + assert("nikita-957", data != NULL);
63402 +
63403 + memset(pool, 0, sizeof *pool);
63404 + pool->obj_size = obj_size;
63405 + pool->data = data;
63406 + INIT_LIST_HEAD(&pool->free);
63407 + INIT_LIST_HEAD(&pool->used);
63408 + INIT_LIST_HEAD(&pool->extra);
63409 + memset(data, 0, obj_size * num_of_objs);
63410 + for (i = 0; i < num_of_objs; ++i) {
63411 + h = (reiser4_pool_header *) (data + i * obj_size);
63412 + reiser4_init_pool_obj(h);
63413 + /* add pool header to the end of pool's free list */
63414 + list_add_tail(&h->usage_linkage, &pool->free);
63415 + }
63416 +}
63417 +
63418 +/* release pool resources
63419 +
63420 + Release all resources acquired by this pool, specifically, dynamically
63421 + allocated objects.
63422 +
63423 +*/
63424 +void reiser4_done_pool(reiser4_pool * pool UNUSED_ARG /* pool to destroy */ )
63425 +{
63426 +}
63427 +
63428 +/* allocate carry object from pool
63429 +
63430 + First, try to get preallocated object. If this fails, resort to dynamic
63431 + allocation.
63432 +
63433 +*/
63434 +static void *reiser4_pool_alloc(reiser4_pool * pool /* pool to allocate object
63435 + * from */ )
63436 +{
63437 + reiser4_pool_header *result;
63438 +
63439 + assert("nikita-959", pool != NULL);
63440 +
63441 + if (!list_empty(&pool->free)) {
63442 + struct list_head *linkage;
63443 +
63444 + linkage = pool->free.next;
63445 + list_del(linkage);
63446 + INIT_LIST_HEAD(linkage);
63447 + result = list_entry(linkage, reiser4_pool_header, usage_linkage);
63448 + BUG_ON(!list_empty(&result->level_linkage) ||
63449 + !list_empty(&result->extra_linkage));
63450 + } else {
63451 + /* pool is empty. Extra allocations don't deserve dedicated
63452 + slab to be served from, as they are expected to be rare. */
63453 + result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
63454 + if (result != 0) {
63455 + reiser4_init_pool_obj(result);
63456 + list_add(&result->extra_linkage, &pool->extra);
63457 + } else
63458 + return ERR_PTR(RETERR(-ENOMEM));
63459 + BUG_ON(!list_empty(&result->usage_linkage) ||
63460 + !list_empty(&result->level_linkage));
63461 + }
63462 + ++pool->objs;
63463 + list_add(&result->usage_linkage, &pool->used);
63464 + memset(result + 1, 0, pool->obj_size - sizeof *result);
63465 + return result;
63466 +}
63467 +
63468 +/* return object back to the pool */
63469 +void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h /* pool to return object back
63470 + * into */ )
63471 +{
63472 + assert("nikita-961", h != NULL);
63473 + assert("nikita-962", pool != NULL);
63474 +
63475 + --pool->objs;
63476 + assert("nikita-963", pool->objs >= 0);
63477 +
63478 + list_del_init(&h->usage_linkage);
63479 + list_del_init(&h->level_linkage);
63480 +
63481 + if (list_empty(&h->extra_linkage))
63482 + /*
63483 + * pool header is not an extra one. Push it onto free list
63484 + * using usage_linkage
63485 + */
63486 + list_add(&h->usage_linkage, &pool->free);
63487 + else {
63488 + /* remove pool header from pool's extra list and kfree it */
63489 + list_del(&h->extra_linkage);
63490 + kfree(h);
63491 + }
63492 +}
63493 +
63494 +/* add new object to the carry level list
63495 +
63496 + Carry level is FIFO most of the time, but not always. Complications arise
63497 + when make_space() function tries to go to the left neighbor and thus adds
63498 + carry node before existing nodes, and also, when updating delimiting keys
63499 + after moving data between two nodes, we want left node to be locked before
63500 + right node.
63501 +
63502 + Latter case is confusing at the first glance. Problem is that COP_UPDATE
63503 + operation that updates delimiting keys is sometimes called with two nodes
63504 + (when data are moved between two nodes) and sometimes with only one node
63505 + (when leftmost item is deleted in a node). In any case operation is
63506 + supplied with at least node whose left delimiting key is to be updated
63507 + (that is "right" node).
63508 +
63509 +*/
63510 +reiser4_pool_header *reiser4_add_obj(reiser4_pool * pool /* pool from which to
63511 + * allocate new object
63512 + */,
63513 + struct list_head *list /* list where to add
63514 + * object */,
63515 + pool_ordering order /* where to add */,
63516 + reiser4_pool_header * reference
63517 + /* after (or before) which existing object
63518 + to add */)
63519 +{
63520 + reiser4_pool_header *result;
63521 +
63522 + assert("nikita-972", pool != NULL);
63523 +
63524 + result = reiser4_pool_alloc(pool);
63525 + if (IS_ERR(result))
63526 + return result;
63527 +
63528 + assert("nikita-973", result != NULL);
63529 +
63530 + switch (order) {
63531 + case POOLO_BEFORE:
63532 + __list_add(&result->level_linkage,
63533 + reference->level_linkage.prev,
63534 + &reference->level_linkage);
63535 + break;
63536 + case POOLO_AFTER:
63537 + __list_add(&result->level_linkage,
63538 + &reference->level_linkage,
63539 + reference->level_linkage.next);
63540 + break;
63541 + case POOLO_LAST:
63542 + list_add_tail(&result->level_linkage, list);
63543 + break;
63544 + case POOLO_FIRST:
63545 + list_add(&result->level_linkage, list);
63546 + break;
63547 + default:
63548 + wrong_return_value("nikita-927", "order");
63549 + }
63550 + return result;
63551 +}
63552 +
63553 +/* Make Linus happy.
63554 + Local variables:
63555 + c-indentation-style: "K&R"
63556 + mode-name: "LC"
63557 + c-basic-offset: 8
63558 + tab-width: 8
63559 + fill-column: 120
63560 + End:
63561 +*/
63562 diff -urN linux-2.6.20.orig/fs/reiser4/pool.h linux-2.6.20/fs/reiser4/pool.h
63563 --- linux-2.6.20.orig/fs/reiser4/pool.h 1970-01-01 03:00:00.000000000 +0300
63564 +++ linux-2.6.20/fs/reiser4/pool.h 2007-05-06 14:50:43.863026968 +0400
63565 @@ -0,0 +1,55 @@
63566 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63567 +
63568 +/* Fast pool allocation */
63569 +
63570 +#ifndef __REISER4_POOL_H__
63571 +#define __REISER4_POOL_H__
63572 +
63573 +#include <linux/types.h>
63574 +
63575 +typedef struct reiser4_pool {
63576 + size_t obj_size;
63577 + int objs;
63578 + char *data;
63579 + struct list_head free;
63580 + struct list_head used;
63581 + struct list_head extra;
63582 +} reiser4_pool;
63583 +
63584 +typedef struct reiser4_pool_header {
63585 + /* object is either on free or "used" lists */
63586 + struct list_head usage_linkage;
63587 + struct list_head level_linkage;
63588 + struct list_head extra_linkage;
63589 +} reiser4_pool_header;
63590 +
63591 +typedef enum {
63592 + POOLO_BEFORE,
63593 + POOLO_AFTER,
63594 + POOLO_LAST,
63595 + POOLO_FIRST
63596 +} pool_ordering;
63597 +
63598 +/* pool manipulation functions */
63599 +
63600 +extern void reiser4_init_pool(reiser4_pool * pool, size_t obj_size,
63601 + int num_of_objs, char *data);
63602 +extern void reiser4_done_pool(reiser4_pool * pool);
63603 +extern void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h);
63604 +reiser4_pool_header *reiser4_add_obj(reiser4_pool * pool,
63605 + struct list_head * list,
63606 + pool_ordering order,
63607 + reiser4_pool_header * reference);
63608 +
63609 +/* __REISER4_POOL_H__ */
63610 +#endif
63611 +
63612 +/* Make Linus happy.
63613 + Local variables:
63614 + c-indentation-style: "K&R"
63615 + mode-name: "LC"
63616 + c-basic-offset: 8
63617 + tab-width: 8
63618 + fill-column: 120
63619 + End:
63620 +*/
63621 diff -urN linux-2.6.20.orig/fs/reiser4/readahead.c linux-2.6.20/fs/reiser4/readahead.c
63622 --- linux-2.6.20.orig/fs/reiser4/readahead.c 1970-01-01 03:00:00.000000000 +0300
63623 +++ linux-2.6.20/fs/reiser4/readahead.c 2007-05-06 14:50:43.867028218 +0400
63624 @@ -0,0 +1,138 @@
63625 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63626 + * reiser4/README */
63627 +
63628 +#include "forward.h"
63629 +#include "tree.h"
63630 +#include "tree_walk.h"
63631 +#include "super.h"
63632 +#include "inode.h"
63633 +#include "key.h"
63634 +#include "znode.h"
63635 +
63636 +#include <linux/swap.h> /* for totalram_pages */
63637 +
63638 +void reiser4_init_ra_info(ra_info_t * rai)
63639 +{
63640 + rai->key_to_stop = *reiser4_min_key();
63641 +}
63642 +
63643 +/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
63644 +static inline int ra_adjacent_only(int flags)
63645 +{
63646 + return flags & RA_ADJACENT_ONLY;
63647 +}
63648 +
63649 +/* this is used by formatted_readahead to decide whether a read for the right neighbor of a node is to be issued. It returns 1
63650 + if the right neighbor's first key is less than or equal to readahead's stop key */
63651 +static int should_readahead_neighbor(znode * node, ra_info_t * info)
63652 +{
63653 + int result;
63654 +
63655 + read_lock_dk(znode_get_tree(node));
63656 + result = keyle(znode_get_rd_key(node), &info->key_to_stop);
63657 + read_unlock_dk(znode_get_tree(node));
63658 + return result;
63659 +}
63660 +
63661 +#define LOW_MEM_PERCENTAGE (5)
63662 +
63663 +static int low_on_memory(void)
63664 +{
63665 + unsigned int freepages;
63666 +
63667 + freepages = nr_free_pages();
63668 + return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
63669 +}
63670 +
63671 +/* start read for @node and for a few of its right neighbors */
63672 +void formatted_readahead(znode * node, ra_info_t * info)
63673 +{
63674 + ra_params_t *ra_params;
63675 + znode *cur;
63676 + int i;
63677 + int grn_flags;
63678 + lock_handle next_lh;
63679 +
63680 + /* do nothing if node block number has not been assigned to node (which means it is still in cache). */
63681 + if (reiser4_blocknr_is_fake(znode_get_block(node)))
63682 + return;
63683 +
63684 + ra_params = get_current_super_ra_params();
63685 +
63686 + if (znode_page(node) == NULL)
63687 + jstartio(ZJNODE(node));
63688 +
63689 + if (znode_get_level(node) != LEAF_LEVEL)
63690 + return;
63691 +
63692 + /* don't waste memory for read-ahead when low on memory */
63693 + if (low_on_memory())
63694 + return;
63695 +
63696 + /* We can have locked nodes on upper tree levels, in this situation lock
63697 + priorities do not help to resolve deadlocks, we have to use TRY_LOCK
63698 + here. */
63699 + grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
63700 +
63701 + i = 0;
63702 + cur = zref(node);
63703 + init_lh(&next_lh);
63704 + while (i < ra_params->max) {
63705 + const reiser4_block_nr *nextblk;
63706 +
63707 + if (!should_readahead_neighbor(cur, info))
63708 + break;
63709 +
63710 + if (reiser4_get_right_neighbor
63711 + (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
63712 + break;
63713 +
63714 + nextblk = znode_get_block(next_lh.node);
63715 + if (reiser4_blocknr_is_fake(nextblk) ||
63716 + (ra_adjacent_only(ra_params->flags)
63717 + && *nextblk != *znode_get_block(cur) + 1)) {
63718 + break;
63719 + }
63720 +
63721 + zput(cur);
63722 + cur = zref(next_lh.node);
63723 + done_lh(&next_lh);
63724 + if (znode_page(cur) == NULL)
63725 + jstartio(ZJNODE(cur));
63726 + else
63727 + /* Do not scan read-ahead window if pages already
63728 + * allocated (and i/o already started). */
63729 + break;
63730 +
63731 + i++;
63732 + }
63733 + zput(cur);
63734 + done_lh(&next_lh);
63735 +}
63736 +
63737 +void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
63738 +{
63739 + reiser4_key *stop_key;
63740 +
63741 + assert("nikita-3542", dir != NULL);
63742 + assert("nikita-3543", tap != NULL);
63743 +
63744 + stop_key = &tap->ra_info.key_to_stop;
63745 + /* initialize readdir readahead information: include into readahead
63746 + * stat data of all files of the directory */
63747 + set_key_locality(stop_key, get_inode_oid(dir));
63748 + set_key_type(stop_key, KEY_SD_MINOR);
63749 + set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
63750 + set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
63751 + set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
63752 +}
63753 +
63754 +/*
63755 + Local variables:
63756 + c-indentation-style: "K&R"
63757 + mode-name: "LC"
63758 + c-basic-offset: 8
63759 + tab-width: 8
63760 + fill-column: 80
63761 + End:
63762 +*/
63763 diff -urN linux-2.6.20.orig/fs/reiser4/readahead.h linux-2.6.20/fs/reiser4/readahead.h
63764 --- linux-2.6.20.orig/fs/reiser4/readahead.h 1970-01-01 03:00:00.000000000 +0300
63765 +++ linux-2.6.20/fs/reiser4/readahead.h 2007-05-06 14:50:43.867028218 +0400
63766 @@ -0,0 +1,48 @@
63767 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63768 +
63769 +#ifndef __READAHEAD_H__
63770 +#define __READAHEAD_H__
63771 +
63772 +#include "key.h"
63773 +
63774 +typedef enum {
63775 + RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. Default is NO (not only adjacent) */
63776 +} ra_global_flags;
63777 +
63778 +/* reiser4 super block has a field of this type. It controls readahead during tree traversals */
63779 +typedef struct formatted_read_ahead_params {
63780 + unsigned long max; /* request not more than this amount of nodes. Default is totalram_pages / 4 */
63781 + int flags;
63782 +} ra_params_t;
63783 +
63784 +typedef struct {
63785 + reiser4_key key_to_stop;
63786 +} ra_info_t;
63787 +
63788 +void formatted_readahead(znode *, ra_info_t *);
63789 +void reiser4_init_ra_info(ra_info_t * rai);
63790 +
63791 +struct reiser4_file_ra_state {
63792 + loff_t start; /* Current window */
63793 + loff_t size;
63794 + loff_t next_size; /* Next window size */
63795 + loff_t ahead_start; /* Ahead window */
63796 + loff_t ahead_size;
63797 + loff_t max_window_size; /* Maximum readahead window */
63798 + loff_t slow_start; /* enlarging r/a size algorithm. */
63799 +};
63800 +
63801 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
63802 +
63803 +/* __READAHEAD_H__ */
63804 +#endif
63805 +
63806 +/*
63807 + Local variables:
63808 + c-indentation-style: "K&R"
63809 + mode-name: "LC"
63810 + c-basic-offset: 8
63811 + tab-width: 8
63812 + fill-column: 120
63813 + End:
63814 +*/
63815 diff -urN linux-2.6.20.orig/fs/reiser4/README linux-2.6.20/fs/reiser4/README
63816 --- linux-2.6.20.orig/fs/reiser4/README 1970-01-01 03:00:00.000000000 +0300
63817 +++ linux-2.6.20/fs/reiser4/README 2007-05-06 14:50:43.867028218 +0400
63818 @@ -0,0 +1,125 @@
63819 +[LICENSING]
63820 +
63821 +Reiser4 is hereby licensed under the GNU General
63822 +Public License version 2.
63823 +
63824 +Source code files that contain the phrase "licensing governed by
63825 +reiser4/README" are "governed files" throughout this file. Governed
63826 +files are licensed under the GPL. The portions of them owned by Hans
63827 +Reiser, or authorized to be licensed by him, have been in the past,
63828 +and likely will be in the future, licensed to other parties under
63829 +other licenses. If you add your code to governed files, and don't
63830 +want it to be owned by Hans Reiser, put your copyright label on that
63831 +code so the poor blight and his customers can keep things straight.
63832 +All portions of governed files not labeled otherwise are owned by Hans
63833 +Reiser, and by adding your code to it, widely distributing it to
63834 +others or sending us a patch, and leaving the sentence in stating that
63835 +licensing is governed by the statement in this file, you accept this.
63836 +It will be a kindness if you identify whether Hans Reiser is allowed
63837 +to license code labeled as owned by you on your behalf other than
63838 +under the GPL, because he wants to know if it is okay to do so and put
63839 +a check in the mail to you (for non-trivial improvements) when he
63840 +makes his next sale. He makes no guarantees as to the amount if any,
63841 +though he feels motivated to motivate contributors, and you can surely
63842 +discuss this with him before or after contributing. You have the
63843 +right to decline to allow him to license your code contribution other
63844 +than under the GPL.
63845 +
63846 +Further licensing options are available for commercial and/or other
63847 +interests directly from Hans Reiser: reiser@namesys.com. If you interpret
63848 +the GPL as not allowing those additional licensing options, you read
63849 +it wrongly, and Richard Stallman agrees with me, when carefully read
63850 +you can see that those restrictions on additional terms do not apply
63851 +to the owner of the copyright, and my interpretation of this shall
63852 +govern for this license.
63853 +
63854 +[END LICENSING]
63855 +
63856 +Reiser4 is a file system based on dancing tree algorithms, and is
63857 +described at http://www.namesys.com
63858 +
63859 +mkfs.reiser4 and other utilities are on our webpage or wherever your
63860 +Linux provider put them. You really want to be running the latest
63861 +version off the website if you use fsck.
63862 +
63863 +Yes, if you update your reiser4 kernel module you do have to
63864 +recompile your kernel, most of the time. The errors you get will be
63865 +quite cryptic if you forget to do so.
63866 +
63867 +Hideous Commercial Pitch: Spread your development costs across other OS
63868 +vendors. Select from the best in the world, not the best in your
63869 +building, by buying from third party OS component suppliers. Leverage
63870 +the software component development power of the internet. Be the most
63871 +aggressive in taking advantage of the commercial possibilities of
63872 +decentralized internet development, and add value through your branded
63873 +integration that you sell as an operating system. Let your competitors
63874 +be the ones to compete against the entire internet by themselves. Be
63875 +hip, get with the new economic trend, before your competitors do. Send
63876 +email to reiser@namesys.com
63877 +
63878 +Hans Reiser was the primary architect of Reiser4, but a whole team
63879 +chipped their ideas in. He invested everything he had into Namesys
63880 +for 5.5 dark years of no money before Reiser3 finally started to work well
63881 +enough to bring in money. He owns the copyright.
63882 +
63883 +DARPA was the primary sponsor of Reiser4. DARPA does not endorse
63884 +Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
63885 +opinion, unique in its willingness to invest into things more
63886 +theoretical than the VC community can readily understand, and more
63887 +longterm than allows them to be sure that they will be the ones to
63888 +extract the economic benefits from. DARPA also integrated us into a
63889 +security community that transformed our security worldview.
63890 +
63891 +Vladimir Saveliev is our lead programmer, with us from the beginning,
63892 +and he worked long hours writing the cleanest code. This is why he is
63893 +now the lead programmer after years of commitment to our work. He
63894 +always made the effort to be the best he could be, and to make his
63895 +code the best that it could be. What resulted was quite remarkable. I
63896 +don't think that money can ever motivate someone to work the way he
63897 +did, he is one of the most selfless men I know.
63898 +
63899 +Alexander Lyamin was our sysadmin, and helped to educate us in
63900 +security issues. Moscow State University and IMT were very generous
63901 +in the internet access they provided us, and in lots of other little
63902 +ways that a generous institution can be.
63903 +
63904 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
63905 +locking code, the block allocator, and finished the flushing code.
63906 +His code is always crystal clean and well structured.
63907 +
63908 +Nikita Danilov wrote the core of the balancing code, the core of the
63909 +plugins code, and the directory code. He worked a steady pace of long
63910 +hours that produced a whole lot of well abstracted code. He is our
63911 +senior computer scientist.
63912 +
63913 +Vladimir Demidov wrote the parser. Writing an in kernel parser is
63914 +something very few persons have the skills for, and it is thanks to
63915 +him that we can say that the parser is really not so big compared to
63916 +various bits of our other code, and making a parser work in the kernel
63917 +was not so complicated as everyone would imagine mainly because it was
63918 +him doing it...
63919 +
63920 +Joshua McDonald wrote the transaction manager, and the flush code.
63921 +The flush code unexpectedly turned out be extremely hairy for reasons
63922 +you can read about on our web page, and he did a great job on an
63923 +extremely difficult task.
63924 +
63925 +Nina Reiser handled our accounting, government relations, and much
63926 +more.
63927 +
63928 +Ramon Reiser developed our website.
63929 +
63930 +Beverly Palmer drew our graphics.
63931 +
63932 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
63933 +and worked with Umka on developing libreiser4 and userspace plugins.
63934 +
63935 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
63936 +userspace tools (reiser4progs).
63937 +
63938 +Oleg Drokin (aka Green) is the release manager who fixes everything.
63939 +It is so nice to have someone like that on the team. He (plus Chris
63940 +and Jeff) make it possible for the entire rest of the Namesys team to
63941 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
63942 +is just amazing to watch his talent for spotting bugs in action.
63943 +
63944 diff -urN linux-2.6.20.orig/fs/reiser4/reiser4.h linux-2.6.20/fs/reiser4/reiser4.h
63945 --- linux-2.6.20.orig/fs/reiser4/reiser4.h 1970-01-01 03:00:00.000000000 +0300
63946 +++ linux-2.6.20/fs/reiser4/reiser4.h 2007-05-06 14:50:43.867028218 +0400
63947 @@ -0,0 +1,269 @@
63948 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63949 + * reiser4/README */
63950 +
63951 +/* definitions of common constants used by reiser4 */
63952 +
63953 +#if !defined( __REISER4_H__ )
63954 +#define __REISER4_H__
63955 +
63956 +#include <asm/param.h> /* for HZ */
63957 +#include <linux/errno.h>
63958 +#include <linux/types.h>
63959 +#include <linux/fs.h>
63960 +#include <linux/hardirq.h>
63961 +#include <linux/sched.h>
63962 +
63963 +/*
63964 + * reiser4 compilation options.
63965 + */
63966 +
63967 +#if defined(CONFIG_REISER4_DEBUG)
63968 +/* turn on assertion checks */
63969 +#define REISER4_DEBUG (1)
63970 +#else
63971 +#define REISER4_DEBUG (0)
63972 +#endif
63973 +
63974 +#if defined(CONFIG_ZLIB_INFLATE)
63975 +/* turn on zlib */
63976 +#define REISER4_ZLIB (1)
63977 +#else
63978 +#define REISER4_ZLIB (0)
63979 +#endif
63980 +
63981 +#if defined(CONFIG_CRYPTO_SHA256)
63982 +#define REISER4_SHA256 (1)
63983 +#else
63984 +#define REISER4_SHA256 (0)
63985 +#endif
63986 +
63987 +/*
63988 + * Turn on large keys mode. In this mode (which is the default), reiser4 key has 4
63989 + * 8-byte components. In the old "small key" mode, it's 3 8-byte
63990 + * components. Additional component, referred to as "ordering" is used to
63991 + * order items from which given object is composed of. As such, ordering is
63992 + * placed between locality and objectid. For directory item ordering contains
63993 + * initial prefix of the file name this item is for. This sorts all directory
63994 + * items within given directory lexicographically (but see
63995 + * fibration.[ch]). For file body and stat-data, ordering contains initial
63996 + * prefix of the name file was initially created with. In the common case
63997 + * (files with single name) this allows to order file bodies and stat-datas in
63998 + * the same order as their respective directory entries, thus speeding up
63999 + * readdir.
64000 + *
64001 + * Note, that kernel can only mount file system with the same key size as one
64002 + * it is compiled for, so flipping this option may render your data
64003 + * inaccessible.
64004 + */
64005 +#define REISER4_LARGE_KEY (1)
64006 +/*#define REISER4_LARGE_KEY (0)*/
64007 +
64008 +/*#define GUESS_EXISTS 1*/
64009 +
64010 +/*
64011 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
64012 + * option
64013 + */
64014 +
64015 +extern const char *REISER4_SUPER_MAGIC_STRING;
64016 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
64017 + * beginning of device */
64018 +
64019 +/* here go tunable parameters that are not worth special entry in kernel
64020 + configuration */
64021 +
64022 +/* default number of slots in coord-by-key caches */
64023 +#define CBK_CACHE_SLOTS (16)
64024 +/* how many elementary tree operation to carry on the next level */
64025 +#define CARRIES_POOL_SIZE (5)
64026 +/* size of pool of preallocated nodes for carry process. */
64027 +#define NODES_LOCKED_POOL_SIZE (5)
64028 +
64029 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
64030 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
64031 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
64032 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
64033 +
64034 +/* we are supporting reservation of disk space on uid basis */
64035 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
64036 +/* we are supporting reservation of disk space for groups */
64037 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
64038 +/* we are supporting reservation of disk space for root */
64039 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
64040 +/* we use rapid flush mode, see flush.c for comments. */
64041 +#define REISER4_USE_RAPID_FLUSH (1)
64042 +
64043 +/*
64044 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
64045 + */
64046 +#define REISER4_USE_ENTD (1)
64047 +
64048 +/* key allocation is Plan-A */
64049 +#define REISER4_PLANA_KEY_ALLOCATION (1)
64050 +/* key allocation follows good old 3.x scheme */
64051 +#define REISER4_3_5_KEY_ALLOCATION (0)
64052 +
64053 +/* size of hash-table for znodes */
64054 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
64055 +
64056 +/* number of buckets in lnode hash-table */
64057 +#define LNODE_HTABLE_BUCKETS (1024)
64058 +
64059 +/* some ridiculously high maximal limit on height of znode tree. This
64060 + is used in declaration of various per level arrays and
64061 + to allocate statistics gathering array for per-level stats. */
64062 +#define REISER4_MAX_ZTREE_HEIGHT (8)
64063 +
64064 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
64065 +
64066 +/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then,
64067 + sequential search is on average faster than binary. This is because
64068 + of better optimization and because sequential search is more CPU
64069 + cache friendly. This number (25) was found by experiments on dual AMD
64070 + Athlon(tm), 1400MHz.
64071 +
64072 + NOTE: testing in kernel has shown that binary search is more effective than
64073 + implied by results of the user level benchmarking. Probably because in the
64074 + node keys are separated by other data. So value was adjusted after few
64075 + tests. More thorough tuning is needed.
64076 +*/
64077 +#define REISER4_SEQ_SEARCH_BREAK (3)
64078 +
64079 +/* don't allow tree to be lower than this */
64080 +#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
64081 +
64082 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
64083 + * available memory. */
64084 +/* Default value of maximal atom size. Can be overwritten by
64085 + tmgr.atom_max_size mount option. By default infinity. */
64086 +#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
64087 +
64088 +/* Default value of maximal atom age (in jiffies). After reaching this age
64089 + atom will be forced to commit, either synchronously or asynchronously. Can
64090 + be overwritten by tmgr.atom_max_age mount option. */
64091 +#define REISER4_ATOM_MAX_AGE (600 * HZ)
64092 +
64093 +/* sleeping period for ktxnmrgd */
64094 +#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
64095 +
64096 +/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
64097 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
64098 +
64099 +/* start complaining after that many restarts in coord_by_key().
64100 +
64101 + This either means incredibly heavy contention for this part of a tree, or
64102 + some corruption or bug.
64103 +*/
64104 +#define REISER4_CBK_ITERATIONS_LIMIT (100)
64105 +
64106 +/* return -EIO after that many iterations in coord_by_key().
64107 +
64108 + I have witnessed more than 800 iterations (in 30 thread test) before cbk
64109 + finished. --nikita
64110 +*/
64111 +#define REISER4_MAX_CBK_ITERATIONS 500000
64112 +
64113 +/* put a per-inode limit on maximal number of directory entries with identical
64114 + keys in hashed directory.
64115 +
64116 + Disable this until inheritance interfaces stabilize: we need some way to
64117 + set per directory limit.
64118 +*/
64119 +#define REISER4_USE_COLLISION_LIMIT (0)
64120 +
64121 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
64122 + will force them to be relocated. */
64123 +#define FLUSH_RELOCATE_THRESHOLD 64
64124 +/* If flush finds can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE
64125 + from the preceder it will relocate to that position. */
64126 +#define FLUSH_RELOCATE_DISTANCE 64
64127 +
64128 +/* If we have written this much or more blocks before encountering busy jnode
64129 + in flush list - abort flushing hoping that next time we get called
64130 + this jnode will be clean already, and we will save some seeks. */
64131 +#define FLUSH_WRITTEN_THRESHOLD 50
64132 +
64133 +/* The maximum number of nodes to scan left on a level during flush. */
64134 +#define FLUSH_SCAN_MAXNODES 10000
64135 +
64136 +/* per-atom limit of flushers */
64137 +#define ATOM_MAX_FLUSHERS (1)
64138 +
64139 +/* default tracing buffer size */
64140 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
64141 +
64142 +/* what size units of IO we would like cp, etc., to use, in writing to
64143 + reiser4. In bytes.
64144 +
64145 + Can be overwritten by optimal_io_size mount option.
64146 +*/
64147 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
64148 +
64149 +/* see comments in inode.c:oid_to_uino() */
64150 +#define REISER4_UINO_SHIFT (1 << 30)
64151 +
64152 +/* Mark function argument as unused to avoid compiler warnings. */
64153 +#define UNUSED_ARG __attribute__((unused))
64154 +
64155 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
64156 +#define NONNULL __attribute__((nonnull))
64157 +#else
64158 +#define NONNULL
64159 +#endif
64160 +
64161 +/* master super block offset in bytes.*/
64162 +#define REISER4_MASTER_OFFSET 65536
64163 +
64164 +/* size of VFS block */
64165 +#define VFS_BLKSIZE 512
64166 +/* number of bits in size of VFS block (512==2^9) */
64167 +#define VFS_BLKSIZE_BITS 9
64168 +
64169 +#define REISER4_I reiser4_inode_data
64170 +
64171 +/* implication */
64172 +#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
64173 +/* logical equivalence */
64174 +#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
64175 +
64176 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
64177 +
64178 +#define NOT_YET (0)
64179 +
64180 +/** Reiser4 specific error codes **/
64181 +
64182 +#define REISER4_ERROR_CODE_BASE 500
64183 +
64184 +/* Neighbor is not available (side neighbor or parent) */
64185 +#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
64186 +
64187 +/* Node was not found in cache */
64188 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
64189 +
64190 +/* node has no free space enough for completion of balancing operation */
64191 +#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
64192 +
64193 +/* repeat operation */
64194 +#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
64195 +
64196 +/* deadlock happens */
64197 +#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
64198 +
64199 +/* operation cannot be performed, because it would block and non-blocking mode
64200 + * was requested. */
64201 +#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
64202 +
64203 +/* wait some event (depends on context), then repeat */
64204 +#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
64205 +
64206 +#endif /* __REISER4_H__ */
64207 +
64208 +/* Make Linus happy.
64209 + Local variables:
64210 + c-indentation-style: "K&R"
64211 + mode-name: "LC"
64212 + c-basic-offset: 8
64213 + tab-width: 8
64214 + fill-column: 120
64215 + End:
64216 +*/
64217 diff -urN linux-2.6.20.orig/fs/reiser4/safe_link.c linux-2.6.20/fs/reiser4/safe_link.c
64218 --- linux-2.6.20.orig/fs/reiser4/safe_link.c 1970-01-01 03:00:00.000000000 +0300
64219 +++ linux-2.6.20/fs/reiser4/safe_link.c 2007-05-06 14:50:43.867028218 +0400
64220 @@ -0,0 +1,351 @@
64221 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
64222 + * reiser4/README */
64223 +
64224 +/* Safe-links. */
64225 +
64226 +/*
64227 + * Safe-links are used to maintain file system consistency during operations
64228 + * that spawn multiple transactions. For example:
64229 + *
64230 + * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files
64231 + * without user-visible names in the file system, but still opened by some
64232 + * active process. What happens here is that unlink proper (i.e., removal
64233 + * of the last file name) and file deletion (truncate of file body to zero
64234 + * and deletion of stat-data, that happens when last file descriptor is
64235 + * closed), may belong to different transactions T1 and T2. If a crash
64236 + * happens after T1 commit, but before T2 commit, on-disk file system has
64237 + * a file without name, that is, disk space leak.
64238 + *
64239 + * 2. Truncate. Truncate of large file may spawn multiple transactions. If
64240 + * system crashes while truncate was in-progress, file is left partially
64241 + * truncated, which violates "atomicity guarantees" of reiser4, viz. that
64242 + * every system call is atomic.
64243 + *
64244 + * Safe-links address both above cases. Basically, safe-link is a way post
64245 + * some operation to be executed during commit of some other transaction than
64246 + * current one. (Another way to look at the safe-link is to interpret it as a
64247 + * logical logging.)
64248 + *
64249 + * Specifically, at the beginning of unlink a safe-link is inserted in the
64250 + * tree. This safe-link is normally removed by file deletion code (during
64251 + * transaction T2 in the above terms). Truncate also inserts safe-link that is
64252 + * normally removed when truncate operation is finished.
64253 + *
64254 + * This means, that in the case of "clean umount" there are no safe-links in
64255 + * the tree. If safe-links are observed during mount, it means that (a) system
64256 + * was terminated abnormally, and (b) safe-link correspond to the "pending"
64257 + * (i.e., not finished) operations that were in-progress during system
64258 + * termination. Each safe-link records enough information to complete
64259 + * corresponding operation, and mount simply "replays" them (hence, the
64260 + * analogy with the logical logging).
64261 + *
64262 + * Safe-links are implemented as blackbox items (see
64263 + * plugin/item/blackbox.[ch]).
64264 + *
64265 + * For the reference: ext3 also has similar mechanism, it's called "an orphan
64266 + * list" there.
64267 + */
64268 +
64269 +#include "safe_link.h"
64270 +#include "debug.h"
64271 +#include "inode.h"
64272 +
64273 +#include "plugin/item/blackbox.h"
64274 +
64275 +#include <linux/fs.h>
64276 +
64277 +/*
64278 + * On-disk format of safe-link.
64279 + */
64280 +typedef struct safelink {
64281 + reiser4_key sdkey; /* key of stat-data for the file safe-link is
64282 + * for */
64283 + d64 size; /* size to which file should be truncated */
64284 +} safelink_t;
64285 +
64286 +/*
64287 + * locality where safe-link items are stored. Next to the objectid of root
64288 + * directory.
64289 + */
64290 +static oid_t safe_link_locality(reiser4_tree * tree)
64291 +{
64292 + return get_key_objectid(get_super_private(tree->super)->df_plug->
64293 + root_dir_key(tree->super)) + 1;
64294 +}
64295 +
64296 +/*
64297 + Construct a key for the safe-link. Key has the following format:
64298 +
64299 +| 60 | 4 | 64 | 4 | 60 | 64 |
64300 ++---------------+---+------------------+---+---------------+------------------+
64301 +| locality | 0 | 0 | 0 | objectid | link type |
64302 ++---------------+---+------------------+---+---------------+------------------+
64303 +| | | | |
64304 +| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
64305 +
64306 + This is in large keys format. In small keys format second 8 byte chunk is
64307 + out. Locality is a constant returned by safe_link_locality(). objectid is
64308 + an oid of a file on which operation protected by this safe-link is
64309 + performed. link-type is used to distinguish safe-links for different
64310 + operations.
64311 +
64312 + */
64313 +static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
64314 + reiser4_safe_link_t link, reiser4_key * key)
64315 +{
64316 + reiser4_key_init(key);
64317 + set_key_locality(key, safe_link_locality(tree));
64318 + set_key_objectid(key, oid);
64319 + set_key_offset(key, link);
64320 + return key;
64321 +}
64322 +
64323 +/*
64324 + * how much disk space is necessary to insert and remove (in the
64325 + * error-handling path) safe-link.
64326 + */
64327 +static __u64 safe_link_tograb(reiser4_tree * tree)
64328 +{
64329 + return
64330 + /* insert safe link */
64331 + estimate_one_insert_item(tree) +
64332 + /* remove safe link */
64333 + estimate_one_item_removal(tree) +
64334 + /* drill to the leaf level during insertion */
64335 + 1 + estimate_one_insert_item(tree) +
64336 + /*
64337 + * possible update of existing safe-link. Actually, if
64338 + * safe-link existed already (we failed to remove it), then no
64339 + * insertion is necessary, so this term is already "covered",
64340 + * but for simplicity let's left it.
64341 + */
64342 + 1;
64343 +}
64344 +
64345 +/*
64346 + * grab enough disk space to insert and remove (in the error-handling path)
64347 + * safe-link.
64348 + */
64349 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
64350 +{
64351 + int result;
64352 +
64353 + grab_space_enable();
64354 + /* The sbinfo->delete_mutex can be taken here.
64355 + * safe_link_release() should be called before leaving reiser4
64356 + * context. */
64357 + result =
64358 + reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
64359 + grab_space_enable();
64360 + return result;
64361 +}
64362 +
64363 +/*
64364 + * release unused disk space reserved by safe_link_grab().
64365 + */
64366 +void safe_link_release(reiser4_tree * tree)
64367 +{
64368 + reiser4_release_reserved(tree->super);
64369 +}
64370 +
64371 +/*
64372 + * insert into tree safe-link for operation @link on inode @inode.
64373 + */
64374 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
64375 +{
64376 + reiser4_key key;
64377 + safelink_t sl;
64378 + int length;
64379 + int result;
64380 + reiser4_tree *tree;
64381 +
64382 + build_sd_key(inode, &sl.sdkey);
64383 + length = sizeof sl.sdkey;
64384 +
64385 + if (link == SAFE_TRUNCATE) {
64386 + /*
64387 + * for truncate we have to store final file length also,
64388 + * expand item.
64389 + */
64390 + length += sizeof(sl.size);
64391 + put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
64392 + }
64393 + tree = reiser4_tree_by_inode(inode);
64394 + build_link_key(tree, get_inode_oid(inode), link, &key);
64395 +
64396 + result = store_black_box(tree, &key, &sl, length);
64397 + if (result == -EEXIST)
64398 + result = update_black_box(tree, &key, &sl, length);
64399 + return result;
64400 +}
64401 +
64402 +/*
64403 + * remove safe-link corresponding to the operation @link on inode @inode from
64404 + * the tree.
64405 + */
64406 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
64407 +{
64408 + reiser4_key key;
64409 +
64410 + return kill_black_box(tree, build_link_key(tree, oid, link, &key));
64411 +}
64412 +
64413 +/*
64414 + * in-memory structure to keep information extracted from safe-link. This is
64415 + * used to iterate over all safe-links.
64416 + */
64417 +typedef struct {
64418 + reiser4_tree *tree; /* internal tree */
64419 + reiser4_key key; /* safe-link key */
64420 + reiser4_key sdkey; /* key of object stat-data */
64421 + reiser4_safe_link_t link; /* safe-link type */
64422 + oid_t oid; /* object oid */
64423 + __u64 size; /* final size for truncate */
64424 +} safe_link_context;
64425 +
64426 +/*
64427 + * start iterating over all safe-links.
64428 + */
64429 +static void safe_link_iter_begin(reiser4_tree * tree, safe_link_context * ctx)
64430 +{
64431 + ctx->tree = tree;
64432 + reiser4_key_init(&ctx->key);
64433 + set_key_locality(&ctx->key, safe_link_locality(tree));
64434 + set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
64435 + set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
64436 +}
64437 +
64438 +/*
64439 + * return next safe-link.
64440 + */
64441 +static int safe_link_iter_next(safe_link_context * ctx)
64442 +{
64443 + int result;
64444 + safelink_t sl;
64445 +
64446 + result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
64447 + if (result == 0) {
64448 + ctx->oid = get_key_objectid(&ctx->key);
64449 + ctx->link = get_key_offset(&ctx->key);
64450 + ctx->sdkey = sl.sdkey;
64451 + if (ctx->link == SAFE_TRUNCATE)
64452 + ctx->size = le64_to_cpu(get_unaligned(&sl.size));
64453 + }
64454 + return result;
64455 +}
64456 +
64457 +/*
64458 + * check whether there are any more safe-links left in the tree.
64459 + */
64460 +static int safe_link_iter_finished(safe_link_context * ctx)
64461 +{
64462 + return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
64463 +}
64464 +
64465 +/*
64466 + * finish safe-link iteration.
64467 + */
64468 +static void safe_link_iter_end(safe_link_context * ctx)
64469 +{
64470 + /* nothing special */
64471 +}
64472 +
64473 +/*
64474 + * process single safe-link.
64475 + */
64476 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
64477 + reiser4_key * sdkey, oid_t oid, __u64 size)
64478 +{
64479 + struct inode *inode;
64480 + int result;
64481 +
64482 + /*
64483 + * obtain object inode by reiser4_iget(), then call object plugin
64484 + * ->safelink() method to do actual work, then delete safe-link on
64485 + * success.
64486 + */
64487 + inode = reiser4_iget(super, sdkey, 1);
64488 + if (!IS_ERR(inode)) {
64489 + file_plugin *fplug;
64490 +
64491 + fplug = inode_file_plugin(inode);
64492 + assert("nikita-3428", fplug != NULL);
64493 + assert("", oid == get_inode_oid(inode));
64494 + if (fplug->safelink != NULL) {
64495 + /* reiser4_txn_restart_current is not necessary because
64496 + * mounting is single-threaded. However, without it
64497 + * deadlock detection code will complain (see
64498 + * nikita-3361). */
64499 + reiser4_txn_restart_current();
64500 + result = fplug->safelink(inode, link, size);
64501 + } else {
64502 + warning("nikita-3430",
64503 + "Cannot handle safelink for %lli",
64504 + (unsigned long long)oid);
64505 + reiser4_print_key("key", sdkey);
64506 + result = 0;
64507 + }
64508 + if (result != 0) {
64509 + warning("nikita-3431",
64510 + "Error processing safelink for %lli: %i",
64511 + (unsigned long long)oid, result);
64512 + }
64513 + reiser4_iget_complete(inode);
64514 + iput(inode);
64515 + if (result == 0) {
64516 + result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT);
64517 + if (result == 0)
64518 + result =
64519 + safe_link_del(reiser4_get_tree(super), oid, link);
64520 + safe_link_release(reiser4_get_tree(super));
64521 + /*
64522 + * restart transaction: if there was large number of
64523 + * safe-links, their processing may fail to fit into
64524 + * single transaction.
64525 + */
64526 + if (result == 0)
64527 + reiser4_txn_restart_current();
64528 + }
64529 + } else
64530 + result = PTR_ERR(inode);
64531 + return result;
64532 +}
64533 +
64534 +/*
64535 + * iterate over all safe-links in the file-system processing them one by one.
64536 + */
64537 +int process_safelinks(struct super_block *super)
64538 +{
64539 + safe_link_context ctx;
64540 + int result;
64541 +
64542 + if (rofs_super(super))
64543 + /* do nothing on the read-only file system */
64544 + return 0;
64545 + safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
64546 + result = 0;
64547 + do {
64548 + result = safe_link_iter_next(&ctx);
64549 + if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
64550 + result = 0;
64551 + break;
64552 + }
64553 + if (result == 0)
64554 + result = process_safelink(super, ctx.link,
64555 + &ctx.sdkey, ctx.oid,
64556 + ctx.size);
64557 + } while (result == 0);
64558 + safe_link_iter_end(&ctx);
64559 + return result;
64560 +}
64561 +
64562 +/* Make Linus happy.
64563 + Local variables:
64564 + c-indentation-style: "K&R"
64565 + mode-name: "LC"
64566 + c-basic-offset: 8
64567 + tab-width: 8
64568 + fill-column: 120
64569 + scroll-step: 1
64570 + End:
64571 +*/
64572 diff -urN linux-2.6.20.orig/fs/reiser4/safe_link.h linux-2.6.20/fs/reiser4/safe_link.h
64573 --- linux-2.6.20.orig/fs/reiser4/safe_link.h 1970-01-01 03:00:00.000000000 +0300
64574 +++ linux-2.6.20/fs/reiser4/safe_link.h 2007-05-06 14:50:43.867028218 +0400
64575 @@ -0,0 +1,29 @@
64576 +/* Copyright 2003 by Hans Reiser, licensing governed by
64577 + * reiser4/README */
64578 +
64579 +/* Safe-links. See safe_link.c for details. */
64580 +
64581 +#if !defined( __FS_SAFE_LINK_H__ )
64582 +#define __FS_SAFE_LINK_H__
64583 +
64584 +#include "tree.h"
64585 +
64586 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
64587 +void safe_link_release(reiser4_tree * tree);
64588 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
64589 +int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
64590 +
64591 +int process_safelinks(struct super_block *super);
64592 +
64593 +/* __FS_SAFE_LINK_H__ */
64594 +#endif
64595 +
64596 +/* Make Linus happy.
64597 + Local variables:
64598 + c-indentation-style: "K&R"
64599 + mode-name: "LC"
64600 + c-basic-offset: 8
64601 + tab-width: 8
64602 + fill-column: 120
64603 + End:
64604 +*/
64605 diff -urN linux-2.6.20.orig/fs/reiser4/seal.c linux-2.6.20/fs/reiser4/seal.c
64606 --- linux-2.6.20.orig/fs/reiser4/seal.c 1970-01-01 03:00:00.000000000 +0300
64607 +++ linux-2.6.20/fs/reiser4/seal.c 2007-05-06 14:50:43.871029467 +0400
64608 @@ -0,0 +1,218 @@
64609 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64610 +/* Seals implementation. */
64611 +/* Seals are "weak" tree pointers. They are analogous to tree coords in
64612 + allowing to bypass tree traversal. But normal usage of coords implies that
64613 + node pointed to by coord is locked, whereas seals don't keep a lock (or
64614 + even a reference) to znode. Instead, each znode contains a version number,
64615 + increased on each znode modification. This version number is copied into a
64616 + seal when seal is created. Later, one can "validate" seal by calling
64617 + reiser4_seal_validate(). If znode is in cache and its version number is
64618 + still the same, seal is "pristine" and coord associated with it can be
64619 + re-used immediately.
64620 +
64621 + If, on the other hand, znode is out of cache, or it is obviously different
64622 + one from the znode seal was initially attached to (for example, it is on
64623 + the different level, or is being removed from the tree), seal is
64624 + irreparably invalid ("burned") and tree traversal has to be repeated.
64625 +
64626 + Otherwise, there is some hope, that while znode was modified (and seal was
64627 + "broken" as a result), key attached to the seal is still in the node. This
64628 + is checked by first comparing this key with delimiting keys of node and, if
64629 + key is ok, doing intra-node lookup.
64630 +
64631 + Znode version is maintained in the following way:
64632 +
64633 + there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
64634 + znode_epoch is incremented and its new value is stored in ->version field
64635 + of new znode. Whenever znode is dirtied (which means it was probably
64636 + modified), znode_epoch is also incremented and its new value is stored in
64637 + znode->version. This is done so, because just incrementing znode->version
64638 + on each update is not enough: it may so happen, that znode get deleted, new
64639 + znode is allocated for the same disk block and gets the same version
64640 + counter, tricking seal code into false positive.
64641 +*/
64642 +
64643 +#include "forward.h"
64644 +#include "debug.h"
64645 +#include "key.h"
64646 +#include "coord.h"
64647 +#include "seal.h"
64648 +#include "plugin/item/item.h"
64649 +#include "plugin/node/node.h"
64650 +#include "jnode.h"
64651 +#include "znode.h"
64652 +#include "super.h"
64653 +
64654 +static znode *seal_node(const seal_t * seal);
64655 +static int seal_matches(const seal_t * seal, znode * node);
64656 +
64657 +/* initialise seal. This can be called several times on the same seal. @coord
64658 + and @key can be NULL. */
64659 +void reiser4_seal_init(seal_t * seal /* seal to initialise */ ,
64660 + const coord_t * coord /* coord @seal will be
64661 + * attached to */ ,
64662 + const reiser4_key * key UNUSED_ARG /* key @seal will be
64663 + * attached to */ )
64664 +{
64665 + assert("nikita-1886", seal != NULL);
64666 + memset(seal, 0, sizeof *seal);
64667 + if (coord != NULL) {
64668 + znode *node;
64669 +
64670 + node = coord->node;
64671 + assert("nikita-1987", node != NULL);
64672 + spin_lock_znode(node);
64673 + seal->version = node->version;
64674 + assert("nikita-1988", seal->version != 0);
64675 + seal->block = *znode_get_block(node);
64676 +#if REISER4_DEBUG
64677 + seal->coord1 = *coord;
64678 + if (key != NULL)
64679 + seal->key = *key;
64680 +#endif
64681 + spin_unlock_znode(node);
64682 + }
64683 +}
64684 +
64685 +/* finish with seal */
64686 +void reiser4_seal_done(seal_t * seal /* seal to clear */ )
64687 +{
64688 + assert("nikita-1887", seal != NULL);
64689 + seal->version = 0;
64690 +}
64691 +
64692 +/* true if seal was initialised */
64693 +int reiser4_seal_is_set(const seal_t * seal /* seal to query */ )
64694 +{
64695 + assert("nikita-1890", seal != NULL);
64696 + return seal->version != 0;
64697 +}
64698 +
64699 +#if REISER4_DEBUG
64700 +/* helper function for reiser4_seal_validate(). It checks that item at @coord
64701 + * has expected key. This is to detect cases where node was modified but wasn't
64702 + * marked dirty. */
64703 +static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
64704 + const reiser4_key * k /* expected key */ )
64705 +{
64706 + reiser4_key ukey;
64707 +
64708 + return (coord->between != AT_UNIT) ||
64709 + /* FIXME-VS: we only can compare keys for items whose units
64710 + represent exactly one key */
64711 + ((coord_is_existing_unit(coord))
64712 + && (item_is_extent(coord)
64713 + || keyeq(k, unit_key_by_coord(coord, &ukey))))
64714 + || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
64715 + && keyge(k, unit_key_by_coord(coord, &ukey)));
64716 +}
64717 +#endif
64718 +
64719 +/* this is used by reiser4_seal_validate. It accepts return value of
64720 + * longterm_lock_znode and returns 1 if it can be interpreted as seal
64721 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
64722 + * reiser4_seal_validate returns -E_REPEAT and caller will call tree search.
64723 + * We cannot do this in longterm_lock_znode(), because sometimes we want to
64724 + * distinguish between -EINVAL and -E_REPEAT. */
64725 +static int should_repeat(int return_code)
64726 +{
64727 + return return_code == -EINVAL;
64728 +}
64729 +
64730 +/* (re-)validate seal.
64731 +
64732 + Checks whether seal is pristine, and try to revalidate it if possible.
64733 +
64734 + If seal was burned, or broken irreparably, return -E_REPEAT.
64735 +
64736 + NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if key we are
64737 + looking for is in range of keys covered by the sealed node, but item wasn't
64738 + found by node ->lookup() method. Alternative is to return -ENOENT in this
64739 + case, but this would complicate callers logic.
64740 +
64741 +*/
64742 +int reiser4_seal_validate(seal_t * seal /* seal to validate */,
64743 + coord_t * coord /* coord to validate against */,
64744 + const reiser4_key * key /* key to validate against */,
64745 + lock_handle * lh /* resulting lock handle */,
64746 + znode_lock_mode mode /* lock node */,
64747 + znode_lock_request request /* locking priority */)
64748 +{
64749 + znode *node;
64750 + int result;
64751 +
64752 + assert("nikita-1889", seal != NULL);
64753 + assert("nikita-1881", reiser4_seal_is_set(seal));
64754 + assert("nikita-1882", key != NULL);
64755 + assert("nikita-1883", coord != NULL);
64756 + assert("nikita-1884", lh != NULL);
64757 + assert("nikita-1885", keyeq(&seal->key, key));
64758 + assert("nikita-1989", coords_equal(&seal->coord1, coord));
64759 +
64760 + /* obtain znode by block number */
64761 + node = seal_node(seal);
64762 + if (node != NULL) {
64763 + /* znode was in cache, lock it */
64764 + result = longterm_lock_znode(lh, node, mode, request);
64765 + zput(node);
64766 + if (result == 0) {
64767 + if (seal_matches(seal, node)) {
64768 + /* if seal version and znode version
64769 + coincide */
64770 + ON_DEBUG(coord_update_v(coord));
64771 + assert("nikita-1990",
64772 + node == seal->coord1.node);
64773 + assert("nikita-1898",
64774 + WITH_DATA_RET(coord->node, 1,
64775 + check_seal_match(coord,
64776 + key)));
64777 + } else
64778 + result = RETERR(-E_REPEAT);
64779 + }
64780 + if (result != 0) {
64781 + if (should_repeat(result))
64782 + result = RETERR(-E_REPEAT);
64783 + /* unlock node on failure */
64784 + done_lh(lh);
64785 + }
64786 + } else {
64787 + /* znode wasn't in cache */
64788 + result = RETERR(-E_REPEAT);
64789 + }
64790 + return result;
64791 +}
64792 +
64793 +/* helpers functions */
64794 +
64795 +/* obtain reference to znode seal points to, if in cache */
64796 +static znode *seal_node(const seal_t * seal /* seal to query */ )
64797 +{
64798 + assert("nikita-1891", seal != NULL);
64799 + return zlook(current_tree, &seal->block);
64800 +}
64801 +
64802 +/* true if @seal version and @node version coincide */
64803 +static int seal_matches(const seal_t * seal /* seal to check */ ,
64804 + znode * node /* node to check */ )
64805 +{
64806 + int result;
64807 +
64808 + assert("nikita-1991", seal != NULL);
64809 + assert("nikita-1993", node != NULL);
64810 +
64811 + spin_lock_znode(node);
64812 + result = (seal->version == node->version);
64813 + spin_unlock_znode(node);
64814 + return result;
64815 +}
64816 +
64817 +/* Make Linus happy.
64818 + Local variables:
64819 + c-indentation-style: "K&R"
64820 + mode-name: "LC"
64821 + c-basic-offset: 8
64822 + tab-width: 8
64823 + fill-column: 120
64824 + scroll-step: 1
64825 + End:
64826 +*/
64827 diff -urN linux-2.6.20.orig/fs/reiser4/seal.h linux-2.6.20/fs/reiser4/seal.h
64828 --- linux-2.6.20.orig/fs/reiser4/seal.h 1970-01-01 03:00:00.000000000 +0300
64829 +++ linux-2.6.20/fs/reiser4/seal.h 2007-05-06 14:50:43.871029467 +0400
64830 @@ -0,0 +1,49 @@
64831 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64832 +
64833 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
64834 +
64835 +#ifndef __SEAL_H__
64836 +#define __SEAL_H__
64837 +
64838 +#include "forward.h"
64839 +#include "debug.h"
64840 +#include "dformat.h"
64841 +#include "key.h"
64842 +#include "coord.h"
64843 +
64844 +/* for __u?? types */
64845 +/*#include <linux/types.h>*/
64846 +
64847 +/* seal. See comment at the top of seal.c */
64848 +typedef struct seal_s {
64849 + /* version of znode recorder at the time of seal creation */
64850 + __u64 version;
64851 + /* block number of znode attached to this seal */
64852 + reiser4_block_nr block;
64853 +#if REISER4_DEBUG
64854 + /* coord this seal is attached to. For debugging. */
64855 + coord_t coord1;
64856 + /* key this seal is attached to. For debugging. */
64857 + reiser4_key key;
64858 +#endif
64859 +} seal_t;
64860 +
64861 +extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
64862 +extern void reiser4_seal_done(seal_t *);
64863 +extern int reiser4_seal_is_set(const seal_t *);
64864 +extern int reiser4_seal_validate(seal_t *, coord_t *,
64865 + const reiser4_key *, lock_handle *,
64866 + znode_lock_mode mode, znode_lock_request request);
64867 +
64868 +/* __SEAL_H__ */
64869 +#endif
64870 +
64871 +/* Make Linus happy.
64872 + Local variables:
64873 + c-indentation-style: "K&R"
64874 + mode-name: "LC"
64875 + c-basic-offset: 8
64876 + tab-width: 8
64877 + fill-column: 120
64878 + End:
64879 +*/
64880 diff -urN linux-2.6.20.orig/fs/reiser4/search.c linux-2.6.20/fs/reiser4/search.c
64881 --- linux-2.6.20.orig/fs/reiser4/search.c 1970-01-01 03:00:00.000000000 +0300
64882 +++ linux-2.6.20/fs/reiser4/search.c 2007-05-06 14:50:43.871029467 +0400
64883 @@ -0,0 +1,1611 @@
64884 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64885 + * reiser4/README */
64886 +
64887 +#include "forward.h"
64888 +#include "debug.h"
64889 +#include "dformat.h"
64890 +#include "key.h"
64891 +#include "coord.h"
64892 +#include "seal.h"
64893 +#include "plugin/item/item.h"
64894 +#include "plugin/node/node.h"
64895 +#include "plugin/plugin.h"
64896 +#include "jnode.h"
64897 +#include "znode.h"
64898 +#include "block_alloc.h"
64899 +#include "tree_walk.h"
64900 +#include "tree.h"
64901 +#include "reiser4.h"
64902 +#include "super.h"
64903 +#include "inode.h"
64904 +
64905 +#include <linux/slab.h>
64906 +
64907 +static const char *bias_name(lookup_bias bias);
64908 +
64909 +/* tree searching algorithm, intranode searching algorithms are in
64910 + plugin/node/ */
64911 +
64912 +/* tree lookup cache
64913 + *
64914 + * The coord by key cache consists of small list of recently accessed nodes
64915 + * maintained according to the LRU discipline. Before doing real top-to-down
64916 + * tree traversal this cache is scanned for nodes that can contain key
64917 + * requested.
64918 + *
64919 + * The efficiency of coord cache depends heavily on locality of reference for
64920 + * tree accesses. Our user level simulations show reasonably good hit ratios
64921 + * for coord cache under most loads so far.
64922 + */
64923 +
64924 +/* Initialise coord cache slot */
64925 +static void cbk_cache_init_slot(cbk_cache_slot *slot)
64926 +{
64927 + assert("nikita-345", slot != NULL);
64928 +
64929 + INIT_LIST_HEAD(&slot->lru);
64930 + slot->node = NULL;
64931 +}
64932 +
64933 +/* Initialize coord cache */
64934 +int cbk_cache_init(cbk_cache *cache /* cache to init */ )
64935 +{
64936 + int i;
64937 +
64938 + assert("nikita-346", cache != NULL);
64939 +
64940 + cache->slot =
64941 + kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
64942 + reiser4_ctx_gfp_mask_get());
64943 + if (cache->slot == NULL)
64944 + return RETERR(-ENOMEM);
64945 +
64946 + INIT_LIST_HEAD(&cache->lru);
64947 + for (i = 0; i < cache->nr_slots; ++i) {
64948 + cbk_cache_init_slot(cache->slot + i);
64949 + list_add_tail(&((cache->slot + i)->lru), &cache->lru);
64950 + }
64951 + rwlock_init(&cache->guard);
64952 + return 0;
64953 +}
64954 +
64955 +/* free cbk cache data */
64956 +void cbk_cache_done(cbk_cache * cache /* cache to release */ )
64957 +{
64958 + assert("nikita-2493", cache != NULL);
64959 + if (cache->slot != NULL) {
64960 + kfree(cache->slot);
64961 + cache->slot = NULL;
64962 + }
64963 +}
64964 +
64965 +/* macro to iterate over all cbk cache slots */
64966 +#define for_all_slots(cache, slot) \
64967 + for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
64968 + &(cache)->lru != &(slot)->lru; \
64969 + (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
64970 +
64971 +#if REISER4_DEBUG
64972 +/* this function assures that [cbk-cache-invariant] invariant holds */
64973 +static int cbk_cache_invariant(const cbk_cache *cache)
64974 +{
64975 + cbk_cache_slot *slot;
64976 + int result;
64977 + int unused;
64978 +
64979 + if (cache->nr_slots == 0)
64980 + return 1;
64981 +
64982 + assert("nikita-2469", cache != NULL);
64983 + unused = 0;
64984 + result = 1;
64985 + read_lock(&((cbk_cache *)cache)->guard);
64986 + for_all_slots(cache, slot) {
64987 + /* in LRU first go all `used' slots followed by `unused' */
64988 + if (unused && (slot->node != NULL))
64989 + result = 0;
64990 + if (slot->node == NULL)
64991 + unused = 1;
64992 + else {
64993 + cbk_cache_slot *scan;
64994 +
64995 + /* all cached nodes are different */
64996 + scan = slot;
64997 + while (result) {
64998 + scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
64999 + if (&cache->lru == &scan->lru)
65000 + break;
65001 + if (slot->node == scan->node)
65002 + result = 0;
65003 + }
65004 + }
65005 + if (!result)
65006 + break;
65007 + }
65008 + read_unlock(&((cbk_cache *)cache)->guard);
65009 + return result;
65010 +}
65011 +
65012 +#endif
65013 +
65014 +/* Remove references, if any, to @node from coord cache */
65015 +void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
65016 + reiser4_tree * tree /* tree to remove node from */ )
65017 +{
65018 + cbk_cache_slot *slot;
65019 + cbk_cache *cache;
65020 + int i;
65021 +
65022 + assert("nikita-350", node != NULL);
65023 + assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
65024 +
65025 + cache = &tree->cbk_cache;
65026 + assert("nikita-2470", cbk_cache_invariant(cache));
65027 +
65028 + write_lock(&(cache->guard));
65029 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65030 + if (slot->node == node) {
65031 + list_move_tail(&slot->lru, &cache->lru);
65032 + slot->node = NULL;
65033 + break;
65034 + }
65035 + }
65036 + write_unlock(&(cache->guard));
65037 + assert("nikita-2471", cbk_cache_invariant(cache));
65038 +}
65039 +
65040 +/* add to the cbk-cache in the "tree" information about "node". This
65041 + can actually be update of existing slot in a cache. */
65042 +static void cbk_cache_add(const znode *node /* node to add to the cache */ )
65043 +{
65044 + cbk_cache *cache;
65045 + cbk_cache_slot *slot;
65046 + int i;
65047 +
65048 + assert("nikita-352", node != NULL);
65049 +
65050 + cache = &znode_get_tree(node)->cbk_cache;
65051 + assert("nikita-2472", cbk_cache_invariant(cache));
65052 +
65053 + if (cache->nr_slots == 0)
65054 + return;
65055 +
65056 + write_lock(&(cache->guard));
65057 + /* find slot to update/add */
65058 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65059 + /* oops, this node is already in a cache */
65060 + if (slot->node == node)
65061 + break;
65062 + }
65063 + /* if all slots are used, reuse least recently used one */
65064 + if (i == cache->nr_slots) {
65065 + slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
65066 + slot->node = (znode *) node;
65067 + }
65068 + list_move(&slot->lru, &cache->lru);
65069 + write_unlock(&(cache->guard));
65070 + assert("nikita-2473", cbk_cache_invariant(cache));
65071 +}
65072 +
65073 +static int setup_delimiting_keys(cbk_handle * h);
65074 +static lookup_result coord_by_handle(cbk_handle * handle);
65075 +static lookup_result traverse_tree(cbk_handle * h);
65076 +static int cbk_cache_search(cbk_handle * h);
65077 +
65078 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
65079 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
65080 +
65081 +/* helper functions */
65082 +
65083 +static void update_stale_dk(reiser4_tree * tree, znode * node);
65084 +
65085 +/* release parent node during traversal */
65086 +static void put_parent(cbk_handle * h);
65087 +/* check consistency of fields */
65088 +static int sanity_check(cbk_handle * h);
65089 +/* release resources in handle */
65090 +static void hput(cbk_handle * h);
65091 +
65092 +static level_lookup_result search_to_left(cbk_handle * h);
65093 +
65094 +/* pack numerous (numberous I should say) arguments of coord_by_key() into
65095 + * cbk_handle */
65096 +static cbk_handle *cbk_pack(cbk_handle * handle,
65097 + reiser4_tree * tree,
65098 + const reiser4_key * key,
65099 + coord_t * coord,
65100 + lock_handle * active_lh,
65101 + lock_handle * parent_lh,
65102 + znode_lock_mode lock_mode,
65103 + lookup_bias bias,
65104 + tree_level lock_level,
65105 + tree_level stop_level,
65106 + __u32 flags, ra_info_t * info)
65107 +{
65108 + memset(handle, 0, sizeof *handle);
65109 +
65110 + handle->tree = tree;
65111 + handle->key = key;
65112 + handle->lock_mode = lock_mode;
65113 + handle->bias = bias;
65114 + handle->lock_level = lock_level;
65115 + handle->stop_level = stop_level;
65116 + handle->coord = coord;
65117 + /* set flags. See comment in tree.h:cbk_flags */
65118 + handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
65119 +
65120 + handle->active_lh = active_lh;
65121 + handle->parent_lh = parent_lh;
65122 + handle->ra_info = info;
65123 + return handle;
65124 +}
65125 +
65126 +/* main tree lookup procedure
65127 +
65128 + Check coord cache. If key we are looking for is not found there, call cbk()
65129 + to do real tree traversal.
65130 +
65131 + As we have extents on the twig level, @lock_level and @stop_level can
65132 + be different from LEAF_LEVEL and each other.
65133 +
65134 + Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
65135 + long term locks) while calling this.
65136 +*/
65137 +lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
65138 + * in. Usually this tree is
65139 + * part of file-system
65140 + * super-block */ ,
65141 + const reiser4_key * key /* key to look for */ ,
65142 + coord_t * coord /* where to store found
65143 + * position in a tree. Fields
65144 + * in "coord" are only valid if
65145 + * coord_by_key() returned
65146 + * "CBK_COORD_FOUND" */ ,
65147 + lock_handle * lh, /* resulting lock handle */
65148 + znode_lock_mode lock_mode /* type of lookup we
65149 + * want on node. Pass
65150 + * ZNODE_READ_LOCK here
65151 + * if you only want to
65152 + * read item found and
65153 + * ZNODE_WRITE_LOCK if
65154 + * you want to modify
65155 + * it */ ,
65156 + lookup_bias bias /* what to return if coord
65157 + * with exactly the @key is
65158 + * not in the tree */ ,
65159 + tree_level lock_level /* tree level where to start
65160 + * taking @lock type of
65161 + * locks */ ,
65162 + tree_level stop_level /* tree level to stop. Pass
65163 + * LEAF_LEVEL or TWIG_LEVEL
65164 + * here Item being looked
65165 + * for has to be between
65166 + * @lock_level and
65167 + * @stop_level, inclusive */ ,
65168 + __u32 flags /* search flags */ ,
65169 + ra_info_t *
65170 + info
65171 + /* information about desired tree traversal readahead */
65172 + )
65173 +{
65174 + cbk_handle handle;
65175 + lock_handle parent_lh;
65176 + lookup_result result;
65177 +
65178 + init_lh(lh);
65179 + init_lh(&parent_lh);
65180 +
65181 + assert("nikita-3023", reiser4_schedulable());
65182 +
65183 + assert("nikita-353", tree != NULL);
65184 + assert("nikita-354", key != NULL);
65185 + assert("nikita-355", coord != NULL);
65186 + assert("nikita-356", (bias == FIND_EXACT)
65187 + || (bias == FIND_MAX_NOT_MORE_THAN));
65188 + assert("nikita-357", stop_level >= LEAF_LEVEL);
65189 + /* no locks can be held during tree traversal */
65190 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65191 +
65192 + cbk_pack(&handle,
65193 + tree,
65194 + key,
65195 + coord,
65196 + lh,
65197 + &parent_lh,
65198 + lock_mode, bias, lock_level, stop_level, flags, info);
65199 +
65200 + result = coord_by_handle(&handle);
65201 + assert("nikita-3247",
65202 + ergo(!IS_CBKERR(result), coord->node == lh->node));
65203 + return result;
65204 +}
65205 +
65206 +/* like coord_by_key(), but starts traversal from vroot of @object rather than
65207 + * from tree root. */
65208 +lookup_result reiser4_object_lookup(struct inode * object,
65209 + const reiser4_key * key,
65210 + coord_t * coord,
65211 + lock_handle * lh,
65212 + znode_lock_mode lock_mode,
65213 + lookup_bias bias,
65214 + tree_level lock_level,
65215 + tree_level stop_level, __u32 flags,
65216 + ra_info_t * info)
65217 +{
65218 + cbk_handle handle;
65219 + lock_handle parent_lh;
65220 + lookup_result result;
65221 +
65222 + init_lh(lh);
65223 + init_lh(&parent_lh);
65224 +
65225 + assert("nikita-3023", reiser4_schedulable());
65226 +
65227 + assert("nikita-354", key != NULL);
65228 + assert("nikita-355", coord != NULL);
65229 + assert("nikita-356", (bias == FIND_EXACT)
65230 + || (bias == FIND_MAX_NOT_MORE_THAN));
65231 + assert("nikita-357", stop_level >= LEAF_LEVEL);
65232 + /* no locks can be held during tree search by key */
65233 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65234 +
65235 + cbk_pack(&handle,
65236 + object != NULL ? reiser4_tree_by_inode(object) : current_tree,
65237 + key,
65238 + coord,
65239 + lh,
65240 + &parent_lh,
65241 + lock_mode, bias, lock_level, stop_level, flags, info);
65242 + handle.object = object;
65243 +
65244 + result = coord_by_handle(&handle);
65245 + assert("nikita-3247",
65246 + ergo(!IS_CBKERR(result), coord->node == lh->node));
65247 + return result;
65248 +}
65249 +
65250 +/* lookup by cbk_handle. Common part of coord_by_key() and
65251 + reiser4_object_lookup(). */
65252 +static lookup_result coord_by_handle(cbk_handle * handle)
65253 +{
65254 + /*
65255 + * first check cbk_cache (which is look-aside cache for our tree) and
65256 + * of this fails, start traversal.
65257 + */
65258 + /* first check whether "key" is in cache of recent lookups. */
65259 + if (cbk_cache_search(handle) == 0)
65260 + return handle->result;
65261 + else
65262 + return traverse_tree(handle);
65263 +}
65264 +
65265 +/* Execute actor for each item (or unit, depending on @through_units_p),
65266 + starting from @coord, right-ward, until either:
65267 +
65268 + - end of the tree is reached
65269 + - unformatted node is met
65270 + - error occurred
65271 + - @actor returns 0 or less
65272 +
65273 + Error code, or last actor return value is returned.
65274 +
65275 + This is used by plugin/dir/hashe_dir.c:reiser4_find_entry() to move through
65276 + sequence of entries with identical keys and alikes.
65277 +*/
65278 +int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
65279 + coord_t * coord /* coord to start from */ ,
65280 + lock_handle * lh /* lock handle to start with and to
65281 + * update along the way */ ,
65282 + tree_iterate_actor_t actor /* function to call on each
65283 + * item/unit */ ,
65284 + void *arg /* argument to pass to @actor */ ,
65285 + znode_lock_mode mode /* lock mode on scanned nodes */ ,
65286 + int through_units_p /* call @actor on each item or on
65287 + * each unit */ )
65288 +{
65289 + int result;
65290 +
65291 + assert("nikita-1143", tree != NULL);
65292 + assert("nikita-1145", coord != NULL);
65293 + assert("nikita-1146", lh != NULL);
65294 + assert("nikita-1147", actor != NULL);
65295 +
65296 + result = zload(coord->node);
65297 + coord_clear_iplug(coord);
65298 + if (result != 0)
65299 + return result;
65300 + if (!coord_is_existing_unit(coord)) {
65301 + zrelse(coord->node);
65302 + return -ENOENT;
65303 + }
65304 + while ((result = actor(tree, coord, lh, arg)) > 0) {
65305 + /* move further */
65306 + if ((through_units_p && coord_next_unit(coord)) ||
65307 + (!through_units_p && coord_next_item(coord))) {
65308 + do {
65309 + lock_handle couple;
65310 +
65311 + /* move to the next node */
65312 + init_lh(&couple);
65313 + result =
65314 + reiser4_get_right_neighbor(&couple,
65315 + coord->node,
65316 + (int)mode,
65317 + GN_CAN_USE_UPPER_LEVELS);
65318 + zrelse(coord->node);
65319 + if (result == 0) {
65320 +
65321 + result = zload(couple.node);
65322 + if (result != 0) {
65323 + done_lh(&couple);
65324 + return result;
65325 + }
65326 +
65327 + coord_init_first_unit(coord,
65328 + couple.node);
65329 + done_lh(lh);
65330 + move_lh(lh, &couple);
65331 + } else
65332 + return result;
65333 + } while (node_is_empty(coord->node));
65334 + }
65335 +
65336 + assert("nikita-1149", coord_is_existing_unit(coord));
65337 + }
65338 + zrelse(coord->node);
65339 + return result;
65340 +}
65341 +
65342 +/* return locked uber znode for @tree */
65343 +int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
65344 + znode_lock_request pri, lock_handle * lh)
65345 +{
65346 + int result;
65347 +
65348 + result = longterm_lock_znode(lh, tree->uber, mode, pri);
65349 + return result;
65350 +}
65351 +
65352 +/* true if @key is strictly within @node
65353 +
65354 + we are looking for possibly non-unique key and it is item is at the edge of
65355 + @node. May be it is in the neighbor.
65356 +*/
65357 +static int znode_contains_key_strict(znode * node /* node to check key
65358 + * against */ ,
65359 + const reiser4_key *
65360 + key /* key to check */ ,
65361 + int isunique)
65362 +{
65363 + int answer;
65364 +
65365 + assert("nikita-1760", node != NULL);
65366 + assert("nikita-1722", key != NULL);
65367 +
65368 + if (keyge(key, &node->rd_key))
65369 + return 0;
65370 +
65371 + answer = keycmp(&node->ld_key, key);
65372 +
65373 + if (isunique)
65374 + return answer != GREATER_THAN;
65375 + else
65376 + return answer == LESS_THAN;
65377 +}
65378 +
65379 +/*
65380 + * Virtual Root (vroot) code.
65381 + *
65382 + * For given file system object (e.g., regular file or directory) let's
65383 + * define its "virtual root" as lowest in the tree (that is, furtherest
65384 + * from the tree root) node such that all body items of said object are
65385 + * located in a tree rooted at this node.
65386 + *
65387 + * Once vroot of object is found all tree lookups for items within body of
65388 + * this object ("object lookups") can be started from its vroot rather
65389 + * than from real root. This has following advantages:
65390 + *
65391 + * 1. amount of nodes traversed during lookup (and, hence, amount of
65392 + * key comparisons made) decreases, and
65393 + *
65394 + * 2. contention on tree root is decreased. This latter was actually
65395 + * motivating reason behind vroot, because spin lock of root node,
65396 + * which is taken when acquiring long-term lock on root node is the
65397 + * hottest lock in the reiser4.
65398 + *
65399 + * How to find vroot.
65400 + *
65401 + * When vroot of object F is not yet determined, all object lookups start
65402 + * from the root of the tree. At each tree level during traversal we have
65403 + * a node N such that a key we are looking for (which is the key inside
65404 + * object's body) is located within N. In function handle_vroot() called
65405 + * from cbk_level_lookup() we check whether N is possible vroot for
65406 + * F. Check is trivial---if neither leftmost nor rightmost item of N
65407 + * belongs to F (and we already have helpful ->owns_item() method of
65408 + * object plugin for this), then N is possible vroot of F. This, of
65409 + * course, relies on the assumption that each object occupies contiguous
65410 + * range of keys in the tree.
65411 + *
65412 + * Thus, traversing tree downward and checking each node as we go, we can
65413 + * find lowest such node, which, by definition, is vroot.
65414 + *
65415 + * How to track vroot.
65416 + *
65417 + * Nohow. If actual vroot changes, next object lookup will just restart
65418 + * from the actual tree root, refreshing object's vroot along the way.
65419 + *
65420 + */
65421 +
65422 +/*
65423 + * Check whether @node is possible vroot of @object.
65424 + */
65425 +static void handle_vroot(struct inode *object, znode * node)
65426 +{
65427 + file_plugin *fplug;
65428 + coord_t coord;
65429 +
65430 + fplug = inode_file_plugin(object);
65431 + assert("nikita-3353", fplug != NULL);
65432 + assert("nikita-3354", fplug->owns_item != NULL);
65433 +
65434 + if (unlikely(node_is_empty(node)))
65435 + return;
65436 +
65437 + coord_init_first_unit(&coord, node);
65438 + /*
65439 + * if leftmost item of @node belongs to @object, we cannot be sure
65440 + * that @node is vroot of @object, because, some items of @object are
65441 + * probably in the sub-tree rooted at the left neighbor of @node.
65442 + */
65443 + if (fplug->owns_item(object, &coord))
65444 + return;
65445 + coord_init_last_unit(&coord, node);
65446 + /* mutatis mutandis for the rightmost item */
65447 + if (fplug->owns_item(object, &coord))
65448 + return;
65449 + /* otherwise, @node is possible vroot of @object */
65450 + inode_set_vroot(object, node);
65451 +}
65452 +
65453 +/*
65454 + * helper function used by traverse tree to start tree traversal not from the
65455 + * tree root, but from @h->object's vroot, if possible.
65456 + */
65457 +static int prepare_object_lookup(cbk_handle * h)
65458 +{
65459 + znode *vroot;
65460 + int result;
65461 +
65462 + vroot = inode_get_vroot(h->object);
65463 + if (vroot == NULL) {
65464 + /*
65465 + * object doesn't have known vroot, start from real tree root.
65466 + */
65467 + return LOOKUP_CONT;
65468 + }
65469 +
65470 + h->level = znode_get_level(vroot);
65471 + /* take a long-term lock on vroot */
65472 + h->result = longterm_lock_znode(h->active_lh, vroot,
65473 + cbk_lock_mode(h->level, h),
65474 + ZNODE_LOCK_LOPRI);
65475 + result = LOOKUP_REST;
65476 + if (h->result == 0) {
65477 + int isunique;
65478 + int inside;
65479 +
65480 + isunique = h->flags & CBK_UNIQUE;
65481 + /* check that key is inside vroot */
65482 + read_lock_dk(h->tree);
65483 + inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
65484 + !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
65485 + read_unlock_dk(h->tree);
65486 + if (inside) {
65487 + h->result = zload(vroot);
65488 + if (h->result == 0) {
65489 + /* search for key in vroot. */
65490 + result = cbk_node_lookup(h);
65491 + zrelse(vroot); /*h->active_lh->node); */
65492 + if (h->active_lh->node != vroot) {
65493 + result = LOOKUP_REST;
65494 + } else if (result == LOOKUP_CONT) {
65495 + move_lh(h->parent_lh, h->active_lh);
65496 + h->flags &= ~CBK_DKSET;
65497 + }
65498 + }
65499 + }
65500 + }
65501 +
65502 + zput(vroot);
65503 +
65504 + if (IS_CBKERR(h->result) || result == LOOKUP_REST)
65505 + hput(h);
65506 + return result;
65507 +}
65508 +
65509 +/* main function that handles common parts of tree traversal: starting
65510 + (fake znode handling), restarts, error handling, completion */
65511 +static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
65512 +{
65513 + int done;
65514 + int iterations;
65515 + int vroot_used;
65516 +
65517 + assert("nikita-365", h != NULL);
65518 + assert("nikita-366", h->tree != NULL);
65519 + assert("nikita-367", h->key != NULL);
65520 + assert("nikita-368", h->coord != NULL);
65521 + assert("nikita-369", (h->bias == FIND_EXACT)
65522 + || (h->bias == FIND_MAX_NOT_MORE_THAN));
65523 + assert("nikita-370", h->stop_level >= LEAF_LEVEL);
65524 + assert("nikita-2949", !(h->flags & CBK_DKSET));
65525 + assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
65526 +
65527 + done = 0;
65528 + iterations = 0;
65529 + vroot_used = 0;
65530 +
65531 + /* loop for restarts */
65532 + restart:
65533 +
65534 + assert("nikita-3024", reiser4_schedulable());
65535 +
65536 + h->result = CBK_COORD_FOUND;
65537 + /* connect_znode() needs it */
65538 + h->ld_key = *reiser4_min_key();
65539 + h->rd_key = *reiser4_max_key();
65540 + h->flags |= CBK_DKSET;
65541 + h->error = NULL;
65542 +
65543 + if (!vroot_used && h->object != NULL) {
65544 + vroot_used = 1;
65545 + done = prepare_object_lookup(h);
65546 + if (done == LOOKUP_REST) {
65547 + goto restart;
65548 + } else if (done == LOOKUP_DONE)
65549 + return h->result;
65550 + }
65551 + if (h->parent_lh->node == NULL) {
65552 + done =
65553 + get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
65554 + h->parent_lh);
65555 +
65556 + assert("nikita-1637", done != -E_DEADLOCK);
65557 +
65558 + h->block = h->tree->root_block;
65559 + h->level = h->tree->height;
65560 + h->coord->node = h->parent_lh->node;
65561 +
65562 + if (done != 0)
65563 + return done;
65564 + }
65565 +
65566 + /* loop descending a tree */
65567 + while (!done) {
65568 +
65569 + if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
65570 + IS_POW(iterations))) {
65571 + warning("nikita-1481", "Too many iterations: %i",
65572 + iterations);
65573 + reiser4_print_key("key", h->key);
65574 + ++iterations;
65575 + } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
65576 + h->error =
65577 + "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
65578 + h->result = RETERR(-EIO);
65579 + break;
65580 + }
65581 + switch (cbk_level_lookup(h)) {
65582 + case LOOKUP_CONT:
65583 + move_lh(h->parent_lh, h->active_lh);
65584 + continue;
65585 + default:
65586 + wrong_return_value("nikita-372", "cbk_level");
65587 + case LOOKUP_DONE:
65588 + done = 1;
65589 + break;
65590 + case LOOKUP_REST:
65591 + hput(h);
65592 + /* deadlock avoidance is normal case. */
65593 + if (h->result != -E_DEADLOCK)
65594 + ++iterations;
65595 + reiser4_preempt_point();
65596 + goto restart;
65597 + }
65598 + }
65599 + /* that's all. The rest is error handling */
65600 + if (unlikely(h->error != NULL)) {
65601 + warning("nikita-373", "%s: level: %i, "
65602 + "lock_level: %i, stop_level: %i "
65603 + "lock_mode: %s, bias: %s",
65604 + h->error, h->level, h->lock_level, h->stop_level,
65605 + lock_mode_name(h->lock_mode), bias_name(h->bias));
65606 + reiser4_print_address("block", &h->block);
65607 + reiser4_print_key("key", h->key);
65608 + print_coord_content("coord", h->coord);
65609 + }
65610 + /* `unlikely' error case */
65611 + if (unlikely(IS_CBKERR(h->result))) {
65612 + /* failure. do cleanup */
65613 + hput(h);
65614 + } else {
65615 + assert("nikita-1605", WITH_DATA_RET
65616 + (h->coord->node, 1,
65617 + ergo((h->result == CBK_COORD_FOUND) &&
65618 + (h->bias == FIND_EXACT) &&
65619 + (!node_is_empty(h->coord->node)),
65620 + coord_is_existing_item(h->coord))));
65621 + }
65622 + return h->result;
65623 +}
65624 +
65625 +/* find delimiting keys of child
65626 +
65627 + Determine left and right delimiting keys for child pointed to by
65628 + @parent_coord.
65629 +
65630 +*/
65631 +static void find_child_delimiting_keys(znode * parent /* parent znode, passed
65632 + * locked */ ,
65633 + const coord_t * parent_coord /* coord where
65634 + * pointer to
65635 + * child is
65636 + * stored */ ,
65637 + reiser4_key * ld /* where to store left
65638 + * delimiting key */ ,
65639 + reiser4_key * rd /* where to store right
65640 + * delimiting key */ )
65641 +{
65642 + coord_t neighbor;
65643 +
65644 + assert("nikita-1484", parent != NULL);
65645 + assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
65646 +
65647 + coord_dup(&neighbor, parent_coord);
65648 +
65649 + if (neighbor.between == AT_UNIT)
65650 + /* imitate item ->lookup() behavior. */
65651 + neighbor.between = AFTER_UNIT;
65652 +
65653 + if (coord_set_to_left(&neighbor) == 0)
65654 + unit_key_by_coord(&neighbor, ld);
65655 + else {
65656 + assert("nikita-14851", 0);
65657 + *ld = *znode_get_ld_key(parent);
65658 + }
65659 +
65660 + coord_dup(&neighbor, parent_coord);
65661 + if (neighbor.between == AT_UNIT)
65662 + neighbor.between = AFTER_UNIT;
65663 + if (coord_set_to_right(&neighbor) == 0)
65664 + unit_key_by_coord(&neighbor, rd);
65665 + else
65666 + *rd = *znode_get_rd_key(parent);
65667 +}
65668 +
65669 +/*
65670 + * setup delimiting keys for a child
65671 + *
65672 + * @parent parent node
65673 + *
65674 + * @coord location in @parent where pointer to @child is
65675 + *
65676 + * @child child node
65677 + */
65678 +int
65679 +set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
65680 +{
65681 + reiser4_tree *tree;
65682 +
65683 + assert("nikita-2952",
65684 + znode_get_level(parent) == znode_get_level(coord->node));
65685 +
65686 + /* fast check without taking dk lock. This is safe, because
65687 + * JNODE_DKSET is never cleared once set. */
65688 + if (!ZF_ISSET(child, JNODE_DKSET)) {
65689 + tree = znode_get_tree(parent);
65690 + write_lock_dk(tree);
65691 + if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
65692 + find_child_delimiting_keys(parent, coord,
65693 + &child->ld_key,
65694 + &child->rd_key);
65695 + ON_DEBUG(child->ld_key_version =
65696 + atomic_inc_return(&delim_key_version);
65697 + child->rd_key_version =
65698 + atomic_inc_return(&delim_key_version););
65699 + ZF_SET(child, JNODE_DKSET);
65700 + }
65701 + write_unlock_dk(tree);
65702 + return 1;
65703 + }
65704 + return 0;
65705 +}
65706 +
65707 +/* Perform tree lookup at one level. This is called from cbk_traverse()
65708 + function that drives lookup through tree and calls cbk_node_lookup() to
65709 + perform lookup within one node.
65710 +
65711 + See comments in a code.
65712 +*/
65713 +static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
65714 +{
65715 + int ret;
65716 + int setdk;
65717 + int ldkeyset = 0;
65718 + reiser4_key ldkey;
65719 + reiser4_key key;
65720 + znode *active;
65721 +
65722 + assert("nikita-3025", reiser4_schedulable());
65723 +
65724 + /* acquire reference to @active node */
65725 + active =
65726 + zget(h->tree, &h->block, h->parent_lh->node, h->level,
65727 + reiser4_ctx_gfp_mask_get());
65728 +
65729 + if (IS_ERR(active)) {
65730 + h->result = PTR_ERR(active);
65731 + return LOOKUP_DONE;
65732 + }
65733 +
65734 + /* lock @active */
65735 + h->result = longterm_lock_znode(h->active_lh,
65736 + active,
65737 + cbk_lock_mode(h->level, h),
65738 + ZNODE_LOCK_LOPRI);
65739 + /* longterm_lock_znode() acquires additional reference to znode (which
65740 + will be later released by longterm_unlock_znode()). Release
65741 + reference acquired by zget().
65742 + */
65743 + zput(active);
65744 + if (unlikely(h->result != 0))
65745 + goto fail_or_restart;
65746 +
65747 + setdk = 0;
65748 + /* if @active is accessed for the first time, setup delimiting keys on
65749 + it. Delimiting keys are taken from the parent node. See
65750 + setup_delimiting_keys() for details.
65751 + */
65752 + if (h->flags & CBK_DKSET) {
65753 + setdk = setup_delimiting_keys(h);
65754 + h->flags &= ~CBK_DKSET;
65755 + } else {
65756 + znode *parent;
65757 +
65758 + parent = h->parent_lh->node;
65759 + h->result = zload(parent);
65760 + if (unlikely(h->result != 0))
65761 + goto fail_or_restart;
65762 +
65763 + if (!ZF_ISSET(active, JNODE_DKSET))
65764 + setdk = set_child_delimiting_keys(parent,
65765 + h->coord, active);
65766 + else {
65767 + read_lock_dk(h->tree);
65768 + find_child_delimiting_keys(parent, h->coord, &ldkey,
65769 + &key);
65770 + read_unlock_dk(h->tree);
65771 + ldkeyset = 1;
65772 + }
65773 + zrelse(parent);
65774 + }
65775 +
65776 + /* this is ugly kludge. Reminder: this is necessary, because
65777 + ->lookup() method returns coord with ->between field probably set
65778 + to something different from AT_UNIT.
65779 + */
65780 + h->coord->between = AT_UNIT;
65781 +
65782 + if (znode_just_created(active) && (h->coord->node != NULL)) {
65783 + write_lock_tree(h->tree);
65784 + /* if we are going to load znode right now, setup
65785 + ->in_parent: coord where pointer to this node is stored in
65786 + parent.
65787 + */
65788 + coord_to_parent_coord(h->coord, &active->in_parent);
65789 + write_unlock_tree(h->tree);
65790 + }
65791 +
65792 + /* check connectedness without holding tree lock---false negatives
65793 + * will be re-checked by connect_znode(), and false positives are
65794 + * impossible---@active cannot suddenly turn into unconnected
65795 + * state. */
65796 + if (!znode_is_connected(active)) {
65797 + h->result = connect_znode(h->coord, active);
65798 + if (unlikely(h->result != 0)) {
65799 + put_parent(h);
65800 + goto fail_or_restart;
65801 + }
65802 + }
65803 +
65804 + jload_prefetch(ZJNODE(active));
65805 +
65806 + if (setdk)
65807 + update_stale_dk(h->tree, active);
65808 +
65809 + /* put_parent() cannot be called earlier, because connect_znode()
65810 + assumes parent node is referenced; */
65811 + put_parent(h);
65812 +
65813 + if ((!znode_contains_key_lock(active, h->key) &&
65814 + (h->flags & CBK_TRUST_DK))
65815 + || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
65816 + /* 1. key was moved out of this node while this thread was
65817 + waiting for the lock. Restart. More elaborate solution is
65818 + to determine where key moved (to the left, or to the right)
65819 + and try to follow it through sibling pointers.
65820 +
65821 + 2. or, node itself is going to be removed from the
65822 + tree. Release lock and restart.
65823 + */
65824 + h->result = -E_REPEAT;
65825 + }
65826 + if (h->result == -E_REPEAT)
65827 + return LOOKUP_REST;
65828 +
65829 + h->result = zload_ra(active, h->ra_info);
65830 + if (h->result) {
65831 + return LOOKUP_DONE;
65832 + }
65833 +
65834 + /* sanity checks */
65835 + if (sanity_check(h)) {
65836 + zrelse(active);
65837 + return LOOKUP_DONE;
65838 + }
65839 +
65840 + /* check that key of leftmost item in the @active is the same as in
65841 + * its parent */
65842 + if (ldkeyset && !node_is_empty(active) &&
65843 + !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
65844 + warning("vs-3533", "Keys are inconsistent. Fsck?");
65845 + reiser4_print_key("inparent", &ldkey);
65846 + reiser4_print_key("inchild", &key);
65847 + h->result = RETERR(-EIO);
65848 + zrelse(active);
65849 + return LOOKUP_DONE;
65850 + }
65851 +
65852 + if (h->object != NULL)
65853 + handle_vroot(h->object, active);
65854 +
65855 + ret = cbk_node_lookup(h);
65856 +
65857 + /* h->active_lh->node might change, but active is yet to be zrelsed */
65858 + zrelse(active);
65859 +
65860 + return ret;
65861 +
65862 + fail_or_restart:
65863 + if (h->result == -E_DEADLOCK)
65864 + return LOOKUP_REST;
65865 + return LOOKUP_DONE;
65866 +}
65867 +
65868 +#if REISER4_DEBUG
65869 +/* check left and right delimiting keys of a znode */
65870 +void check_dkeys(znode * node)
65871 +{
65872 + znode *left;
65873 + znode *right;
65874 +
65875 + read_lock_tree(current_tree);
65876 + read_lock_dk(current_tree);
65877 +
65878 + assert("vs-1710", znode_is_any_locked(node));
65879 + assert("vs-1197",
65880 + !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
65881 +
65882 + left = node->left;
65883 + right = node->right;
65884 +
65885 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65886 + && left != NULL && ZF_ISSET(left, JNODE_DKSET))
65887 + /* check left neighbor. Note that left neighbor is not locked,
65888 + so it might get wrong delimiting keys therefore */
65889 + assert("vs-1198",
65890 + (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
65891 + || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
65892 +
65893 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65894 + && right != NULL && ZF_ISSET(right, JNODE_DKSET))
65895 + /* check right neighbor. Note that right neighbor is not
65896 + locked, so it might get wrong delimiting keys therefore */
65897 + assert("vs-1199",
65898 + (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
65899 + || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
65900 +
65901 + read_unlock_dk(current_tree);
65902 + read_unlock_tree(current_tree);
65903 +}
65904 +#endif
65905 +
65906 +/* true if @key is left delimiting key of @node */
65907 +static int key_is_ld(znode * node, const reiser4_key * key)
65908 +{
65909 + int ld;
65910 +
65911 + assert("nikita-1716", node != NULL);
65912 + assert("nikita-1758", key != NULL);
65913 +
65914 + read_lock_dk(znode_get_tree(node));
65915 + assert("nikita-1759", znode_contains_key(node, key));
65916 + ld = keyeq(znode_get_ld_key(node), key);
65917 + read_unlock_dk(znode_get_tree(node));
65918 + return ld;
65919 +}
65920 +
65921 +/* Process one node during tree traversal.
65922 +
65923 + This is called by cbk_level_lookup(). */
65924 +static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
65925 +{
65926 + /* node plugin of @active */
65927 + node_plugin *nplug;
65928 + /* item plugin of item that was found */
65929 + item_plugin *iplug;
65930 + /* search bias */
65931 + lookup_bias node_bias;
65932 + /* node we are operating upon */
65933 + znode *active;
65934 + /* tree we are searching in */
65935 + reiser4_tree *tree;
65936 + /* result */
65937 + int result;
65938 +
65939 + assert("nikita-379", h != NULL);
65940 +
65941 + active = h->active_lh->node;
65942 + tree = h->tree;
65943 +
65944 + nplug = active->nplug;
65945 + assert("nikita-380", nplug != NULL);
65946 +
65947 + ON_DEBUG(check_dkeys(active));
65948 +
65949 + /* return item from "active" node with maximal key not greater than
65950 + "key" */
65951 + node_bias = h->bias;
65952 + result = nplug->lookup(active, h->key, node_bias, h->coord);
65953 + if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
65954 + /* error occurred */
65955 + h->result = result;
65956 + return LOOKUP_DONE;
65957 + }
65958 + if (h->level == h->stop_level) {
65959 + /* welcome to the stop level */
65960 + assert("nikita-381", h->coord->node == active);
65961 + if (result == NS_FOUND) {
65962 + /* success of tree lookup */
65963 + if (!(h->flags & CBK_UNIQUE)
65964 + && key_is_ld(active, h->key)) {
65965 + return search_to_left(h);
65966 + } else
65967 + h->result = CBK_COORD_FOUND;
65968 + } else {
65969 + h->result = CBK_COORD_NOTFOUND;
65970 + }
65971 + if (!(h->flags & CBK_IN_CACHE))
65972 + cbk_cache_add(active);
65973 + return LOOKUP_DONE;
65974 + }
65975 +
65976 + if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
65977 + h->error = "not found on internal node";
65978 + h->result = result;
65979 + return LOOKUP_DONE;
65980 + }
65981 +
65982 + assert("vs-361", h->level > h->stop_level);
65983 +
65984 + if (handle_eottl(h, &result)) {
65985 + assert("vs-1674", (result == LOOKUP_DONE ||
65986 + result == LOOKUP_REST));
65987 + return result;
65988 + }
65989 +
65990 + /* go down to next level */
65991 + check_me("vs-12", zload(h->coord->node) == 0);
65992 + assert("nikita-2116", item_is_internal(h->coord));
65993 + iplug = item_plugin_by_coord(h->coord);
65994 + iplug->s.internal.down_link(h->coord, h->key, &h->block);
65995 + zrelse(h->coord->node);
65996 + --h->level;
65997 + return LOOKUP_CONT; /* continue */
65998 +}
65999 +
66000 +/* scan cbk_cache slots looking for a match for @h */
66001 +static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
66002 +{
66003 + level_lookup_result llr;
66004 + znode *node;
66005 + reiser4_tree *tree;
66006 + cbk_cache_slot *slot;
66007 + cbk_cache *cache;
66008 + tree_level level;
66009 + int isunique;
66010 + const reiser4_key *key;
66011 + int result;
66012 +
66013 + assert("nikita-1317", h != NULL);
66014 + assert("nikita-1315", h->tree != NULL);
66015 + assert("nikita-1316", h->key != NULL);
66016 +
66017 + tree = h->tree;
66018 + cache = &tree->cbk_cache;
66019 + if (cache->nr_slots == 0)
66020 + /* size of cbk cache was set to 0 by mount time option. */
66021 + return RETERR(-ENOENT);
66022 +
66023 + assert("nikita-2474", cbk_cache_invariant(cache));
66024 + node = NULL; /* to keep gcc happy */
66025 + level = h->level;
66026 + key = h->key;
66027 + isunique = h->flags & CBK_UNIQUE;
66028 + result = RETERR(-ENOENT);
66029 +
66030 + /*
66031 + * this is time-critical function and dragons had, hence, been settled
66032 + * here.
66033 + *
66034 + * Loop below scans cbk cache slots trying to find matching node with
66035 + * suitable range of delimiting keys and located at the h->level.
66036 + *
66037 + * Scan is done under cbk cache spin lock that protects slot->node
66038 + * pointers. If suitable node is found we want to pin it in
66039 + * memory. But slot->node can point to the node with x_count 0
66040 + * (unreferenced). Such node can be recycled at any moment, or can
66041 + * already be in the process of being recycled (within jput()).
66042 + *
66043 + * As we found node in the cbk cache, it means that jput() hasn't yet
66044 + * called cbk_cache_invalidate().
66045 + *
66046 + * We acquire reference to the node without holding tree lock, and
66047 + * later, check node's RIP bit. This avoids races with jput().
66048 + */
66049 +
66050 + rcu_read_lock();
66051 + read_lock(&((cbk_cache *)cache)->guard);
66052 +
66053 + slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
66054 + slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
66055 + BUG_ON(&slot->lru != &cache->lru);/*????*/
66056 + while (1) {
66057 +
66058 + slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
66059 +
66060 + if (&cache->lru != &slot->lru)
66061 + node = slot->node;
66062 + else
66063 + node = NULL;
66064 +
66065 + if (unlikely(node == NULL))
66066 + break;
66067 +
66068 + /*
66069 + * this is (hopefully) the only place in the code where we are
66070 + * working with delimiting keys without holding dk lock. This
66071 + * is fine here, because this is only "guess" anyway---keys
66072 + * are rechecked under dk lock below.
66073 + */
66074 + if (znode_get_level(node) == level &&
66075 + /* reiser4_min_key < key < reiser4_max_key */
66076 + znode_contains_key_strict(node, key, isunique)) {
66077 + zref(node);
66078 + result = 0;
66079 + spin_lock_prefetch(&tree->tree_lock);
66080 + break;
66081 + }
66082 + }
66083 + read_unlock(&((cbk_cache *)cache)->guard);
66084 +
66085 + assert("nikita-2475", cbk_cache_invariant(cache));
66086 +
66087 + if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
66088 + result = -ENOENT;
66089 +
66090 + rcu_read_unlock();
66091 +
66092 + if (result != 0) {
66093 + h->result = CBK_COORD_NOTFOUND;
66094 + return RETERR(-ENOENT);
66095 + }
66096 +
66097 + result =
66098 + longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
66099 + ZNODE_LOCK_LOPRI);
66100 + zput(node);
66101 + if (result != 0)
66102 + return result;
66103 + result = zload(node);
66104 + if (result != 0)
66105 + return result;
66106 +
66107 + /* recheck keys */
66108 + read_lock_dk(tree);
66109 + result = (znode_contains_key_strict(node, key, isunique) &&
66110 + !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66111 + read_unlock_dk(tree);
66112 + if (result) {
66113 + /* do lookup inside node */
66114 + llr = cbk_node_lookup(h);
66115 + /* if cbk_node_lookup() wandered to another node (due to eottl
66116 + or non-unique keys), adjust @node */
66117 + /*node = h->active_lh->node; */
66118 +
66119 + if (llr != LOOKUP_DONE) {
66120 + /* restart or continue on the next level */
66121 + result = RETERR(-ENOENT);
66122 + } else if (IS_CBKERR(h->result))
66123 + /* io or oom */
66124 + result = RETERR(-ENOENT);
66125 + else {
66126 + /* good. Either item found or definitely not found. */
66127 + result = 0;
66128 +
66129 + write_lock(&(cache->guard));
66130 + if (slot->node == h->active_lh->node /*node */ ) {
66131 + /* if this node is still in cbk cache---move
66132 + its slot to the head of the LRU list. */
66133 + list_move(&slot->lru, &cache->lru);
66134 + }
66135 + write_unlock(&(cache->guard));
66136 + }
66137 + } else {
66138 + /* race. While this thread was waiting for the lock, node was
66139 + rebalanced and item we are looking for, shifted out of it
66140 + (if it ever was here).
66141 +
66142 + Continuing scanning is almost hopeless: node key range was
66143 + moved to, is almost certainly at the beginning of the LRU
66144 + list at this time, because it's hot, but restarting
66145 + scanning from the very beginning is complex. Just return,
66146 + so that cbk() will be performed. This is not that
66147 + important, because such races should be rare. Are they?
66148 + */
66149 + result = RETERR(-ENOENT); /* -ERAUGHT */
66150 + }
66151 + zrelse(node);
66152 + assert("nikita-2476", cbk_cache_invariant(cache));
66153 + return result;
66154 +}
66155 +
66156 +/* look for item with given key in the coord cache
66157 +
66158 + This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
66159 + which is a small LRU list of znodes accessed lately. For each znode in
66160 + this list, it checks whether key we are looking for fits into key
66161 + range covered by this node. If so, and in addition, node lies at allowed
66162 + level (this is to handle extents on a twig level), node is locked, and
66163 + lookup inside it is performed.
66164 +
66165 + we need a measurement of the cost of this cache search compared to the cost
66166 + of coord_by_key.
66167 +
66168 +*/
66169 +static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
66170 +{
66171 + int result = 0;
66172 + tree_level level;
66173 +
66174 + /* add CBK_IN_CACHE to the handle flags. This means that
66175 + * cbk_node_lookup() assumes that cbk_cache is scanned and would add
66176 + * found node to the cache. */
66177 + h->flags |= CBK_IN_CACHE;
66178 + for (level = h->stop_level; level <= h->lock_level; ++level) {
66179 + h->level = level;
66180 + result = cbk_cache_scan_slots(h);
66181 + if (result != 0) {
66182 + done_lh(h->active_lh);
66183 + done_lh(h->parent_lh);
66184 + } else {
66185 + assert("nikita-1319", !IS_CBKERR(h->result));
66186 + break;
66187 + }
66188 + }
66189 + h->flags &= ~CBK_IN_CACHE;
66190 + return result;
66191 +}
66192 +
66193 +/* type of lock we want to obtain during tree traversal. On stop level
66194 + we want type of lock user asked for, on upper levels: read lock. */
66195 +znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
66196 +{
66197 + assert("nikita-382", h != NULL);
66198 +
66199 + return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
66200 +}
66201 +
66202 +/* update outdated delimiting keys */
66203 +static void stale_dk(reiser4_tree * tree, znode * node)
66204 +{
66205 + znode *right;
66206 +
66207 + read_lock_tree(tree);
66208 + write_lock_dk(tree);
66209 + right = node->right;
66210 +
66211 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66212 + right && ZF_ISSET(right, JNODE_DKSET) &&
66213 + !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
66214 + znode_set_rd_key(node, znode_get_ld_key(right));
66215 +
66216 + write_unlock_dk(tree);
66217 + read_unlock_tree(tree);
66218 +}
66219 +
66220 +/* check for possibly outdated delimiting keys, and update them if
66221 + * necessary. */
66222 +static void update_stale_dk(reiser4_tree * tree, znode * node)
66223 +{
66224 + znode *right;
66225 + reiser4_key rd;
66226 +
66227 + read_lock_tree(tree);
66228 + read_lock_dk(tree);
66229 + rd = *znode_get_rd_key(node);
66230 + right = node->right;
66231 + if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66232 + right && ZF_ISSET(right, JNODE_DKSET) &&
66233 + !keyeq(&rd, znode_get_ld_key(right)))) {
66234 + assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
66235 + read_unlock_dk(tree);
66236 + read_unlock_tree(tree);
66237 + stale_dk(tree, node);
66238 + return;
66239 + }
66240 + read_unlock_dk(tree);
66241 + read_unlock_tree(tree);
66242 +}
66243 +
66244 +/*
66245 + * handle searches for a non-unique key.
66246 + *
66247 + * Suppose that we are looking for an item with possibly non-unique key 100.
66248 + *
66249 + * Root node contains two pointers: one to a node with left delimiting key 0,
66250 + * and another to a node with left delimiting key 100. Item we are interested in
66251 + * may well happen in the sub-tree rooted at the first pointer.
66252 + *
66253 + * To handle this search_to_left() is called when search reaches stop
66254 + * level. This function checks it is _possible_ that item we are looking for
66255 + * is in the left neighbor (this can be done by comparing delimiting keys) and
66256 + * if so, tries to lock left neighbor (this is low priority lock, so it can
66257 + * deadlock, tree traversal is just restarted if it did) and then checks
66258 + * whether left neighbor actually contains items with our key.
66259 + *
66260 + * Note that this is done on the stop level only. It is possible to try such
66261 + * left-check on each level, but as duplicate keys are supposed to be rare
66262 + * (very unlikely that more than one node is completely filled with items with
66263 + * duplicate keys), it sis cheaper to scan to the left on the stop level once.
66264 + *
66265 + */
66266 +static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
66267 +{
66268 + level_lookup_result result;
66269 + coord_t *coord;
66270 + znode *node;
66271 + znode *neighbor;
66272 +
66273 + lock_handle lh;
66274 +
66275 + assert("nikita-1761", h != NULL);
66276 + assert("nikita-1762", h->level == h->stop_level);
66277 +
66278 + init_lh(&lh);
66279 + coord = h->coord;
66280 + node = h->active_lh->node;
66281 + assert("nikita-1763", coord_is_leftmost_unit(coord));
66282 +
66283 + h->result =
66284 + reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
66285 + GN_CAN_USE_UPPER_LEVELS);
66286 + neighbor = NULL;
66287 + switch (h->result) {
66288 + case -E_DEADLOCK:
66289 + result = LOOKUP_REST;
66290 + break;
66291 + case 0:{
66292 + node_plugin *nplug;
66293 + coord_t crd;
66294 + lookup_bias bias;
66295 +
66296 + neighbor = lh.node;
66297 + h->result = zload(neighbor);
66298 + if (h->result != 0) {
66299 + result = LOOKUP_DONE;
66300 + break;
66301 + }
66302 +
66303 + nplug = neighbor->nplug;
66304 +
66305 + coord_init_zero(&crd);
66306 + bias = h->bias;
66307 + h->bias = FIND_EXACT;
66308 + h->result =
66309 + nplug->lookup(neighbor, h->key, h->bias, &crd);
66310 + h->bias = bias;
66311 +
66312 + if (h->result == NS_NOT_FOUND) {
66313 + case -E_NO_NEIGHBOR:
66314 + h->result = CBK_COORD_FOUND;
66315 + if (!(h->flags & CBK_IN_CACHE))
66316 + cbk_cache_add(node);
66317 + default: /* some other error */
66318 + result = LOOKUP_DONE;
66319 + } else if (h->result == NS_FOUND) {
66320 + read_lock_dk(znode_get_tree(neighbor));
66321 + h->rd_key = *znode_get_ld_key(node);
66322 + leftmost_key_in_node(neighbor, &h->ld_key);
66323 + read_unlock_dk(znode_get_tree(neighbor));
66324 + h->flags |= CBK_DKSET;
66325 +
66326 + h->block = *znode_get_block(neighbor);
66327 + /* clear coord -> node so that cbk_level_lookup()
66328 + wouldn't overwrite parent hint in neighbor.
66329 +
66330 + Parent hint was set up by
66331 + reiser4_get_left_neighbor()
66332 + */
66333 + /* FIXME: why do we have to spinlock here? */
66334 + write_lock_tree(znode_get_tree(neighbor));
66335 + h->coord->node = NULL;
66336 + write_unlock_tree(znode_get_tree(neighbor));
66337 + result = LOOKUP_CONT;
66338 + } else {
66339 + result = LOOKUP_DONE;
66340 + }
66341 + if (neighbor != NULL)
66342 + zrelse(neighbor);
66343 + }
66344 + }
66345 + done_lh(&lh);
66346 + return result;
66347 +}
66348 +
66349 +/* debugging aid: return symbolic name of search bias */
66350 +static const char *bias_name(lookup_bias bias /* bias to get name of */ )
66351 +{
66352 + if (bias == FIND_EXACT)
66353 + return "exact";
66354 + else if (bias == FIND_MAX_NOT_MORE_THAN)
66355 + return "left-slant";
66356 +/* else if( bias == RIGHT_SLANT_BIAS ) */
66357 +/* return "right-bias"; */
66358 + else {
66359 + static char buf[30];
66360 +
66361 + sprintf(buf, "unknown: %i", bias);
66362 + return buf;
66363 + }
66364 +}
66365 +
66366 +#if REISER4_DEBUG
66367 +/* debugging aid: print human readable information about @p */
66368 +void print_coord_content(const char *prefix /* prefix to print */ ,
66369 + coord_t * p /* coord to print */ )
66370 +{
66371 + reiser4_key key;
66372 +
66373 + if (p == NULL) {
66374 + printk("%s: null\n", prefix);
66375 + return;
66376 + }
66377 + if ((p->node != NULL) && znode_is_loaded(p->node)
66378 + && coord_is_existing_item(p))
66379 + printk("%s: data: %p, length: %i\n", prefix,
66380 + item_body_by_coord(p), item_length_by_coord(p));
66381 + if (znode_is_loaded(p->node)) {
66382 + item_key_by_coord(p, &key);
66383 + reiser4_print_key(prefix, &key);
66384 + }
66385 +}
66386 +
66387 +/* debugging aid: print human readable information about @block */
66388 +void reiser4_print_address(const char *prefix /* prefix to print */ ,
66389 + const reiser4_block_nr * block /* block number to print */ )
66390 +{
66391 + printk("%s: %s\n", prefix, sprint_address(block));
66392 +}
66393 +#endif
66394 +
66395 +/* return string containing human readable representation of @block */
66396 +char *sprint_address(const reiser4_block_nr *
66397 + block /* block number to print */ )
66398 +{
66399 + static char address[30];
66400 +
66401 + if (block == NULL)
66402 + sprintf(address, "null");
66403 + else if (reiser4_blocknr_is_fake(block))
66404 + sprintf(address, "%llx", (unsigned long long)(*block));
66405 + else
66406 + sprintf(address, "%llu", (unsigned long long)(*block));
66407 + return address;
66408 +}
66409 +
66410 +/* release parent node during traversal */
66411 +static void put_parent(cbk_handle * h /* search handle */ )
66412 +{
66413 + assert("nikita-383", h != NULL);
66414 + if (h->parent_lh->node != NULL) {
66415 + longterm_unlock_znode(h->parent_lh);
66416 + }
66417 +}
66418 +
66419 +/* helper function used by coord_by_key(): release reference to parent znode
66420 + stored in handle before processing its child. */
66421 +static void hput(cbk_handle * h /* search handle */ )
66422 +{
66423 + assert("nikita-385", h != NULL);
66424 + done_lh(h->parent_lh);
66425 + done_lh(h->active_lh);
66426 +}
66427 +
66428 +/* Helper function used by cbk(): update delimiting keys of child node (stored
66429 + in h->active_lh->node) using key taken from parent on the parent level. */
66430 +static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
66431 +{
66432 + znode *active;
66433 + reiser4_tree *tree;
66434 +
66435 + assert("nikita-1088", h != NULL);
66436 +
66437 + active = h->active_lh->node;
66438 +
66439 + /* fast check without taking dk lock. This is safe, because
66440 + * JNODE_DKSET is never cleared once set. */
66441 + if (!ZF_ISSET(active, JNODE_DKSET)) {
66442 + tree = znode_get_tree(active);
66443 + write_lock_dk(tree);
66444 + if (!ZF_ISSET(active, JNODE_DKSET)) {
66445 + znode_set_ld_key(active, &h->ld_key);
66446 + znode_set_rd_key(active, &h->rd_key);
66447 + ZF_SET(active, JNODE_DKSET);
66448 + }
66449 + write_unlock_dk(tree);
66450 + return 1;
66451 + }
66452 + return 0;
66453 +}
66454 +
66455 +/* true if @block makes sense for the @tree. Used to detect corrupted node
66456 + * pointers */
66457 +static int
66458 +block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
66459 + reiser4_tree * tree /* tree to check against */ )
66460 +{
66461 + assert("nikita-757", block != NULL);
66462 + assert("nikita-758", tree != NULL);
66463 +
66464 + /* check to see if it exceeds the size of the device. */
66465 + return reiser4_blocknr_is_sane_for(tree->super, block);
66466 +}
66467 +
66468 +/* check consistency of fields */
66469 +static int sanity_check(cbk_handle * h /* search handle */ )
66470 +{
66471 + assert("nikita-384", h != NULL);
66472 +
66473 + if (h->level < h->stop_level) {
66474 + h->error = "Buried under leaves";
66475 + h->result = RETERR(-EIO);
66476 + return LOOKUP_DONE;
66477 + } else if (!block_nr_is_correct(&h->block, h->tree)) {
66478 + h->error = "bad block number";
66479 + h->result = RETERR(-EIO);
66480 + return LOOKUP_DONE;
66481 + } else
66482 + return 0;
66483 +}
66484 +
66485 +/* Make Linus happy.
66486 + Local variables:
66487 + c-indentation-style: "K&R"
66488 + mode-name: "LC"
66489 + c-basic-offset: 8
66490 + tab-width: 8
66491 + fill-column: 120
66492 + scroll-step: 1
66493 + End:
66494 +*/
66495 diff -urN linux-2.6.20.orig/fs/reiser4/status_flags.c linux-2.6.20/fs/reiser4/status_flags.c
66496 --- linux-2.6.20.orig/fs/reiser4/status_flags.c 1970-01-01 03:00:00.000000000 +0300
66497 +++ linux-2.6.20/fs/reiser4/status_flags.c 2007-05-06 14:50:43.875030717 +0400
66498 @@ -0,0 +1,175 @@
66499 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66500 + * reiser4/README */
66501 +
66502 +/* Functions that deal with reiser4 status block, query status and update it, if needed */
66503 +
66504 +#include <linux/bio.h>
66505 +#include <linux/highmem.h>
66506 +#include <linux/fs.h>
66507 +#include <linux/blkdev.h>
66508 +#include "debug.h"
66509 +#include "dformat.h"
66510 +#include "status_flags.h"
66511 +#include "super.h"
66512 +
66513 +/* This is our end I/O handler that marks page uptodate if IO was successful. It also
66514 + unconditionally unlocks the page, so we can see that io was done.
66515 + We do not free bio, because we hope to reuse that. */
66516 +static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
66517 + int err)
66518 +{
66519 + if (bio->bi_size)
66520 + return 1;
66521 +
66522 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
66523 + SetPageUptodate(bio->bi_io_vec->bv_page);
66524 + } else {
66525 + ClearPageUptodate(bio->bi_io_vec->bv_page);
66526 + SetPageError(bio->bi_io_vec->bv_page);
66527 + }
66528 + unlock_page(bio->bi_io_vec->bv_page);
66529 + return 0;
66530 +}
66531 +
66532 +/* Initialise status code. This is expected to be called from the disk format
66533 + code. block parameter is where status block lives. */
66534 +int reiser4_status_init(reiser4_block_nr block)
66535 +{
66536 + struct super_block *sb = reiser4_get_current_sb();
66537 + struct reiser4_status *statuspage;
66538 + struct bio *bio;
66539 + struct page *page;
66540 +
66541 + get_super_private(sb)->status_page = NULL;
66542 + get_super_private(sb)->status_bio = NULL;
66543 +
66544 + page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
66545 + if (!page)
66546 + return -ENOMEM;
66547 +
66548 + bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
66549 + if (bio != NULL) {
66550 + bio->bi_sector = block * (sb->s_blocksize >> 9);
66551 + bio->bi_bdev = sb->s_bdev;
66552 + bio->bi_io_vec[0].bv_page = page;
66553 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66554 + bio->bi_io_vec[0].bv_offset = 0;
66555 + bio->bi_vcnt = 1;
66556 + bio->bi_size = sb->s_blocksize;
66557 + bio->bi_end_io = reiser4_status_endio;
66558 + } else {
66559 + __free_pages(page, 0);
66560 + return -ENOMEM;
66561 + }
66562 + lock_page(page);
66563 + submit_bio(READ, bio);
66564 + blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
66565 + wait_on_page_locked(page);
66566 + if (!PageUptodate(page)) {
66567 + warning("green-2007",
66568 + "I/O error while tried to read status page\n");
66569 + return -EIO;
66570 + }
66571 +
66572 + statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
66573 + if (memcmp
66574 + (statuspage->magic, REISER4_STATUS_MAGIC,
66575 + sizeof(REISER4_STATUS_MAGIC))) {
66576 + /* Magic does not match. */
66577 + kunmap_atomic((char *)statuspage, KM_USER0);
66578 + warning("green-2008", "Wrong magic in status block\n");
66579 + __free_pages(page, 0);
66580 + bio_put(bio);
66581 + return -EINVAL;
66582 + }
66583 + kunmap_atomic((char *)statuspage, KM_USER0);
66584 +
66585 + get_super_private(sb)->status_page = page;
66586 + get_super_private(sb)->status_bio = bio;
66587 + return 0;
66588 +}
66589 +
66590 +/* Query the status of fs. Returns if the FS can be safely mounted.
66591 + Also if "status" and "extended" parameters are given, it will fill
66592 + actual parts of status from disk there. */
66593 +int reiser4_status_query(u64 * status, u64 * extended)
66594 +{
66595 + struct super_block *sb = reiser4_get_current_sb();
66596 + struct reiser4_status *statuspage;
66597 + int retval;
66598 +
66599 + if (!get_super_private(sb)->status_page) { // No status page?
66600 + return REISER4_STATUS_MOUNT_UNKNOWN;
66601 + }
66602 + statuspage = (struct reiser4_status *)
66603 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66604 + switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
66605 + case REISER4_STATUS_OK:
66606 + retval = REISER4_STATUS_MOUNT_OK;
66607 + break;
66608 + case REISER4_STATUS_CORRUPTED:
66609 + retval = REISER4_STATUS_MOUNT_WARN;
66610 + break;
66611 + case REISER4_STATUS_DAMAGED:
66612 + case REISER4_STATUS_DESTROYED:
66613 + case REISER4_STATUS_IOERROR:
66614 + retval = REISER4_STATUS_MOUNT_RO;
66615 + break;
66616 + default:
66617 + retval = REISER4_STATUS_MOUNT_UNKNOWN;
66618 + break;
66619 + }
66620 +
66621 + if (status)
66622 + *status = le64_to_cpu(get_unaligned(&statuspage->status));
66623 + if (extended)
66624 + *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
66625 +
66626 + kunmap_atomic((char *)statuspage, KM_USER0);
66627 + return retval;
66628 +}
66629 +
66630 +/* This function should be called when something bad happens (e.g. from reiser4_panic).
66631 + It fills the status structure and tries to push it to disk. */
66632 +int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
66633 +{
66634 + struct super_block *sb = reiser4_get_current_sb();
66635 + struct reiser4_status *statuspage;
66636 + struct bio *bio = get_super_private(sb)->status_bio;
66637 +
66638 + if (!get_super_private(sb)->status_page) { // No status page?
66639 + return -1;
66640 + }
66641 + statuspage = (struct reiser4_status *)
66642 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66643 +
66644 + put_unaligned(cpu_to_le64(status), &statuspage->status);
66645 + put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
66646 + strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
66647 +
66648 + kunmap_atomic((char *)statuspage, KM_USER0);
66649 + bio->bi_bdev = sb->s_bdev;
66650 + bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
66651 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66652 + bio->bi_io_vec[0].bv_offset = 0;
66653 + bio->bi_vcnt = 1;
66654 + bio->bi_size = sb->s_blocksize;
66655 + bio->bi_end_io = reiser4_status_endio;
66656 + lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
66657 + /* We can block now, but we have no other choice anyway */
66658 + submit_bio(WRITE, bio);
66659 + blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
66660 + return 0; // We do not wait for io to finish.
66661 +}
66662 +
66663 +/* Frees the page with status and bio structure. Should be called by disk format at umount time */
66664 +int reiser4_status_finish(void)
66665 +{
66666 + struct super_block *sb = reiser4_get_current_sb();
66667 +
66668 + __free_pages(get_super_private(sb)->status_page, 0);
66669 + get_super_private(sb)->status_page = NULL;
66670 + bio_put(get_super_private(sb)->status_bio);
66671 + get_super_private(sb)->status_bio = NULL;
66672 + return 0;
66673 +}
66674 diff -urN linux-2.6.20.orig/fs/reiser4/status_flags.h linux-2.6.20/fs/reiser4/status_flags.h
66675 --- linux-2.6.20.orig/fs/reiser4/status_flags.h 1970-01-01 03:00:00.000000000 +0300
66676 +++ linux-2.6.20/fs/reiser4/status_flags.h 2007-05-06 14:50:43.875030717 +0400
66677 @@ -0,0 +1,43 @@
66678 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66679 + * reiser4/README */
66680 +
66681 +/* Here we declare structures and flags that store reiser4 status on disk.
66682 + The status that helps us to find out if the filesystem is valid or if it
66683 + contains some critical, or not so critical errors */
66684 +
66685 +#if !defined( __REISER4_STATUS_FLAGS_H__ )
66686 +#define __REISER4_STATUS_FLAGS_H__
66687 +
66688 +#include "dformat.h"
66689 +/* These are major status flags */
66690 +#define REISER4_STATUS_OK 0
66691 +#define REISER4_STATUS_CORRUPTED 0x1
66692 +#define REISER4_STATUS_DAMAGED 0x2
66693 +#define REISER4_STATUS_DESTROYED 0x4
66694 +#define REISER4_STATUS_IOERROR 0x8
66695 +
66696 +/* Return values for reiser4_status_query() */
66697 +#define REISER4_STATUS_MOUNT_OK 0
66698 +#define REISER4_STATUS_MOUNT_WARN 1
66699 +#define REISER4_STATUS_MOUNT_RO 2
66700 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
66701 +
66702 +#define REISER4_TEXTERROR_LEN 256
66703 +
66704 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
66705 +/* We probably need to keep its size under sector size which is 512 bytes */
66706 +struct reiser4_status {
66707 + char magic[16];
66708 + d64 status; /* Current FS state */
66709 + d64 extended_status; /* Any additional info that might have sense in addition to "status". E.g.
66710 + last sector where io error happened if status is "io error encountered" */
66711 + d64 stacktrace[10]; /* Last ten functional calls made (addresses) */
66712 + char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
66713 +};
66714 +
66715 +int reiser4_status_init(reiser4_block_nr block);
66716 +int reiser4_status_query(u64 * status, u64 * extended);
66717 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
66718 +int reiser4_status_finish(void);
66719 +
66720 +#endif
66721 diff -urN linux-2.6.20.orig/fs/reiser4/super.c linux-2.6.20/fs/reiser4/super.c
66722 --- linux-2.6.20.orig/fs/reiser4/super.c 1970-01-01 03:00:00.000000000 +0300
66723 +++ linux-2.6.20/fs/reiser4/super.c 2007-05-06 14:50:43.875030717 +0400
66724 @@ -0,0 +1,316 @@
66725 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
66726 + * reiser4/README */
66727 +
66728 +/* Super-block manipulations. */
66729 +
66730 +#include "debug.h"
66731 +#include "dformat.h"
66732 +#include "key.h"
66733 +#include "plugin/security/perm.h"
66734 +#include "plugin/space/space_allocator.h"
66735 +#include "plugin/plugin.h"
66736 +#include "tree.h"
66737 +#include "vfs_ops.h"
66738 +#include "super.h"
66739 +#include "reiser4.h"
66740 +
66741 +#include <linux/types.h> /* for __u?? */
66742 +#include <linux/fs.h> /* for struct super_block */
66743 +
66744 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
66745 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
66746 +static __u64 reserved_for_root(const struct super_block *super);
66747 +
66748 +/* Return reiser4-specific part of super block */
66749 +reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
66750 + * queried */ )
66751 +{
66752 + return (reiser4_super_info_data *) super->s_fs_info;
66753 +}
66754 +
66755 +/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
66756 +long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
66757 +{
66758 + assert("nikita-448", super != NULL);
66759 + assert("nikita-449", is_reiser4_super(super));
66760 + return (long)REISER4_SUPER_MAGIC;
66761 +}
66762 +
66763 +/* functions to read/modify fields of reiser4_super_info_data */
66764 +
66765 +/* get number of blocks in file system */
66766 +__u64 reiser4_block_count(const struct super_block *super /* super block
66767 + queried */ )
66768 +{
66769 + assert("vs-494", super != NULL);
66770 + assert("vs-495", is_reiser4_super(super));
66771 + return get_super_private(super)->block_count;
66772 +}
66773 +
66774 +#if REISER4_DEBUG
66775 +/*
66776 + * number of blocks in the current file system
66777 + */
66778 +__u64 reiser4_current_block_count(void)
66779 +{
66780 + return get_current_super_private()->block_count;
66781 +}
66782 +#endif /* REISER4_DEBUG */
66783 +
66784 +/* set number of block in filesystem */
66785 +void reiser4_set_block_count(const struct super_block *super, __u64 nr)
66786 +{
66787 + assert("vs-501", super != NULL);
66788 + assert("vs-502", is_reiser4_super(super));
66789 + get_super_private(super)->block_count = nr;
66790 + /*
66791 + * For the proper calculation of the reserved space counter (5% of device
66792 + * block counter) we need a 64 bit division which is missing in Linux
66793 + * on i386 platform. Because we do not need a precise calculation here
66794 + * we can replace a div64 operation by this combination of
66795 + * multiplication and shift: 51. / (2^10) == .0498 .
66796 + * FIXME: this is a bug. It comes up only for very small filesystems
66797 + * which probably are never used. Nevertheless, it is a bug. Number of
66798 + * reserved blocks must be not less than maximal number of blocks which
66799 + * get grabbed with BA_RESERVED.
66800 + */
66801 + get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
66802 +}
66803 +
66804 +/* amount of blocks used (allocated for data) in file system */
66805 +__u64 reiser4_data_blocks(const struct super_block *super /* super block
66806 + queried */ )
66807 +{
66808 + assert("nikita-452", super != NULL);
66809 + assert("nikita-453", is_reiser4_super(super));
66810 + return get_super_private(super)->blocks_used;
66811 +}
66812 +
66813 +/* set number of block used in filesystem */
66814 +void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
66815 +{
66816 + assert("vs-503", super != NULL);
66817 + assert("vs-504", is_reiser4_super(super));
66818 + get_super_private(super)->blocks_used = nr;
66819 +}
66820 +
66821 +/* amount of free blocks in file system */
66822 +__u64 reiser4_free_blocks(const struct super_block *super /* super block
66823 + queried */ )
66824 +{
66825 + assert("nikita-454", super != NULL);
66826 + assert("nikita-455", is_reiser4_super(super));
66827 + return get_super_private(super)->blocks_free;
66828 +}
66829 +
66830 +/* set number of blocks free in filesystem */
66831 +void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
66832 +{
66833 + assert("vs-505", super != NULL);
66834 + assert("vs-506", is_reiser4_super(super));
66835 + get_super_private(super)->blocks_free = nr;
66836 +}
66837 +
66838 +/* get mkfs unique identifier */
66839 +__u32 reiser4_mkfs_id(const struct super_block *super /* super block
66840 + queried */ )
66841 +{
66842 + assert("vpf-221", super != NULL);
66843 + assert("vpf-222", is_reiser4_super(super));
66844 + return get_super_private(super)->mkfs_id;
66845 +}
66846 +
66847 +/* amount of free blocks in file system */
66848 +__u64 reiser4_free_committed_blocks(const struct super_block *super)
66849 +{
66850 + assert("vs-497", super != NULL);
66851 + assert("vs-498", is_reiser4_super(super));
66852 + return get_super_private(super)->blocks_free_committed;
66853 +}
66854 +
66855 +/* amount of blocks in the file system reserved for @uid and @gid */
66856 +long reiser4_reserved_blocks(const struct super_block *super /* super block
66857 + queried */ ,
66858 + uid_t uid /* user id */ ,
66859 + gid_t gid /* group id */ )
66860 +{
66861 + long reserved;
66862 +
66863 + assert("nikita-456", super != NULL);
66864 + assert("nikita-457", is_reiser4_super(super));
66865 +
66866 + reserved = 0;
66867 + if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
66868 + reserved += reserved_for_gid(super, gid);
66869 + if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
66870 + reserved += reserved_for_uid(super, uid);
66871 + if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
66872 + reserved += reserved_for_root(super);
66873 + return reserved;
66874 +}
66875 +
66876 +/* get/set value of/to grabbed blocks counter */
66877 +__u64 reiser4_grabbed_blocks(const struct super_block * super)
66878 +{
66879 + assert("zam-512", super != NULL);
66880 + assert("zam-513", is_reiser4_super(super));
66881 +
66882 + return get_super_private(super)->blocks_grabbed;
66883 +}
66884 +
66885 +__u64 reiser4_flush_reserved(const struct super_block * super)
66886 +{
66887 + assert("vpf-285", super != NULL);
66888 + assert("vpf-286", is_reiser4_super(super));
66889 +
66890 + return get_super_private(super)->blocks_flush_reserved;
66891 +}
66892 +
66893 +/* get/set value of/to counter of fake allocated formatted blocks */
66894 +__u64 reiser4_fake_allocated(const struct super_block * super)
66895 +{
66896 + assert("zam-516", super != NULL);
66897 + assert("zam-517", is_reiser4_super(super));
66898 +
66899 + return get_super_private(super)->blocks_fake_allocated;
66900 +}
66901 +
66902 +/* get/set value of/to counter of fake allocated unformatted blocks */
66903 +__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
66904 +{
66905 + assert("zam-516", super != NULL);
66906 + assert("zam-517", is_reiser4_super(super));
66907 +
66908 + return get_super_private(super)->blocks_fake_allocated_unformatted;
66909 +}
66910 +
66911 +/* get/set value of/to counter of clustered blocks */
66912 +__u64 reiser4_clustered_blocks(const struct super_block * super)
66913 +{
66914 + assert("edward-601", super != NULL);
66915 + assert("edward-602", is_reiser4_super(super));
66916 +
66917 + return get_super_private(super)->blocks_clustered;
66918 +}
66919 +
66920 +/* space allocator used by this file system */
66921 +reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
66922 + *super)
66923 +{
66924 + assert("nikita-1965", super != NULL);
66925 + assert("nikita-1966", is_reiser4_super(super));
66926 + return &get_super_private(super)->space_allocator;
66927 +}
66928 +
66929 +/* return fake inode used to bind formatted nodes in the page cache */
66930 +struct inode *reiser4_get_super_fake(const struct super_block *super /* super block
66931 + queried */ )
66932 +{
66933 + assert("nikita-1757", super != NULL);
66934 + return get_super_private(super)->fake;
66935 +}
66936 +
66937 +/* return fake inode used to bind copied on capture nodes in the page cache */
66938 +struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block
66939 + queried */ )
66940 +{
66941 + assert("nikita-1757", super != NULL);
66942 + return get_super_private(super)->cc;
66943 +}
66944 +
66945 + /* return fake inode used to bind bitmaps and journal heads */
66946 +struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
66947 +{
66948 + assert("nikita-17571", super != NULL);
66949 + return get_super_private(super)->bitmap;
66950 +}
66951 +
66952 +/* tree used by this file system */
66953 +reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block
66954 + * queried */ )
66955 +{
66956 + assert("nikita-460", super != NULL);
66957 + assert("nikita-461", is_reiser4_super(super));
66958 + return &get_super_private(super)->tree;
66959 +}
66960 +
66961 +/* Check that @super is (looks like) reiser4 super block. This is mainly for
66962 + use in assertions. */
66963 +int is_reiser4_super(const struct super_block *super /* super block
66964 + * queried */ )
66965 +{
66966 + return
66967 + super != NULL &&
66968 + get_super_private(super) != NULL &&
66969 + super->s_op == &(get_super_private(super)->ops.super);
66970 +}
66971 +
66972 +int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
66973 +{
66974 + return test_bit((int)f, &get_super_private(super)->fs_flags);
66975 +}
66976 +
66977 +/* amount of blocks reserved for given group in file system */
66978 +static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
66979 + * block
66980 + * queried */ ,
66981 + gid_t gid UNUSED_ARG /* group id */ )
66982 +{
66983 + return 0;
66984 +}
66985 +
66986 +/* amount of blocks reserved for given user in file system */
66987 +static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
66988 + block
66989 + queried */ ,
66990 + uid_t uid UNUSED_ARG /* user id */ )
66991 +{
66992 + return 0;
66993 +}
66994 +
66995 +/* amount of blocks reserved for super user in file system */
66996 +static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
66997 + block
66998 + queried */ )
66999 +{
67000 + return 0;
67001 +}
67002 +
67003 +/*
67004 + * true if block number @blk makes sense for the file system at @super.
67005 + */
67006 +int
67007 +reiser4_blocknr_is_sane_for(const struct super_block *super,
67008 + const reiser4_block_nr * blk)
67009 +{
67010 + reiser4_super_info_data *sbinfo;
67011 +
67012 + assert("nikita-2957", super != NULL);
67013 + assert("nikita-2958", blk != NULL);
67014 +
67015 + if (reiser4_blocknr_is_fake(blk))
67016 + return 1;
67017 +
67018 + sbinfo = get_super_private(super);
67019 + return *blk < sbinfo->block_count;
67020 +}
67021 +
67022 +#if REISER4_DEBUG
67023 +/*
67024 + * true, if block number @blk makes sense for the current file system
67025 + */
67026 +int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
67027 +{
67028 + return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
67029 +}
67030 +#endif /* REISER4_DEBUG */
67031 +
67032 +/* Make Linus happy.
67033 + Local variables:
67034 + c-indentation-style: "K&R"
67035 + mode-name: "LC"
67036 + c-basic-offset: 8
67037 + tab-width: 8
67038 + fill-column: 120
67039 + End:
67040 +*/
67041 diff -urN linux-2.6.20.orig/fs/reiser4/super.h linux-2.6.20/fs/reiser4/super.h
67042 --- linux-2.6.20.orig/fs/reiser4/super.h 1970-01-01 03:00:00.000000000 +0300
67043 +++ linux-2.6.20/fs/reiser4/super.h 2007-05-06 14:50:43.875030717 +0400
67044 @@ -0,0 +1,464 @@
67045 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
67046 + * reiser4/README */
67047 +
67048 +/* Super-block functions. See super.c for details. */
67049 +
67050 +#if !defined( __REISER4_SUPER_H__ )
67051 +#define __REISER4_SUPER_H__
67052 +
67053 +#include "tree.h"
67054 +#include "entd.h"
67055 +#include "wander.h"
67056 +#include "fsdata.h"
67057 +#include "plugin/object.h"
67058 +#include "plugin/space/space_allocator.h"
67059 +
67060 +/*
67061 + * Flush algorithms parameters.
67062 + */
67063 +typedef struct {
67064 + unsigned relocate_threshold;
67065 + unsigned relocate_distance;
67066 + unsigned written_threshold;
67067 + unsigned scan_maxnodes;
67068 +} flush_params;
67069 +
67070 +typedef enum {
67071 + /*
67072 + * True if this file system doesn't support hard-links (multiple names)
67073 + * for directories: this is default UNIX behavior.
67074 + *
67075 + * If hard-links on directories are not allowed, file system is Acyclic
67076 + * Directed Graph (modulo dot, and dotdot, of course).
67077 + *
67078 + * This is used by reiser4_link().
67079 + */
67080 + REISER4_ADG = 0,
67081 + /*
67082 + * set if all nodes in internal tree have the same node layout plugin.
67083 + * If so, znode_guess_plugin() will return tree->node_plugin instead
67084 + * of guessing plugin by plugin id stored in the node.
67085 + */
67086 + REISER4_ONE_NODE_PLUGIN = 1,
67087 + /* if set, bsd gid assignment is supported. */
67088 + REISER4_BSD_GID = 2,
67089 + /* [mac]_time are 32 bit in inode */
67090 + REISER4_32_BIT_TIMES = 3,
67091 + /* load all bitmap blocks at mount time */
67092 + REISER4_DONT_LOAD_BITMAP = 5,
67093 + /* enforce atomicity during write(2) */
67094 + REISER4_ATOMIC_WRITE = 6,
67095 + /* don't use write barriers in the log writer code. */
67096 + REISER4_NO_WRITE_BARRIER = 7
67097 +} reiser4_fs_flag;
67098 +
67099 +/*
67100 + * VFS related operation vectors.
67101 + */
67102 +typedef struct object_ops {
67103 + struct super_operations super;
67104 + struct dentry_operations dentry;
67105 + struct export_operations export;
67106 +} object_ops;
67107 +
67108 +/* reiser4-specific part of super block
67109 +
67110 + Locking
67111 +
67112 + Fields immutable after mount:
67113 +
67114 + ->oid*
67115 + ->space*
67116 + ->default_[ug]id
67117 + ->mkfs_id
67118 + ->trace_flags
67119 + ->debug_flags
67120 + ->fs_flags
67121 + ->df_plug
67122 + ->optimal_io_size
67123 + ->plug
67124 + ->flush
67125 + ->u (bad name)
67126 + ->txnmgr
67127 + ->ra_params
67128 + ->fsuid
67129 + ->journal_header
67130 + ->journal_footer
67131 +
67132 + Fields protected by ->lnode_guard
67133 +
67134 + ->lnode_htable
67135 +
67136 + Fields protected by per-super block spin lock
67137 +
67138 + ->block_count
67139 + ->blocks_used
67140 + ->blocks_free
67141 + ->blocks_free_committed
67142 + ->blocks_grabbed
67143 + ->blocks_fake_allocated_unformatted
67144 + ->blocks_fake_allocated
67145 + ->blocks_flush_reserved
67146 + ->eflushed
67147 + ->blocknr_hint_default
67148 +
67149 + After journal replaying during mount,
67150 +
67151 + ->last_committed_tx
67152 +
67153 + is protected by ->tmgr.commit_mutex
67154 +
67155 + Invariants involving this data-type:
67156 +
67157 + [sb-block-counts]
67158 + [sb-grabbed]
67159 + [sb-fake-allocated]
67160 +*/
67161 +struct reiser4_super_info_data {
67162 + /*
67163 + * guard spinlock which protects reiser4 super block fields (currently
67164 + * blocks_free, blocks_free_committed)
67165 + */
67166 + spinlock_t guard;
67167 +
67168 + /* next oid that will be returned by oid_allocate() */
67169 + oid_t next_to_use;
67170 + /* total number of used oids */
67171 + oid_t oids_in_use;
67172 +
67173 + /* space manager plugin */
67174 + reiser4_space_allocator space_allocator;
67175 +
67176 + /* reiser4 internal tree */
67177 + reiser4_tree tree;
67178 +
67179 + /*
67180 + * default user id used for light-weight files without their own
67181 + * stat-data.
67182 + */
67183 + uid_t default_uid;
67184 +
67185 + /*
67186 + * default group id used for light-weight files without their own
67187 + * stat-data.
67188 + */
67189 + gid_t default_gid;
67190 +
67191 + /* mkfs identifier generated at mkfs time. */
67192 + __u32 mkfs_id;
67193 + /* amount of blocks in a file system */
67194 + __u64 block_count;
67195 +
67196 + /* inviolable reserve */
67197 + __u64 blocks_reserved;
67198 +
67199 + /* amount of blocks used by file system data and meta-data. */
67200 + __u64 blocks_used;
67201 +
67202 + /*
67203 + * amount of free blocks. This is "working" free blocks counter. It is
67204 + * like "working" bitmap, please see block_alloc.c for description.
67205 + */
67206 + __u64 blocks_free;
67207 +
67208 + /*
67209 + * free block count for fs committed state. This is "commit" version of
67210 + * free block counter.
67211 + */
67212 + __u64 blocks_free_committed;
67213 +
67214 + /*
67215 + * number of blocks reserved for further allocation, for all
67216 + * threads.
67217 + */
67218 + __u64 blocks_grabbed;
67219 +
67220 + /* number of fake allocated unformatted blocks in tree. */
67221 + __u64 blocks_fake_allocated_unformatted;
67222 +
67223 + /* number of fake allocated formatted blocks in tree. */
67224 + __u64 blocks_fake_allocated;
67225 +
67226 + /* number of blocks reserved for flush operations. */
67227 + __u64 blocks_flush_reserved;
67228 +
67229 + /* number of blocks reserved for cluster operations. */
67230 + __u64 blocks_clustered;
67231 +
67232 + /* unique file-system identifier */
67233 + __u32 fsuid;
67234 +
67235 + /* On-disk format version. If does not equal to the disk_format
67236 + plugin version, some format updates (e.g. enlarging plugin
67237 + set, etc) may have place on mount. */
67238 + int version;
67239 +
67240 + /* file-system wide flags. See reiser4_fs_flag enum */
67241 + unsigned long fs_flags;
67242 +
67243 + /* transaction manager */
67244 + txn_mgr tmgr;
67245 +
67246 + /* ent thread */
67247 + entd_context entd;
67248 +
67249 + /* fake inode used to bind formatted nodes */
67250 + struct inode *fake;
67251 + /* inode used to bind bitmaps (and journal heads) */
67252 + struct inode *bitmap;
67253 + /* inode used to bind copied on capture nodes */
67254 + struct inode *cc;
67255 +
67256 + /* disk layout plugin */
67257 + disk_format_plugin *df_plug;
67258 +
67259 + /* disk layout specific part of reiser4 super info data */
67260 + union {
67261 + format40_super_info format40;
67262 + } u;
67263 +
67264 + /* value we return in st_blksize on stat(2) */
67265 + unsigned long optimal_io_size;
67266 +
67267 + /* parameters for the flush algorithm */
67268 + flush_params flush;
67269 +
67270 + /* pointers to jnodes for journal header and footer */
67271 + jnode *journal_header;
67272 + jnode *journal_footer;
67273 +
67274 + journal_location jloc;
67275 +
67276 + /* head block number of last committed transaction */
67277 + __u64 last_committed_tx;
67278 +
67279 + /*
67280 + * we remember last written location for using as a hint for new block
67281 + * allocation
67282 + */
67283 + __u64 blocknr_hint_default;
67284 +
67285 + /* committed number of files (oid allocator state variable ) */
67286 + __u64 nr_files_committed;
67287 +
67288 + ra_params_t ra_params;
67289 +
67290 + /*
67291 + * A mutex for serializing cut tree operation if out-of-free-space:
67292 + * the only one cut_tree thread is allowed to grab space from reserved
67293 + * area (it is 5% of disk space)
67294 + */
67295 + struct mutex delete_mutex;
67296 + /* task owning ->delete_mutex */
67297 + struct task_struct *delete_mutex_owner;
67298 +
67299 + /* Diskmap's blocknumber */
67300 + __u64 diskmap_block;
67301 +
67302 + /* What to do in case of error */
67303 + int onerror;
67304 +
67305 + /* operations for objects on this file system */
67306 + object_ops ops;
67307 +
67308 + /*
67309 + * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
67310 + * more details
67311 + */
67312 + d_cursor_info d_info;
67313 +
67314 +#ifdef CONFIG_REISER4_BADBLOCKS
67315 + /* Alternative master superblock offset (in bytes) */
67316 + unsigned long altsuper;
67317 +#endif
67318 + struct repacker *repacker;
67319 + struct page *status_page;
67320 + struct bio *status_bio;
67321 +
67322 +#if REISER4_DEBUG
67323 + /*
67324 + * minimum used blocks value (includes super blocks, bitmap blocks and
67325 + * other fs reserved areas), depends on fs format and fs size.
67326 + */
67327 + __u64 min_blocks_used;
67328 +
67329 + /*
67330 + * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
67331 + * are kept on a list anchored at sbinfo->all_jnodes. This list is
67332 + * protected by sbinfo->all_guard spin lock. This lock should be taken
67333 + * with _irq modifier, because it is also modified from interrupt
67334 + * contexts (by RCU).
67335 + */
67336 + spinlock_t all_guard;
67337 + /* list of all jnodes */
67338 + struct list_head all_jnodes;
67339 +#endif
67340 + struct dentry *debugfs_root;
67341 +};
67342 +
67343 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
67344 + super_block *super);
67345 +
67346 +/* Return reiser4-specific part of super block */
67347 +static inline reiser4_super_info_data *get_super_private(const struct
67348 + super_block *super)
67349 +{
67350 + assert("nikita-447", super != NULL);
67351 +
67352 + return (reiser4_super_info_data *) super->s_fs_info;
67353 +}
67354 +
67355 +/* get ent context for the @super */
67356 +static inline entd_context *get_entd_context(struct super_block *super)
67357 +{
67358 + return &get_super_private(super)->entd;
67359 +}
67360 +
67361 +/* "Current" super-block: main super block used during current system
67362 + call. Reference to this super block is stored in reiser4_context. */
67363 +static inline struct super_block *reiser4_get_current_sb(void)
67364 +{
67365 + return get_current_context()->super;
67366 +}
67367 +
67368 +/* Reiser4-specific part of "current" super-block: main super block used
67369 + during current system call. Reference to this super block is stored in
67370 + reiser4_context. */
67371 +static inline reiser4_super_info_data *get_current_super_private(void)
67372 +{
67373 + return get_super_private(reiser4_get_current_sb());
67374 +}
67375 +
67376 +static inline ra_params_t *get_current_super_ra_params(void)
67377 +{
67378 + return &(get_current_super_private()->ra_params);
67379 +}
67380 +
67381 +/*
67382 + * true, if file system on @super is read-only
67383 + */
67384 +static inline int rofs_super(struct super_block *super)
67385 +{
67386 + return super->s_flags & MS_RDONLY;
67387 +}
67388 +
67389 +/*
67390 + * true, if @tree represents read-only file system
67391 + */
67392 +static inline int rofs_tree(reiser4_tree * tree)
67393 +{
67394 + return rofs_super(tree->super);
67395 +}
67396 +
67397 +/*
67398 + * true, if file system where @inode lives on, is read-only
67399 + */
67400 +static inline int rofs_inode(struct inode *inode)
67401 +{
67402 + return rofs_super(inode->i_sb);
67403 +}
67404 +
67405 +/*
67406 + * true, if file system where @node lives on, is read-only
67407 + */
67408 +static inline int rofs_jnode(jnode * node)
67409 +{
67410 + return rofs_tree(jnode_get_tree(node));
67411 +}
67412 +
67413 +extern __u64 reiser4_current_block_count(void);
67414 +
67415 +extern void build_object_ops(struct super_block *super, object_ops * ops);
67416 +
67417 +#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
67418 +
67419 +static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
67420 +{
67421 + spin_lock(&(sbinfo->guard));
67422 +}
67423 +
67424 +static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
67425 +{
67426 + assert_spin_locked(&(sbinfo->guard));
67427 + spin_unlock(&(sbinfo->guard));
67428 +}
67429 +
67430 +extern __u64 reiser4_flush_reserved(const struct super_block *);
67431 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
67432 +extern long reiser4_statfs_type(const struct super_block *super);
67433 +extern __u64 reiser4_block_count(const struct super_block *super);
67434 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
67435 +extern __u64 reiser4_data_blocks(const struct super_block *super);
67436 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
67437 +extern __u64 reiser4_free_blocks(const struct super_block *super);
67438 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
67439 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
67440 +
67441 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
67442 +
67443 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
67444 +extern __u64 reiser4_fake_allocated(const struct super_block *);
67445 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
67446 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
67447 +
67448 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
67449 + gid_t gid);
67450 +
67451 +extern reiser4_space_allocator *
67452 +reiser4_get_space_allocator(const struct super_block *super);
67453 +extern reiser4_oid_allocator *
67454 +reiser4_get_oid_allocator(const struct super_block *super);
67455 +extern struct inode *reiser4_get_super_fake(const struct super_block *super);
67456 +extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
67457 +extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
67458 +extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
67459 +extern int is_reiser4_super(const struct super_block *super);
67460 +
67461 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
67462 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
67463 + const reiser4_block_nr * blk);
67464 +extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
67465 +extern int reiser4_done_super(struct super_block *s);
67466 +
67467 +/* step of fill super */
67468 +extern int reiser4_init_fs_info(struct super_block *);
67469 +extern void reiser4_done_fs_info(struct super_block *);
67470 +extern int reiser4_init_super_data(struct super_block *, char *opt_string);
67471 +extern int reiser4_init_read_super(struct super_block *, int silent);
67472 +extern int reiser4_init_root_inode(struct super_block *);
67473 +extern reiser4_plugin *get_default_plugin(pset_member memb);
67474 +
67475 +/* Maximal possible object id. */
67476 +#define ABSOLUTE_MAX_OID ((oid_t)~0)
67477 +
67478 +#define OIDS_RESERVED ( 1 << 16 )
67479 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
67480 +oid_t oid_allocate(struct super_block *);
67481 +int oid_release(struct super_block *, oid_t);
67482 +oid_t oid_next(const struct super_block *);
67483 +void oid_count_allocated(void);
67484 +void oid_count_released(void);
67485 +long oids_used(const struct super_block *);
67486 +
67487 +#if REISER4_DEBUG
67488 +void print_fs_info(const char *prefix, const struct super_block *);
67489 +#endif
67490 +
67491 +extern void destroy_reiser4_cache(struct kmem_cache **);
67492 +
67493 +extern struct super_operations reiser4_super_operations;
67494 +extern struct export_operations reiser4_export_operations;
67495 +extern struct dentry_operations reiser4_dentry_operations;
67496 +
67497 +/* __REISER4_SUPER_H__ */
67498 +#endif
67499 +
67500 +/*
67501 + * Local variables:
67502 + * c-indentation-style: "K&R"
67503 + * mode-name: "LC"
67504 + * c-basic-offset: 8
67505 + * tab-width: 8
67506 + * fill-column: 120
67507 + * End:
67508 + */
67509 diff -urN linux-2.6.20.orig/fs/reiser4/super_ops.c linux-2.6.20/fs/reiser4/super_ops.c
67510 --- linux-2.6.20.orig/fs/reiser4/super_ops.c 1970-01-01 03:00:00.000000000 +0300
67511 +++ linux-2.6.20/fs/reiser4/super_ops.c 2007-05-06 14:50:43.879031967 +0400
67512 @@ -0,0 +1,728 @@
67513 +/* Copyright 2005 by Hans Reiser, licensing governed by
67514 + * reiser4/README */
67515 +
67516 +#include "inode.h"
67517 +#include "page_cache.h"
67518 +#include "ktxnmgrd.h"
67519 +#include "flush.h"
67520 +#include "safe_link.h"
67521 +
67522 +#include <linux/vfs.h>
67523 +#include <linux/writeback.h>
67524 +#include <linux/mount.h>
67525 +#include <linux/seq_file.h>
67526 +#include <linux/debugfs.h>
67527 +
67528 +/* slab cache for inodes */
67529 +static struct kmem_cache *inode_cache;
67530 +
67531 +static struct dentry *reiser4_debugfs_root = NULL;
67532 +
67533 +/**
67534 + * init_once - constructor for reiser4 inodes
67535 + * @obj: inode to be initialized
67536 + * @cache: cache @obj belongs to
67537 + * @flags: SLAB flags
67538 + *
67539 + * Initialization function to be called when new page is allocated by reiser4
67540 + * inode cache. It is set on inode cache creation.
67541 + */
67542 +static void init_once(void *obj, struct kmem_cache *cache, unsigned long flags)
67543 +{
67544 + reiser4_inode_object *info;
67545 +
67546 + info = obj;
67547 +
67548 + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
67549 + SLAB_CTOR_CONSTRUCTOR) {
67550 + /* initialize vfs inode */
67551 + inode_init_once(&info->vfs_inode);
67552 +
67553 + /*
67554 + * initialize reiser4 specific part fo inode.
67555 + * NOTE-NIKITA add here initializations for locks, list heads,
67556 + * etc. that will be added to our private inode part.
67557 + */
67558 + INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
67559 + init_rwsem(&info->p.conv_sem);
67560 + /* init semaphore which is used during inode loading */
67561 + loading_init_once(&info->p);
67562 + INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
67563 + GFP_ATOMIC);
67564 +#if REISER4_DEBUG
67565 + info->p.nr_jnodes = 0;
67566 +#endif
67567 + }
67568 +}
67569 +
67570 +/**
67571 + * init_inodes - create znode cache
67572 + *
67573 + * Initializes slab cache of inodes. It is part of reiser4 module initialization.
67574 + */
67575 +static int init_inodes(void)
67576 +{
67577 + inode_cache = kmem_cache_create("reiser4_inode",
67578 + sizeof(reiser4_inode_object),
67579 + 0,
67580 + SLAB_HWCACHE_ALIGN |
67581 + SLAB_RECLAIM_ACCOUNT, init_once, NULL);
67582 + if (inode_cache == NULL)
67583 + return RETERR(-ENOMEM);
67584 + return 0;
67585 +}
67586 +
67587 +/**
67588 + * done_inodes - delete inode cache
67589 + *
67590 + * This is called on reiser4 module unloading or system shutdown.
67591 + */
67592 +static void done_inodes(void)
67593 +{
67594 + destroy_reiser4_cache(&inode_cache);
67595 +}
67596 +
67597 +/**
67598 + * reiser4_alloc_inode - alloc_inode of super operations
67599 + * @super: super block new inode is allocated for
67600 + *
67601 + * Allocates new inode, initializes reiser4 specific part of it.
67602 + */
67603 +static struct inode *reiser4_alloc_inode(struct super_block *super)
67604 +{
67605 + reiser4_inode_object *obj;
67606 +
67607 + assert("nikita-1696", super != NULL);
67608 + obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
67609 + if (obj != NULL) {
67610 + reiser4_inode *info;
67611 +
67612 + info = &obj->p;
67613 +
67614 + info->pset = plugin_set_get_empty();
67615 + info->hset = plugin_set_get_empty();
67616 + info->extmask = 0;
67617 + info->locality_id = 0ull;
67618 + info->plugin_mask = 0;
67619 + info->heir_mask = 0;
67620 +#if !REISER4_INO_IS_OID
67621 + info->oid_hi = 0;
67622 +#endif
67623 + reiser4_seal_init(&info->sd_seal, NULL, NULL);
67624 + coord_init_invalid(&info->sd_coord, NULL);
67625 + info->flags = 0;
67626 + spin_lock_init(&info->guard);
67627 + /* this deals with info's loading semaphore */
67628 + loading_alloc(info);
67629 + info->vroot = UBER_TREE_ADDR;
67630 + return &obj->vfs_inode;
67631 + } else
67632 + return NULL;
67633 +}
67634 +
67635 +/**
67636 + * reiser4_destroy_inode - destroy_inode of super operations
67637 + * @inode: inode being destroyed
67638 + *
67639 + * Puts reiser4 specific portion of inode, frees memory occupied by inode.
67640 + */
67641 +static void reiser4_destroy_inode(struct inode *inode)
67642 +{
67643 + reiser4_inode *info;
67644 +
67645 + info = reiser4_inode_data(inode);
67646 +
67647 + assert("vs-1220", inode_has_no_jnodes(info));
67648 +
67649 + if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
67650 + file_plugin *fplug = inode_file_plugin(inode);
67651 + if (fplug->destroy_inode != NULL)
67652 + fplug->destroy_inode(inode);
67653 + }
67654 + reiser4_dispose_cursors(inode);
67655 + if (info->pset)
67656 + plugin_set_put(info->pset);
67657 + if (info->hset)
67658 + plugin_set_put(info->hset);
67659 +
67660 + /*
67661 + * cannot add similar assertion about ->i_list as prune_icache return
67662 + * inode into slab with dangling ->list.{next,prev}. This is safe,
67663 + * because they are re-initialized in the new_inode().
67664 + */
67665 + assert("nikita-2895", list_empty(&inode->i_dentry));
67666 + assert("nikita-2896", hlist_unhashed(&inode->i_hash));
67667 + assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
67668 +
67669 + /* this deals with info's loading semaphore */
67670 + loading_destroy(info);
67671 +
67672 + kmem_cache_free(inode_cache,
67673 + container_of(info, reiser4_inode_object, p));
67674 +}
67675 +
67676 +/**
67677 + * reiser4_dirty_inode - dirty_inode of super operations
67678 + * @inode: inode being dirtied
67679 + *
67680 + * Updates stat data.
67681 + */
67682 +static void reiser4_dirty_inode(struct inode *inode)
67683 +{
67684 + int result;
67685 +
67686 + if (!is_in_reiser4_context())
67687 + return;
67688 + assert("", !IS_RDONLY(inode));
67689 + assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
67690 + get_current_context()->grabbed_blocks));
67691 +
67692 + result = reiser4_update_sd(inode);
67693 + if (result)
67694 + warning("", "failed to dirty inode for %llu: %d",
67695 + get_inode_oid(inode), result);
67696 +}
67697 +
67698 +/**
67699 + * reiser4_delete_inode - delete_inode of super operations
67700 + * @inode: inode to delete
67701 + *
67702 + * Calls file plugin's delete_object method to delete object items from
67703 + * filesystem tree and calls clear_inode.
67704 + */
67705 +static void reiser4_delete_inode(struct inode *inode)
67706 +{
67707 + reiser4_context *ctx;
67708 + file_plugin *fplug;
67709 +
67710 + ctx = reiser4_init_context(inode->i_sb);
67711 + if (IS_ERR(ctx)) {
67712 + warning("vs-15", "failed to init context");
67713 + return;
67714 + }
67715 +
67716 + if (is_inode_loaded(inode)) {
67717 + fplug = inode_file_plugin(inode);
67718 + if (fplug != NULL && fplug->delete_object != NULL)
67719 + fplug->delete_object(inode);
67720 + }
67721 +
67722 + truncate_inode_pages(&inode->i_data, 0);
67723 + inode->i_blocks = 0;
67724 + clear_inode(inode);
67725 + reiser4_exit_context(ctx);
67726 +}
67727 +
67728 +/**
67729 + * reiser4_put_super - put_super of super operations
67730 + * @super: super block to free
67731 + *
67732 + * Stops daemons, release resources, umounts in short.
67733 + */
67734 +static void reiser4_put_super(struct super_block *super)
67735 +{
67736 + reiser4_super_info_data *sbinfo;
67737 + reiser4_context *ctx;
67738 +
67739 + sbinfo = get_super_private(super);
67740 + assert("vs-1699", sbinfo);
67741 +
67742 + debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
67743 + debugfs_remove(sbinfo->tmgr.debugfs_id_count);
67744 + debugfs_remove(sbinfo->debugfs_root);
67745 +
67746 + ctx = reiser4_init_context(super);
67747 + if (IS_ERR(ctx)) {
67748 + warning("vs-17", "failed to init context");
67749 + return;
67750 + }
67751 +
67752 + /* have disk format plugin to free its resources */
67753 + if (get_super_private(super)->df_plug->release)
67754 + get_super_private(super)->df_plug->release(super);
67755 +
67756 + reiser4_done_formatted_fake(super);
67757 +
67758 + /* stop daemons: ktxnmgr and entd */
67759 + reiser4_done_entd(super);
67760 + reiser4_done_ktxnmgrd(super);
67761 + reiser4_done_txnmgr(&sbinfo->tmgr);
67762 +
67763 + reiser4_done_fs_info(super);
67764 + reiser4_exit_context(ctx);
67765 +}
67766 +
67767 +/**
67768 + * reiser4_write_super - write_super of super operations
67769 + * @super: super block to write
67770 + *
67771 + * Captures znode associated with super block, comit all transactions.
67772 + */
67773 +static void reiser4_write_super(struct super_block *super)
67774 +{
67775 + int ret;
67776 + reiser4_context *ctx;
67777 +
67778 + assert("vs-1700", !rofs_super(super));
67779 +
67780 + ctx = reiser4_init_context(super);
67781 + if (IS_ERR(ctx)) {
67782 + warning("vs-16", "failed to init context");
67783 + return;
67784 + }
67785 +
67786 + ret = reiser4_capture_super_block(super);
67787 + if (ret != 0)
67788 + warning("vs-1701",
67789 + "reiser4_capture_super_block failed in write_super: %d",
67790 + ret);
67791 + ret = txnmgr_force_commit_all(super, 0);
67792 + if (ret != 0)
67793 + warning("jmacd-77113",
67794 + "txn_force failed in write_super: %d", ret);
67795 +
67796 + super->s_dirt = 0;
67797 +
67798 + reiser4_exit_context(ctx);
67799 +}
67800 +
67801 +/**
67802 + * reiser4_statfs - statfs of super operations
67803 + * @super: super block of file system in queried
67804 + * @stafs: buffer to fill with statistics
67805 + *
67806 + * Returns information about filesystem.
67807 + */
67808 +static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
67809 +{
67810 + sector_t total;
67811 + sector_t reserved;
67812 + sector_t free;
67813 + sector_t forroot;
67814 + sector_t deleted;
67815 + reiser4_context *ctx;
67816 + struct super_block *super = dentry->d_sb;
67817 +
67818 + assert("nikita-408", super != NULL);
67819 + assert("nikita-409", statfs != NULL);
67820 +
67821 + ctx = reiser4_init_context(super);
67822 + if (IS_ERR(ctx))
67823 + return PTR_ERR(ctx);
67824 +
67825 + statfs->f_type = reiser4_statfs_type(super);
67826 + statfs->f_bsize = super->s_blocksize;
67827 +
67828 + /*
67829 + * 5% of total block space is reserved. This is needed for flush and
67830 + * for truncates (so that we are able to perform truncate/unlink even
67831 + * on the otherwise completely full file system). If this reservation
67832 + * is hidden from statfs(2), users will mistakenly guess that they
67833 + * have enough free space to complete some operation, which is
67834 + * frustrating.
67835 + *
67836 + * Another possible solution is to subtract ->blocks_reserved from
67837 + * ->f_bfree, but changing available space seems less intrusive than
67838 + * letting user to see 5% of disk space to be used directly after
67839 + * mkfs.
67840 + */
67841 + total = reiser4_block_count(super);
67842 + reserved = get_super_private(super)->blocks_reserved;
67843 + deleted = txnmgr_count_deleted_blocks();
67844 + free = reiser4_free_blocks(super) + deleted;
67845 + forroot = reiser4_reserved_blocks(super, 0, 0);
67846 +
67847 + /*
67848 + * These counters may be in inconsistent state because we take the
67849 + * values without keeping any global spinlock. Here we do a sanity
67850 + * check that free block counter does not exceed the number of all
67851 + * blocks.
67852 + */
67853 + if (free > total)
67854 + free = total;
67855 + statfs->f_blocks = total - reserved;
67856 + /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
67857 + if (free > reserved)
67858 + free -= reserved;
67859 + else
67860 + free = 0;
67861 + statfs->f_bfree = free;
67862 +
67863 + if (free > forroot)
67864 + free -= forroot;
67865 + else
67866 + free = 0;
67867 + statfs->f_bavail = free;
67868 +
67869 + statfs->f_files = 0;
67870 + statfs->f_ffree = 0;
67871 +
67872 + /* maximal acceptable name length depends on directory plugin. */
67873 + assert("nikita-3351", super->s_root->d_inode != NULL);
67874 + statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
67875 + reiser4_exit_context(ctx);
67876 + return 0;
67877 +}
67878 +
67879 +/**
67880 + * reiser4_clear_inode - clear_inode of super operation
67881 + * @inode: inode about to destroy
67882 + *
67883 + * Does sanity checks: being destroyed should have all jnodes detached.
67884 + */
67885 +static void reiser4_clear_inode(struct inode *inode)
67886 +{
67887 +#if REISER4_DEBUG
67888 + reiser4_inode *r4_inode;
67889 +
67890 + r4_inode = reiser4_inode_data(inode);
67891 + if (!inode_has_no_jnodes(r4_inode))
67892 + warning("vs-1732", "reiser4 inode has %ld jnodes\n",
67893 + r4_inode->nr_jnodes);
67894 +#endif
67895 +}
67896 +
67897 +/**
67898 + * reiser4_sync_inodes - sync_inodes of super operations
67899 + * @super:
67900 + * @wbc:
67901 + *
67902 + * This method is called by background and non-backgound writeback. Reiser4's
67903 + * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
67904 + * each of dirty inodes. Reiser4_writepages handles pages dirtied via shared
67905 + * mapping - dirty pages get into atoms. Writeout is called to flush some
67906 + * atoms.
67907 + */
67908 +static void reiser4_sync_inodes(struct super_block *super,
67909 + struct writeback_control *wbc)
67910 +{
67911 + reiser4_context *ctx;
67912 + long to_write;
67913 +
67914 + if (wbc->for_kupdate)
67915 + /* reiser4 has its own means of periodical write-out */
67916 + return;
67917 +
67918 + to_write = wbc->nr_to_write;
67919 + assert("vs-49", wbc->older_than_this == NULL);
67920 +
67921 + ctx = reiser4_init_context(super);
67922 + if (IS_ERR(ctx)) {
67923 + warning("vs-13", "failed to init context");
67924 + return;
67925 + }
67926 +
67927 + /*
67928 + * call reiser4_writepages for each of dirty inodes to turn dirty pages
67929 + * into transactions if they were not yet.
67930 + */
67931 + generic_sync_sb_inodes(super, wbc);
67932 +
67933 + /* flush goes here */
67934 + wbc->nr_to_write = to_write;
67935 + reiser4_writeout(super, wbc);
67936 +
67937 + /* avoid recursive calls to ->sync_inodes */
67938 + context_set_commit_async(ctx);
67939 + reiser4_exit_context(ctx);
67940 +}
67941 +
67942 +/**
67943 + * reiser4_show_options - show_options of super operations
67944 + * @m: file where to write information
67945 + * @mnt: mount structure
67946 + *
67947 + * Makes reiser4 mount options visible in /proc/mounts.
67948 + */
67949 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
67950 +{
67951 + struct super_block *super;
67952 + reiser4_super_info_data *sbinfo;
67953 +
67954 + super = mnt->mnt_sb;
67955 + sbinfo = get_super_private(super);
67956 +
67957 + seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
67958 + seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
67959 + seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
67960 + seq_printf(m, ",atom_max_flushers=0x%x",
67961 + sbinfo->tmgr.atom_max_flushers);
67962 + seq_printf(m, ",cbk_cache_slots=0x%x",
67963 + sbinfo->tree.cbk_cache.nr_slots);
67964 +
67965 + return 0;
67966 +}
67967 +
67968 +struct super_operations reiser4_super_operations = {
67969 + .alloc_inode = reiser4_alloc_inode,
67970 + .destroy_inode = reiser4_destroy_inode,
67971 + .dirty_inode = reiser4_dirty_inode,
67972 + .delete_inode = reiser4_delete_inode,
67973 + .put_super = reiser4_put_super,
67974 + .write_super = reiser4_write_super,
67975 + .statfs = reiser4_statfs,
67976 + .clear_inode = reiser4_clear_inode,
67977 + .sync_inodes = reiser4_sync_inodes,
67978 + .show_options = reiser4_show_options
67979 +};
67980 +
67981 +/**
67982 + * fill_super - initialize super block on mount
67983 + * @super: super block to fill
67984 + * @data: reiser4 specific mount option
67985 + * @silent:
67986 + *
67987 + * This is to be called by reiser4_get_sb. Mounts filesystem.
67988 + */
67989 +static int fill_super(struct super_block *super, void *data, int silent)
67990 +{
67991 + reiser4_context ctx;
67992 + int result;
67993 + reiser4_super_info_data *sbinfo;
67994 +
67995 + assert("zam-989", super != NULL);
67996 +
67997 + super->s_op = NULL;
67998 + init_stack_context(&ctx, super);
67999 +
68000 + /* allocate reiser4 specific super block */
68001 + if ((result = reiser4_init_fs_info(super)) != 0)
68002 + goto failed_init_sinfo;
68003 +
68004 + sbinfo = get_super_private(super);
68005 + /* initialize various reiser4 parameters, parse mount options */
68006 + if ((result = reiser4_init_super_data(super, data)) != 0)
68007 + goto failed_init_super_data;
68008 +
68009 + /* read reiser4 master super block, initialize disk format plugin */
68010 + if ((result = reiser4_init_read_super(super, silent)) != 0)
68011 + goto failed_init_read_super;
68012 +
68013 + /* initialize transaction manager */
68014 + reiser4_init_txnmgr(&sbinfo->tmgr);
68015 +
68016 + /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */
68017 + if ((result = reiser4_init_ktxnmgrd(super)) != 0)
68018 + goto failed_init_ktxnmgrd;
68019 +
68020 + /* initialize entd context and start kernel thread entd */
68021 + if ((result = reiser4_init_entd(super)) != 0)
68022 + goto failed_init_entd;
68023 +
68024 + /* initialize address spaces for formatted nodes and bitmaps */
68025 + if ((result = reiser4_init_formatted_fake(super)) != 0)
68026 + goto failed_init_formatted_fake;
68027 +
68028 + /* initialize disk format plugin */
68029 + if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
68030 + goto failed_init_disk_format;
68031 +
68032 + /*
68033 + * There are some 'committed' versions of reiser4 super block counters,
68034 + * which correspond to reiser4 on-disk state. These counters are
68035 + * initialized here
68036 + */
68037 + sbinfo->blocks_free_committed = sbinfo->blocks_free;
68038 + sbinfo->nr_files_committed = oids_used(super);
68039 +
68040 + /* get inode of root directory */
68041 + if ((result = reiser4_init_root_inode(super)) != 0)
68042 + goto failed_init_root_inode;
68043 +
68044 + if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 )
68045 + goto failed_update_format_version;
68046 +
68047 + process_safelinks(super);
68048 + reiser4_exit_context(&ctx);
68049 +
68050 + sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
68051 + reiser4_debugfs_root);
68052 + if (sbinfo->debugfs_root) {
68053 + sbinfo->tmgr.debugfs_atom_count =
68054 + debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
68055 + sbinfo->debugfs_root,
68056 + &sbinfo->tmgr.atom_count);
68057 + sbinfo->tmgr.debugfs_id_count =
68058 + debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
68059 + sbinfo->debugfs_root,
68060 + &sbinfo->tmgr.id_count);
68061 + }
68062 + return 0;
68063 +
68064 + failed_update_format_version:
68065 + failed_init_root_inode:
68066 + if (sbinfo->df_plug->release)
68067 + sbinfo->df_plug->release(super);
68068 + failed_init_disk_format:
68069 + reiser4_done_formatted_fake(super);
68070 + failed_init_formatted_fake:
68071 + reiser4_done_entd(super);
68072 + failed_init_entd:
68073 + reiser4_done_ktxnmgrd(super);
68074 + failed_init_ktxnmgrd:
68075 + reiser4_done_txnmgr(&sbinfo->tmgr);
68076 + failed_init_read_super:
68077 + failed_init_super_data:
68078 + reiser4_done_fs_info(super);
68079 + failed_init_sinfo:
68080 + reiser4_exit_context(&ctx);
68081 + return result;
68082 +}
68083 +
68084 +/**
68085 + * reiser4_get_sb - get_sb of file_system_type operations
68086 + * @fs_type:
68087 + * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
68088 + * @dev_name: block device file name
68089 + * @data: specific mount options
68090 + *
68091 + * Reiser4 mount entry.
68092 + */
68093 +static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
68094 + const char *dev_name, void *data, struct vfsmount *mnt)
68095 +{
68096 + return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
68097 +}
68098 +
68099 +/* structure describing the reiser4 filesystem implementation */
68100 +static struct file_system_type reiser4_fs_type = {
68101 + .owner = THIS_MODULE,
68102 + .name = "reiser4",
68103 + .fs_flags = FS_REQUIRES_DEV,
68104 + .get_sb = reiser4_get_sb,
68105 + .kill_sb = kill_block_super,
68106 + .next = NULL
68107 +};
68108 +
68109 +void destroy_reiser4_cache(struct kmem_cache **cachep)
68110 +{
68111 + BUG_ON(*cachep == NULL);
68112 + kmem_cache_destroy(*cachep);
68113 + *cachep = NULL;
68114 +}
68115 +
68116 +/**
68117 + * init_reiser4 - reiser4 initialization entry point
68118 + *
68119 + * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
68120 + * on kernel initialization or during reiser4 module load.
68121 + */
68122 +static int __init init_reiser4(void)
68123 +{
68124 + int result;
68125 +
68126 + printk(KERN_INFO
68127 + "Loading Reiser4. "
68128 + "See www.namesys.com for a description of Reiser4.\n");
68129 +
68130 + /* initialize slab cache of inodes */
68131 + if ((result = init_inodes()) != 0)
68132 + goto failed_inode_cache;
68133 +
68134 + /* initialize cache of znodes */
68135 + if ((result = init_znodes()) != 0)
68136 + goto failed_init_znodes;
68137 +
68138 + /* initialize all plugins */
68139 + if ((result = init_plugins()) != 0)
68140 + goto failed_init_plugins;
68141 +
68142 + /* initialize cache of plugin_set-s and plugin_set's hash table */
68143 + if ((result = init_plugin_set()) != 0)
68144 + goto failed_init_plugin_set;
68145 +
68146 + /* initialize caches of txn_atom-s and txn_handle-s */
68147 + if ((result = init_txnmgr_static()) != 0)
68148 + goto failed_init_txnmgr_static;
68149 +
68150 + /* initialize cache of jnodes */
68151 + if ((result = init_jnodes()) != 0)
68152 + goto failed_init_jnodes;
68153 +
68154 + /* initialize cache of flush queues */
68155 + if ((result = reiser4_init_fqs()) != 0)
68156 + goto failed_init_fqs;
68157 +
68158 + /* initialize cache of structures attached to dentry->d_fsdata */
68159 + if ((result = reiser4_init_dentry_fsdata()) != 0)
68160 + goto failed_init_dentry_fsdata;
68161 +
68162 + /* initialize cache of structures attached to file->private_data */
68163 + if ((result = reiser4_init_file_fsdata()) != 0)
68164 + goto failed_init_file_fsdata;
68165 +
68166 + /*
68167 + * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
68168 + * more details
68169 + */
68170 + if ((result = reiser4_init_d_cursor()) != 0)
68171 + goto failed_init_d_cursor;
68172 +
68173 + if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
68174 + reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
68175 + return 0;
68176 + }
68177 +
68178 + reiser4_done_d_cursor();
68179 + failed_init_d_cursor:
68180 + reiser4_done_file_fsdata();
68181 + failed_init_file_fsdata:
68182 + reiser4_done_dentry_fsdata();
68183 + failed_init_dentry_fsdata:
68184 + reiser4_done_fqs();
68185 + failed_init_fqs:
68186 + done_jnodes();
68187 + failed_init_jnodes:
68188 + done_txnmgr_static();
68189 + failed_init_txnmgr_static:
68190 + done_plugin_set();
68191 + failed_init_plugin_set:
68192 + failed_init_plugins:
68193 + done_znodes();
68194 + failed_init_znodes:
68195 + done_inodes();
68196 + failed_inode_cache:
68197 + return result;
68198 +}
68199 +
68200 +/**
68201 + * done_reiser4 - reiser4 exit entry point
68202 + *
68203 + * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown
68204 + * or at module unload.
68205 + */
68206 +static void __exit done_reiser4(void)
68207 +{
68208 + int result;
68209 +
68210 + debugfs_remove(reiser4_debugfs_root);
68211 + result = unregister_filesystem(&reiser4_fs_type);
68212 + BUG_ON(result != 0);
68213 + reiser4_done_d_cursor();
68214 + reiser4_done_file_fsdata();
68215 + reiser4_done_dentry_fsdata();
68216 + reiser4_done_fqs();
68217 + done_jnodes();
68218 + done_txnmgr_static();
68219 + done_plugin_set();
68220 + done_znodes();
68221 + destroy_reiser4_cache(&inode_cache);
68222 +}
68223 +
68224 +module_init(init_reiser4);
68225 +module_exit(done_reiser4);
68226 +
68227 +MODULE_DESCRIPTION("Reiser4 filesystem");
68228 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
68229 +
68230 +MODULE_LICENSE("GPL");
68231 +
68232 +/*
68233 + * Local variables:
68234 + * c-indentation-style: "K&R"
68235 + * mode-name: "LC"
68236 + * c-basic-offset: 8
68237 + * tab-width: 8
68238 + * fill-column: 79
68239 + * End:
68240 + */
68241 diff -urN linux-2.6.20.orig/fs/reiser4/tap.c linux-2.6.20/fs/reiser4/tap.c
68242 --- linux-2.6.20.orig/fs/reiser4/tap.c 1970-01-01 03:00:00.000000000 +0300
68243 +++ linux-2.6.20/fs/reiser4/tap.c 2007-05-06 14:50:43.879031967 +0400
68244 @@ -0,0 +1,377 @@
68245 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68246 + * reiser4/README */
68247 +
68248 +/*
68249 + Tree Access Pointer (tap).
68250 +
68251 + tap is data structure combining coord and lock handle (mostly). It is
68252 + useful when one has to scan tree nodes (for example, in readdir, or flush),
68253 + for tap functions allow to move tap in either direction transparently
68254 + crossing unit/item/node borders.
68255 +
68256 + Tap doesn't provide automatic synchronization of its fields as it is
68257 + supposed to be per-thread object.
68258 +*/
68259 +
68260 +#include "forward.h"
68261 +#include "debug.h"
68262 +#include "coord.h"
68263 +#include "tree.h"
68264 +#include "context.h"
68265 +#include "tap.h"
68266 +#include "znode.h"
68267 +#include "tree_walk.h"
68268 +
68269 +#if REISER4_DEBUG
68270 +static int tap_invariant(const tap_t * tap);
68271 +static void tap_check(const tap_t * tap);
68272 +#else
68273 +#define tap_check(tap) noop
68274 +#endif
68275 +
68276 +/** load node tap is pointing to, if not loaded already */
68277 +int reiser4_tap_load(tap_t * tap)
68278 +{
68279 + tap_check(tap);
68280 + if (tap->loaded == 0) {
68281 + int result;
68282 +
68283 + result = zload_ra(tap->coord->node, &tap->ra_info);
68284 + if (result != 0)
68285 + return result;
68286 + coord_clear_iplug(tap->coord);
68287 + }
68288 + ++tap->loaded;
68289 + tap_check(tap);
68290 + return 0;
68291 +}
68292 +
68293 +/** release node tap is pointing to. Dual to tap_load() */
68294 +void reiser4_tap_relse(tap_t * tap)
68295 +{
68296 + tap_check(tap);
68297 + if (tap->loaded > 0) {
68298 + --tap->loaded;
68299 + if (tap->loaded == 0) {
68300 + zrelse(tap->coord->node);
68301 + }
68302 + }
68303 + tap_check(tap);
68304 +}
68305 +
68306 +/**
68307 + * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
68308 + * @mode
68309 + */
68310 +void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68311 + znode_lock_mode mode)
68312 +{
68313 + tap->coord = coord;
68314 + tap->lh = lh;
68315 + tap->mode = mode;
68316 + tap->loaded = 0;
68317 + INIT_LIST_HEAD(&tap->linkage);
68318 + reiser4_init_ra_info(&tap->ra_info);
68319 +}
68320 +
68321 +/** add @tap to the per-thread list of all taps */
68322 +void reiser4_tap_monitor(tap_t * tap)
68323 +{
68324 + assert("nikita-2623", tap != NULL);
68325 + tap_check(tap);
68326 + list_add(&tap->linkage, reiser4_taps_list());
68327 + tap_check(tap);
68328 +}
68329 +
68330 +/* duplicate @src into @dst. Copy lock handle. @dst is not initially
68331 + * loaded. */
68332 +void reiser4_tap_copy(tap_t * dst, tap_t * src)
68333 +{
68334 + assert("nikita-3193", src != NULL);
68335 + assert("nikita-3194", dst != NULL);
68336 +
68337 + *dst->coord = *src->coord;
68338 + if (src->lh->node)
68339 + copy_lh(dst->lh, src->lh);
68340 + dst->mode = src->mode;
68341 + dst->loaded = 0;
68342 + INIT_LIST_HEAD(&dst->linkage);
68343 + dst->ra_info = src->ra_info;
68344 +}
68345 +
68346 +/** finish with @tap */
68347 +void reiser4_tap_done(tap_t * tap)
68348 +{
68349 + assert("nikita-2565", tap != NULL);
68350 + tap_check(tap);
68351 + if (tap->loaded > 0)
68352 + zrelse(tap->coord->node);
68353 + done_lh(tap->lh);
68354 + tap->loaded = 0;
68355 + list_del_init(&tap->linkage);
68356 + tap->coord->node = NULL;
68357 +}
68358 +
68359 +/**
68360 + * move @tap to the new node, locked with @target. Load @target, if @tap was
68361 + * already loaded.
68362 + */
68363 +int reiser4_tap_move(tap_t * tap, lock_handle * target)
68364 +{
68365 + int result = 0;
68366 +
68367 + assert("nikita-2567", tap != NULL);
68368 + assert("nikita-2568", target != NULL);
68369 + assert("nikita-2570", target->node != NULL);
68370 + assert("nikita-2569", tap->coord->node == tap->lh->node);
68371 +
68372 + tap_check(tap);
68373 + if (tap->loaded > 0)
68374 + result = zload_ra(target->node, &tap->ra_info);
68375 +
68376 + if (result == 0) {
68377 + if (tap->loaded > 0)
68378 + zrelse(tap->coord->node);
68379 + done_lh(tap->lh);
68380 + copy_lh(tap->lh, target);
68381 + tap->coord->node = target->node;
68382 + coord_clear_iplug(tap->coord);
68383 + }
68384 + tap_check(tap);
68385 + return result;
68386 +}
68387 +
68388 +/**
68389 + * move @tap to @target. Acquire lock on @target, if @tap was already
68390 + * loaded.
68391 + */
68392 +static int tap_to(tap_t * tap, znode * target)
68393 +{
68394 + int result;
68395 +
68396 + assert("nikita-2624", tap != NULL);
68397 + assert("nikita-2625", target != NULL);
68398 +
68399 + tap_check(tap);
68400 + result = 0;
68401 + if (tap->coord->node != target) {
68402 + lock_handle here;
68403 +
68404 + init_lh(&here);
68405 + result = longterm_lock_znode(&here, target,
68406 + tap->mode, ZNODE_LOCK_HIPRI);
68407 + if (result == 0) {
68408 + result = reiser4_tap_move(tap, &here);
68409 + done_lh(&here);
68410 + }
68411 + }
68412 + tap_check(tap);
68413 + return result;
68414 +}
68415 +
68416 +/**
68417 + * move @tap to given @target, loading and locking @target->node if
68418 + * necessary
68419 + */
68420 +int tap_to_coord(tap_t * tap, coord_t * target)
68421 +{
68422 + int result;
68423 +
68424 + tap_check(tap);
68425 + result = tap_to(tap, target->node);
68426 + if (result == 0)
68427 + coord_dup(tap->coord, target);
68428 + tap_check(tap);
68429 + return result;
68430 +}
68431 +
68432 +/** return list of all taps */
68433 +struct list_head *reiser4_taps_list(void)
68434 +{
68435 + return &get_current_context()->taps;
68436 +}
68437 +
68438 +/** helper function for go_{next,prev}_{item,unit,node}() */
68439 +int go_dir_el(tap_t * tap, sideof dir, int units_p)
68440 +{
68441 + coord_t dup;
68442 + coord_t *coord;
68443 + int result;
68444 +
68445 + int (*coord_dir) (coord_t *);
68446 + int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
68447 + void (*coord_init) (coord_t *, const znode *);
68448 + ON_DEBUG(int (*coord_check) (const coord_t *));
68449 +
68450 + assert("nikita-2556", tap != NULL);
68451 + assert("nikita-2557", tap->coord != NULL);
68452 + assert("nikita-2558", tap->lh != NULL);
68453 + assert("nikita-2559", tap->coord->node != NULL);
68454 +
68455 + tap_check(tap);
68456 + if (dir == LEFT_SIDE) {
68457 + coord_dir = units_p ? coord_prev_unit : coord_prev_item;
68458 + get_dir_neighbor = reiser4_get_left_neighbor;
68459 + coord_init = coord_init_last_unit;
68460 + } else {
68461 + coord_dir = units_p ? coord_next_unit : coord_next_item;
68462 + get_dir_neighbor = reiser4_get_right_neighbor;
68463 + coord_init = coord_init_first_unit;
68464 + }
68465 + ON_DEBUG(coord_check =
68466 + units_p ? coord_is_existing_unit : coord_is_existing_item);
68467 + assert("nikita-2560", coord_check(tap->coord));
68468 +
68469 + coord = tap->coord;
68470 + coord_dup(&dup, coord);
68471 + if (coord_dir(&dup) != 0) {
68472 + do {
68473 + /* move to the left neighboring node */
68474 + lock_handle dup;
68475 +
68476 + init_lh(&dup);
68477 + result =
68478 + get_dir_neighbor(&dup, coord->node, (int)tap->mode,
68479 + GN_CAN_USE_UPPER_LEVELS);
68480 + if (result == 0) {
68481 + result = reiser4_tap_move(tap, &dup);
68482 + if (result == 0)
68483 + coord_init(tap->coord, dup.node);
68484 + done_lh(&dup);
68485 + }
68486 + /* skip empty nodes */
68487 + } while ((result == 0) && node_is_empty(coord->node));
68488 + } else {
68489 + result = 0;
68490 + coord_dup(coord, &dup);
68491 + }
68492 + assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
68493 + tap_check(tap);
68494 + return result;
68495 +}
68496 +
68497 +/**
68498 + * move @tap to the next unit, transparently crossing item and node
68499 + * boundaries
68500 + */
68501 +int go_next_unit(tap_t * tap)
68502 +{
68503 + return go_dir_el(tap, RIGHT_SIDE, 1);
68504 +}
68505 +
68506 +/**
68507 + * move @tap to the previous unit, transparently crossing item and node
68508 + * boundaries
68509 + */
68510 +int go_prev_unit(tap_t * tap)
68511 +{
68512 + return go_dir_el(tap, LEFT_SIDE, 1);
68513 +}
68514 +
68515 +/**
68516 + * @shift times apply @actor to the @tap. This is used to move @tap by
68517 + * @shift units (or items, or nodes) in either direction.
68518 + */
68519 +static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
68520 +{
68521 + int result;
68522 +
68523 + assert("nikita-2555", shift >= 0);
68524 + assert("nikita-2562", tap->coord->node == tap->lh->node);
68525 +
68526 + tap_check(tap);
68527 + result = reiser4_tap_load(tap);
68528 + if (result != 0)
68529 + return result;
68530 +
68531 + for (; shift > 0; --shift) {
68532 + result = actor(tap);
68533 + assert("nikita-2563", tap->coord->node == tap->lh->node);
68534 + if (result != 0)
68535 + break;
68536 + }
68537 + reiser4_tap_relse(tap);
68538 + tap_check(tap);
68539 + return result;
68540 +}
68541 +
68542 +/** move @tap @shift units rightward */
68543 +int rewind_right(tap_t * tap, int shift)
68544 +{
68545 + return rewind_to(tap, go_next_unit, shift);
68546 +}
68547 +
68548 +/** move @tap @shift units leftward */
68549 +int rewind_left(tap_t * tap, int shift)
68550 +{
68551 + return rewind_to(tap, go_prev_unit, shift);
68552 +}
68553 +
68554 +#if REISER4_DEBUG
68555 +/** debugging function: print @tap content in human readable form */
68556 +static void print_tap(const char *prefix, const tap_t * tap)
68557 +{
68558 + if (tap == NULL) {
68559 + printk("%s: null tap\n", prefix);
68560 + return;
68561 + }
68562 + printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
68563 + tap->loaded, (&tap->linkage == tap->linkage.next &&
68564 + &tap->linkage == tap->linkage.prev),
68565 + tap->lh->node,
68566 + lock_mode_name(tap->mode));
68567 + print_coord("\tcoord", tap->coord, 0);
68568 +}
68569 +
68570 +/** check [tap-sane] invariant */
68571 +static int tap_invariant(const tap_t * tap)
68572 +{
68573 + /* [tap-sane] invariant */
68574 +
68575 + if (tap == NULL)
68576 + return 1;
68577 + /* tap->mode is one of
68578 + *
68579 + * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
68580 + */
68581 + if (tap->mode != ZNODE_NO_LOCK &&
68582 + tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
68583 + return 2;
68584 + /* tap->coord != NULL, and */
68585 + if (tap->coord == NULL)
68586 + return 3;
68587 + /* tap->lh != NULL, and */
68588 + if (tap->lh == NULL)
68589 + return 4;
68590 + /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
68591 + if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
68592 + return 5;
68593 + /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
68594 + if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
68595 + return 6;
68596 + return 0;
68597 +}
68598 +
68599 +/** debugging function: check internal @tap consistency */
68600 +static void tap_check(const tap_t * tap)
68601 +{
68602 + int result;
68603 +
68604 + result = tap_invariant(tap);
68605 + if (result != 0) {
68606 + print_tap("broken", tap);
68607 + reiser4_panic("nikita-2831", "tap broken: %i\n", result);
68608 + }
68609 +}
68610 +#endif
68611 +
68612 +/* Make Linus happy.
68613 + Local variables:
68614 + c-indentation-style: "K&R"
68615 + mode-name: "LC"
68616 + c-basic-offset: 8
68617 + tab-width: 8
68618 + fill-column: 120
68619 + scroll-step: 1
68620 + End:
68621 +*/
68622 diff -urN linux-2.6.20.orig/fs/reiser4/tap.h linux-2.6.20/fs/reiser4/tap.h
68623 --- linux-2.6.20.orig/fs/reiser4/tap.h 1970-01-01 03:00:00.000000000 +0300
68624 +++ linux-2.6.20/fs/reiser4/tap.h 2007-05-06 14:50:43.879031967 +0400
68625 @@ -0,0 +1,70 @@
68626 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
68627 +
68628 +/* Tree Access Pointers. See tap.c for more details. */
68629 +
68630 +#if !defined( __REISER4_TAP_H__ )
68631 +#define __REISER4_TAP_H__
68632 +
68633 +#include "forward.h"
68634 +#include "readahead.h"
68635 +
68636 +/**
68637 + tree_access_pointer aka tap. Data structure combining coord_t and lock
68638 + handle.
68639 + Invariants involving this data-type, see doc/lock-ordering for details:
68640 +
68641 + [tap-sane]
68642 + */
68643 +struct tree_access_pointer {
68644 + /* coord tap is at */
68645 + coord_t *coord;
68646 + /* lock handle on ->coord->node */
68647 + lock_handle *lh;
68648 + /* mode of lock acquired by this tap */
68649 + znode_lock_mode mode;
68650 + /* incremented by reiser4_tap_load().
68651 + Decremented by reiser4_tap_relse(). */
68652 + int loaded;
68653 + /* list of taps */
68654 + struct list_head linkage;
68655 + /* read-ahead hint */
68656 + ra_info_t ra_info;
68657 +};
68658 +
68659 +typedef int (*go_actor_t) (tap_t * tap);
68660 +
68661 +extern int reiser4_tap_load(tap_t * tap);
68662 +extern void reiser4_tap_relse(tap_t * tap);
68663 +extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68664 + znode_lock_mode mode);
68665 +extern void reiser4_tap_monitor(tap_t * tap);
68666 +extern void reiser4_tap_copy(tap_t * dst, tap_t * src);
68667 +extern void reiser4_tap_done(tap_t * tap);
68668 +extern int reiser4_tap_move(tap_t * tap, lock_handle * target);
68669 +extern int tap_to_coord(tap_t * tap, coord_t * target);
68670 +
68671 +extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
68672 +extern int go_next_unit(tap_t * tap);
68673 +extern int go_prev_unit(tap_t * tap);
68674 +extern int rewind_right(tap_t * tap, int shift);
68675 +extern int rewind_left(tap_t * tap, int shift);
68676 +
68677 +extern struct list_head *reiser4_taps_list(void);
68678 +
68679 +#define for_all_taps(tap) \
68680 + for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
68681 + reiser4_taps_list() != &tap->linkage; \
68682 + tap = list_entry(tap->linkage.next, tap_t, linkage))
68683 +
68684 +/* __REISER4_TAP_H__ */
68685 +#endif
68686 +/* Make Linus happy.
68687 + Local variables:
68688 + c-indentation-style: "K&R"
68689 + mode-name: "LC"
68690 + c-basic-offset: 8
68691 + tab-width: 8
68692 + fill-column: 120
68693 + scroll-step: 1
68694 + End:
68695 +*/
68696 diff -urN linux-2.6.20.orig/fs/reiser4/tree.c linux-2.6.20/fs/reiser4/tree.c
68697 --- linux-2.6.20.orig/fs/reiser4/tree.c 1970-01-01 03:00:00.000000000 +0300
68698 +++ linux-2.6.20/fs/reiser4/tree.c 2007-05-06 14:50:43.883033217 +0400
68699 @@ -0,0 +1,1876 @@
68700 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68701 + * reiser4/README */
68702 +
68703 +/*
68704 + * KEYS IN A TREE.
68705 + *
68706 + * The tree consists of nodes located on the disk. Node in the tree is either
68707 + * formatted or unformatted. Formatted node is one that has structure
68708 + * understood by the tree balancing and traversal code. Formatted nodes are
68709 + * further classified into leaf and internal nodes. The latter distinction is
68710 + * (almost) of only historical importance: general structure of leaves and
68711 + * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
68712 + * that are part of bodies of ordinary files and attributes.
68713 + *
68714 + * Each node in the tree spans some interval in the key space. Key ranges for
68715 + * all nodes in the tree are disjoint. Actually, this only holds in some weak
68716 + * sense, because of the non-unique keys: intersection of key ranges for
68717 + * different nodes is either empty, or consists of exactly one key.
68718 + *
68719 + * Formatted node consists of a sequence of items. Each item spans some
68720 + * interval in key space. Key ranges for all items in a tree are disjoint,
68721 + * modulo non-unique keys again. Items within nodes are ordered in the key
68722 + * order of the smallest key in an item.
68723 + *
68724 + * Particular type of item can be further split into units. Unit is piece of
68725 + * item that can be cut from item and moved into another item of the same
68726 + * type. Units are used by balancing code to repack data during balancing.
68727 + *
68728 + * Unit can be further split into smaller entities (for example, extent unit
68729 + * represents several pages, and it is natural for extent code to operate on
68730 + * particular pages and even bytes within one unit), but this is of no
68731 + * relevance to the generic balancing and lookup code.
68732 + *
68733 + * Although item is said to "span" range or interval of keys, it is not
68734 + * necessary that item contains piece of data addressable by each and every
68735 + * key in this range. For example, compound directory item, consisting of
68736 + * units corresponding to directory entries and keyed by hashes of file names,
68737 + * looks more like having "discrete spectrum": only some disjoint keys inside
68738 + * range occupied by this item really address data.
68739 + *
68740 + * Nonetheless, each item always has well-defined least (minimal) key, that
68741 + * is recorded in item header, stored in the node this item is in. Also, item
68742 + * plugin can optionally define method ->max_key_inside() returning maximal
68743 + * key that can _possibly_ be located within this item. This method is used
68744 + * (mainly) to determine when given piece of data should be merged into
68745 + * existing item, instead of creating a new one. Because of this, even though
68746 + * ->max_key_inside() can be larger than any key actually located in the item,
68747 + * intervals
68748 + *
68749 + * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
68750 + *
68751 + * are still disjoint for all items within the _same_ node.
68752 + *
68753 + * In memory node is represented by znode. It plays several roles:
68754 + *
68755 + * . something locks are taken on
68756 + *
68757 + * . something tracked by transaction manager (this is going to change)
68758 + *
68759 + * . something used to access node data
68760 + *
68761 + * . something used to maintain tree structure in memory: sibling and
68762 + * parental linkage.
68763 + *
68764 + * . something used to organize nodes into "slums"
68765 + *
68766 + * More on znodes see in znode.[ch]
68767 + *
68768 + * DELIMITING KEYS
68769 + *
68770 + * To simplify balancing, allow some flexibility in locking and speed up
68771 + * important coord cache optimization, we keep delimiting keys of nodes in
68772 + * memory. Depending on disk format (implemented by appropriate node plugin)
68773 + * node on disk can record both left and right delimiting key, only one of
68774 + * them, or none. Still, our balancing and tree traversal code keep both
68775 + * delimiting keys for a node that is in memory stored in the znode. When
68776 + * node is first brought into memory during tree traversal, its left
68777 + * delimiting key is taken from its parent, and its right delimiting key is
68778 + * either next key in its parent, or is right delimiting key of parent if
68779 + * node is the rightmost child of parent.
68780 + *
68781 + * Physical consistency of delimiting key is protected by special dk
68782 + * read-write lock. That is, delimiting keys can only be inspected or
68783 + * modified under this lock. But dk lock is only sufficient for fast
68784 + * "pessimistic" check, because to simplify code and to decrease lock
68785 + * contention, balancing (carry) only updates delimiting keys right before
68786 + * unlocking all locked nodes on the given tree level. For example,
68787 + * coord-by-key cache scans LRU list of recently accessed znodes. For each
68788 + * node it first does fast check under dk spin lock. If key looked for is
68789 + * not between delimiting keys for this node, next node is inspected and so
68790 + * on. If key is inside of the key range, long term lock is taken on node
68791 + * and key range is rechecked.
68792 + *
68793 + * COORDINATES
68794 + *
68795 + * To find something in the tree, you supply a key, and the key is resolved
68796 + * by coord_by_key() into a coord (coordinate) that is valid as long as the
68797 + * node the coord points to remains locked. As mentioned above trees
68798 + * consist of nodes that consist of items that consist of units. A unit is
68799 + * the smallest and indivisible piece of tree as far as balancing and tree
68800 + * search are concerned. Each node, item, and unit can be addressed by
68801 + * giving its level in the tree and the key occupied by this entity. A node
68802 + * knows what the key ranges are of the items within it, and how to find its
68803 + * items and invoke their item handlers, but it does not know how to access
68804 + * individual units within its items except through the item handlers.
68805 + * coord is a structure containing a pointer to the node, the ordinal number
68806 + * of the item within this node (a sort of item offset), and the ordinal
68807 + * number of the unit within this item.
68808 + *
68809 + * TREE LOOKUP
68810 + *
68811 + * There are two types of access to the tree: lookup and modification.
68812 + *
68813 + * Lookup is a search for the key in the tree. Search can look for either
68814 + * exactly the key given to it, or for the largest key that is not greater
68815 + * than the key given to it. This distinction is determined by "bias"
68816 + * parameter of search routine (coord_by_key()). coord_by_key() either
68817 + * returns error (key is not in the tree, or some kind of external error
68818 + * occurred), or successfully resolves key into coord.
68819 + *
68820 + * This resolution is done by traversing tree top-to-bottom from root level
68821 + * to the desired level. On levels above twig level (level one above the
68822 + * leaf level) nodes consist exclusively of internal items. Internal item is
68823 + * nothing more than pointer to the tree node on the child level. On twig
68824 + * level nodes consist of internal items intermixed with extent
68825 + * items. Internal items form normal search tree structure used by traversal
68826 + * to descent through the tree.
68827 + *
68828 + * TREE LOOKUP OPTIMIZATIONS
68829 + *
68830 + * Tree lookup described above is expensive even if all nodes traversed are
68831 + * already in the memory: for each node binary search within it has to be
68832 + * performed and binary searches are CPU consuming and tend to destroy CPU
68833 + * caches.
68834 + *
68835 + * Several optimizations are used to work around this:
68836 + *
68837 + * . cbk_cache (look-aside cache for tree traversals, see search.c for
68838 + * details)
68839 + *
68840 + * . seals (see seal.[ch])
68841 + *
68842 + * . vroot (see search.c)
68843 + *
68844 + * General search-by-key is layered thusly:
68845 + *
68846 + * [check seal, if any] --ok--> done
68847 + * |
68848 + * failed
68849 + * |
68850 + * V
68851 + * [vroot defined] --no--> node = tree_root
68852 + * | |
68853 + * yes |
68854 + * | |
68855 + * V |
68856 + * node = vroot |
68857 + * | |
68858 + * | |
68859 + * | |
68860 + * V V
68861 + * [check cbk_cache for key] --ok--> done
68862 + * |
68863 + * failed
68864 + * |
68865 + * V
68866 + * [start tree traversal from node]
68867 + *
68868 + */
68869 +
68870 +#include "forward.h"
68871 +#include "debug.h"
68872 +#include "dformat.h"
68873 +#include "key.h"
68874 +#include "coord.h"
68875 +#include "plugin/item/static_stat.h"
68876 +#include "plugin/item/item.h"
68877 +#include "plugin/node/node.h"
68878 +#include "plugin/plugin.h"
68879 +#include "txnmgr.h"
68880 +#include "jnode.h"
68881 +#include "znode.h"
68882 +#include "block_alloc.h"
68883 +#include "tree_walk.h"
68884 +#include "carry.h"
68885 +#include "carry_ops.h"
68886 +#include "tap.h"
68887 +#include "tree.h"
68888 +#include "vfs_ops.h"
68889 +#include "page_cache.h"
68890 +#include "super.h"
68891 +#include "reiser4.h"
68892 +#include "inode.h"
68893 +
68894 +#include <linux/fs.h> /* for struct super_block */
68895 +#include <linux/spinlock.h>
68896 +
68897 +/* Disk address (block number) never ever used for any real tree node. This is
68898 + used as block number of "uber" znode.
68899 +
68900 + Invalid block addresses are 0 by tradition.
68901 +
68902 +*/
68903 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
68904 +
68905 +#define CUT_TREE_MIN_ITERATIONS 64
68906 +
68907 +static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
68908 +
68909 +/* return node plugin of coord->node */
68910 +node_plugin *node_plugin_by_coord(const coord_t * coord)
68911 +{
68912 + assert("vs-1", coord != NULL);
68913 + assert("vs-2", coord->node != NULL);
68914 +
68915 + return coord->node->nplug;
68916 +}
68917 +
68918 +/* insert item into tree. Fields of @coord are updated so that they can be
68919 + * used by consequent insert operation. */
68920 +insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
68921 + * into */ ,
68922 + const reiser4_key * key /* key of new item */ ,
68923 + reiser4_item_data * data /* parameters for item
68924 + * creation */ ,
68925 + coord_t * coord /* resulting insertion coord */ ,
68926 + lock_handle * lh /* resulting lock
68927 + * handle */ ,
68928 + tree_level stop_level /** level where to insert */ ,
68929 + __u32 flags /* insertion flags */ )
68930 +{
68931 + int result;
68932 +
68933 + assert("nikita-358", tree != NULL);
68934 + assert("nikita-360", coord != NULL);
68935 +
68936 + result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
68937 + FIND_EXACT, stop_level, stop_level,
68938 + flags | CBK_FOR_INSERT, NULL /*ra_info */ );
68939 + switch (result) {
68940 + default:
68941 + break;
68942 + case CBK_COORD_FOUND:
68943 + result = IBK_ALREADY_EXISTS;
68944 + break;
68945 + case CBK_COORD_NOTFOUND:
68946 + assert("nikita-2017", coord->node != NULL);
68947 + result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
68948 + break;
68949 + }
68950 + return result;
68951 +}
68952 +
68953 +/* insert item by calling carry. Helper function called if short-cut
68954 + insertion failed */
68955 +static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
68956 + lock_handle * lh, /* lock handle of insertion
68957 + * node */
68958 + reiser4_item_data * data, /* parameters of new
68959 + * item */
68960 + const reiser4_key * key, /* key of new item */
68961 + carry_opcode cop, /* carry operation to perform */
68962 + cop_insert_flag flags
68963 + /* carry flags */ )
68964 +{
68965 + int result;
68966 + carry_pool *pool;
68967 + carry_level *lowest_level;
68968 + carry_insert_data *cdata;
68969 + carry_op *op;
68970 +
68971 + assert("umka-314", coord != NULL);
68972 +
68973 + /* allocate carry_pool and 3 carry_level-s */
68974 + pool =
68975 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68976 + sizeof(*cdata));
68977 + if (IS_ERR(pool))
68978 + return PTR_ERR(pool);
68979 + lowest_level = (carry_level *) (pool + 1);
68980 + init_carry_level(lowest_level, pool);
68981 +
68982 + op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
68983 + if (IS_ERR(op) || (op == NULL)) {
68984 + done_carry_pool(pool);
68985 + return RETERR(op ? PTR_ERR(op) : -EIO);
68986 + }
68987 + cdata = (carry_insert_data *) (lowest_level + 3);
68988 + cdata->coord = coord;
68989 + cdata->data = data;
68990 + cdata->key = key;
68991 + op->u.insert.d = cdata;
68992 + if (flags == 0)
68993 + flags = znode_get_tree(coord->node)->carry.insert_flags;
68994 + op->u.insert.flags = flags;
68995 + op->u.insert.type = COPT_ITEM_DATA;
68996 + op->u.insert.child = NULL;
68997 + if (lh != NULL) {
68998 + assert("nikita-3245", lh->node == coord->node);
68999 + lowest_level->track_type = CARRY_TRACK_CHANGE;
69000 + lowest_level->tracked = lh;
69001 + }
69002 +
69003 + result = reiser4_carry(lowest_level, NULL);
69004 + done_carry_pool(pool);
69005 +
69006 + return result;
69007 +}
69008 +
69009 +/* form carry queue to perform paste of @data with @key at @coord, and launch
69010 + its execution by calling carry().
69011 +
69012 + Instruct carry to update @lh it after balancing insertion coord moves into
69013 + different block.
69014 +
69015 +*/
69016 +static int paste_with_carry(coord_t * coord, /* coord of paste */
69017 + lock_handle * lh, /* lock handle of node
69018 + * where item is
69019 + * pasted */
69020 + reiser4_item_data * data, /* parameters of new
69021 + * item */
69022 + const reiser4_key * key, /* key of new item */
69023 + unsigned flags /* paste flags */ )
69024 +{
69025 + int result;
69026 + carry_pool *pool;
69027 + carry_level *lowest_level;
69028 + carry_insert_data *cdata;
69029 + carry_op *op;
69030 +
69031 + assert("umka-315", coord != NULL);
69032 + assert("umka-316", key != NULL);
69033 +
69034 + pool =
69035 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69036 + sizeof(*cdata));
69037 + if (IS_ERR(pool))
69038 + return PTR_ERR(pool);
69039 + lowest_level = (carry_level *) (pool + 1);
69040 + init_carry_level(lowest_level, pool);
69041 +
69042 + op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
69043 + if (IS_ERR(op) || (op == NULL)) {
69044 + done_carry_pool(pool);
69045 + return RETERR(op ? PTR_ERR(op) : -EIO);
69046 + }
69047 + cdata = (carry_insert_data *) (lowest_level + 3);
69048 + cdata->coord = coord;
69049 + cdata->data = data;
69050 + cdata->key = key;
69051 + op->u.paste.d = cdata;
69052 + if (flags == 0)
69053 + flags = znode_get_tree(coord->node)->carry.paste_flags;
69054 + op->u.paste.flags = flags;
69055 + op->u.paste.type = COPT_ITEM_DATA;
69056 + if (lh != NULL) {
69057 + lowest_level->track_type = CARRY_TRACK_CHANGE;
69058 + lowest_level->tracked = lh;
69059 + }
69060 +
69061 + result = reiser4_carry(lowest_level, NULL);
69062 + done_carry_pool(pool);
69063 +
69064 + return result;
69065 +}
69066 +
69067 +/* insert item at the given coord.
69068 +
69069 + First try to skip carry by directly calling ->create_item() method of node
69070 + plugin. If this is impossible (there is not enough free space in the node,
69071 + or leftmost item in the node is created), call insert_with_carry_by_coord()
69072 + that will do full carry().
69073 +
69074 +*/
69075 +insert_result insert_by_coord(coord_t * coord /* coord where to
69076 + * insert. coord->node has
69077 + * to be write locked by
69078 + * caller */ ,
69079 + reiser4_item_data * data /* data to be
69080 + * inserted */ ,
69081 + const reiser4_key * key /* key of new item */ ,
69082 + lock_handle * lh /* lock handle of write
69083 + * lock on node */ ,
69084 + __u32 flags /* insertion flags */ )
69085 +{
69086 + unsigned item_size;
69087 + int result;
69088 + znode *node;
69089 +
69090 + assert("vs-247", coord != NULL);
69091 + assert("vs-248", data != NULL);
69092 + assert("vs-249", data->length >= 0);
69093 + assert("nikita-1191", znode_is_write_locked(coord->node));
69094 +
69095 + node = coord->node;
69096 + coord_clear_iplug(coord);
69097 + result = zload(node);
69098 + if (result != 0)
69099 + return result;
69100 +
69101 + item_size = space_needed(node, NULL, data, 1);
69102 + if (item_size > znode_free_space(node) &&
69103 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69104 + && (flags & COPI_DONT_ALLOCATE)) {
69105 + /* we are forced to use free space of coord->node and new item
69106 + does not fit into it.
69107 +
69108 + Currently we get here only when we allocate and copy units
69109 + of extent item from a node to its left neighbor during
69110 + "squalloc"-ing. If @node (this is left neighbor) does not
69111 + have enough free space - we do not want to attempt any
69112 + shifting and allocations because we are in squeezing and
69113 + everything to the left of @node is tightly packed.
69114 + */
69115 + result = -E_NODE_FULL;
69116 + } else if ((item_size <= znode_free_space(node)) &&
69117 + !coord_is_before_leftmost(coord) &&
69118 + (node_plugin_by_node(node)->fast_insert != NULL)
69119 + && node_plugin_by_node(node)->fast_insert(coord)) {
69120 + /* shortcut insertion without carry() overhead.
69121 +
69122 + Only possible if:
69123 +
69124 + - there is enough free space
69125 +
69126 + - insertion is not into the leftmost position in a node
69127 + (otherwise it would require updating of delimiting key in a
69128 + parent)
69129 +
69130 + - node plugin agrees with this
69131 +
69132 + */
69133 + result =
69134 + node_plugin_by_node(node)->create_item(coord, key, data,
69135 + NULL);
69136 + znode_make_dirty(node);
69137 + } else {
69138 + /* otherwise do full-fledged carry(). */
69139 + result =
69140 + insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
69141 + flags);
69142 + }
69143 + zrelse(node);
69144 + return result;
69145 +}
69146 +
69147 +/* @coord is set to leaf level and @data is to be inserted to twig level */
69148 +insert_result
69149 +insert_extent_by_coord(coord_t *
69150 + coord
69151 + /* coord where to insert. coord->node * has to be write * locked by caller */
69152 + ,
69153 + reiser4_item_data * data /* data to be inserted */ ,
69154 + const reiser4_key * key /* key of new item */ ,
69155 + lock_handle *
69156 + lh /* lock handle of write lock on * node */ )
69157 +{
69158 + assert("vs-405", coord != NULL);
69159 + assert("vs-406", data != NULL);
69160 + assert("vs-407", data->length > 0);
69161 + assert("vs-408", znode_is_write_locked(coord->node));
69162 + assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
69163 +
69164 + return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
69165 + 0 /*flags */ );
69166 +}
69167 +
69168 +/* Insert into the item at the given coord.
69169 +
69170 + First try to skip carry by directly calling ->paste() method of item
69171 + plugin. If this is impossible (there is not enough free space in the node,
69172 + or we are pasting into leftmost position in the node), call
69173 + paste_with_carry() that will do full carry().
69174 +
69175 +*/
69176 +/* paste_into_item */
69177 +int insert_into_item(coord_t * coord /* coord of pasting */ ,
69178 + lock_handle * lh /* lock handle on node involved */ ,
69179 + const reiser4_key * key /* key of unit being pasted */ ,
69180 + reiser4_item_data * data /* parameters for new unit */ ,
69181 + unsigned flags /* insert/paste flags */ )
69182 +{
69183 + int result;
69184 + int size_change;
69185 + node_plugin *nplug;
69186 + item_plugin *iplug;
69187 +
69188 + assert("umka-317", coord != NULL);
69189 + assert("umka-318", key != NULL);
69190 +
69191 + iplug = item_plugin_by_coord(coord);
69192 + nplug = node_plugin_by_coord(coord);
69193 +
69194 + assert("nikita-1480", iplug == data->iplug);
69195 +
69196 + size_change = space_needed(coord->node, coord, data, 0);
69197 + if (size_change > (int)znode_free_space(coord->node) &&
69198 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69199 + && (flags & COPI_DONT_ALLOCATE)) {
69200 + /* we are forced to use free space of coord->node and new data
69201 + does not fit into it. */
69202 + return -E_NODE_FULL;
69203 + }
69204 +
69205 + /* shortcut paste without carry() overhead.
69206 +
69207 + Only possible if:
69208 +
69209 + - there is enough free space
69210 +
69211 + - paste is not into the leftmost unit in a node (otherwise
69212 + it would require updating of delimiting key in a parent)
69213 +
69214 + - node plugin agrees with this
69215 +
69216 + - item plugin agrees with us
69217 + */
69218 + if (size_change <= (int)znode_free_space(coord->node) &&
69219 + (coord->item_pos != 0 ||
69220 + coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
69221 + coord->unit_pos != 0 && nplug->fast_paste != NULL &&
69222 + nplug->fast_paste(coord) &&
69223 + iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
69224 + if (size_change > 0)
69225 + nplug->change_item_size(coord, size_change);
69226 + /* NOTE-NIKITA: huh? where @key is used? */
69227 + result = iplug->b.paste(coord, data, NULL);
69228 + if (size_change < 0)
69229 + nplug->change_item_size(coord, size_change);
69230 + znode_make_dirty(coord->node);
69231 + } else
69232 + /* otherwise do full-fledged carry(). */
69233 + result = paste_with_carry(coord, lh, data, key, flags);
69234 + return result;
69235 +}
69236 +
69237 +/* this either appends or truncates item @coord */
69238 +int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
69239 + reiser4_item_data * data /* parameters of resize */ ,
69240 + reiser4_key * key /* key of new unit */ ,
69241 + lock_handle * lh /* lock handle of node
69242 + * being modified */ ,
69243 + cop_insert_flag flags /* carry flags */ )
69244 +{
69245 + int result;
69246 + znode *node;
69247 +
69248 + assert("nikita-362", coord != NULL);
69249 + assert("nikita-363", data != NULL);
69250 + assert("vs-245", data->length != 0);
69251 +
69252 + node = coord->node;
69253 + coord_clear_iplug(coord);
69254 + result = zload(node);
69255 + if (result != 0)
69256 + return result;
69257 +
69258 + if (data->length < 0)
69259 + result = node_plugin_by_coord(coord)->shrink_item(coord,
69260 + -data->length);
69261 + else
69262 + result = insert_into_item(coord, lh, key, data, flags);
69263 +
69264 + zrelse(node);
69265 + return result;
69266 +}
69267 +
69268 +/* insert flow @f */
69269 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
69270 +{
69271 + int result;
69272 + carry_pool *pool;
69273 + carry_level *lowest_level;
69274 + reiser4_item_data *data;
69275 + carry_op *op;
69276 +
69277 + pool =
69278 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69279 + sizeof(*data));
69280 + if (IS_ERR(pool))
69281 + return PTR_ERR(pool);
69282 + lowest_level = (carry_level *) (pool + 1);
69283 + init_carry_level(lowest_level, pool);
69284 +
69285 + op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
69286 + 0 /* operate directly on coord -> node */ );
69287 + if (IS_ERR(op) || (op == NULL)) {
69288 + done_carry_pool(pool);
69289 + return RETERR(op ? PTR_ERR(op) : -EIO);
69290 + }
69291 +
69292 + /* these are permanent during insert_flow */
69293 + data = (reiser4_item_data *) (lowest_level + 3);
69294 + data->user = 1;
69295 + data->iplug = item_plugin_by_id(FORMATTING_ID);
69296 + data->arg = NULL;
69297 + /* data.length and data.data will be set before calling paste or
69298 + insert */
69299 + data->length = 0;
69300 + data->data = NULL;
69301 +
69302 + op->u.insert_flow.flags = 0;
69303 + op->u.insert_flow.insert_point = coord;
69304 + op->u.insert_flow.flow = f;
69305 + op->u.insert_flow.data = data;
69306 + op->u.insert_flow.new_nodes = 0;
69307 +
69308 + lowest_level->track_type = CARRY_TRACK_CHANGE;
69309 + lowest_level->tracked = lh;
69310 +
69311 + result = reiser4_carry(lowest_level, NULL);
69312 + done_carry_pool(pool);
69313 +
69314 + return result;
69315 +}
69316 +
69317 +/* Given a coord in parent node, obtain a znode for the corresponding child */
69318 +znode *child_znode(const coord_t * parent_coord /* coord of pointer to
69319 + * child */ ,
69320 + znode * parent /* parent of child */ ,
69321 + int incore_p /* if !0 only return child if already in
69322 + * memory */ ,
69323 + int setup_dkeys_p /* if !0 update delimiting keys of
69324 + * child */ )
69325 +{
69326 + znode *child;
69327 +
69328 + assert("nikita-1374", parent_coord != NULL);
69329 + assert("nikita-1482", parent != NULL);
69330 +#if REISER4_DEBUG
69331 + if (setup_dkeys_p)
69332 + assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
69333 +#endif
69334 + assert("nikita-2947", znode_is_any_locked(parent));
69335 +
69336 + if (znode_get_level(parent) <= LEAF_LEVEL) {
69337 + /* trying to get child of leaf node */
69338 + warning("nikita-1217", "Child of maize?");
69339 + return ERR_PTR(RETERR(-EIO));
69340 + }
69341 + if (item_is_internal(parent_coord)) {
69342 + reiser4_block_nr addr;
69343 + item_plugin *iplug;
69344 + reiser4_tree *tree;
69345 +
69346 + iplug = item_plugin_by_coord(parent_coord);
69347 + assert("vs-512", iplug->s.internal.down_link);
69348 + iplug->s.internal.down_link(parent_coord, NULL, &addr);
69349 +
69350 + tree = znode_get_tree(parent);
69351 + if (incore_p)
69352 + child = zlook(tree, &addr);
69353 + else
69354 + child =
69355 + zget(tree, &addr, parent,
69356 + znode_get_level(parent) - 1,
69357 + reiser4_ctx_gfp_mask_get());
69358 + if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
69359 + set_child_delimiting_keys(parent, parent_coord, child);
69360 + } else {
69361 + warning("nikita-1483", "Internal item expected");
69362 + child = ERR_PTR(RETERR(-EIO));
69363 + }
69364 + return child;
69365 +}
69366 +
69367 +/* remove znode from transaction */
69368 +static void uncapture_znode(znode * node)
69369 +{
69370 + struct page *page;
69371 +
69372 + assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69373 +
69374 + if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
69375 + int ret;
69376 +
69377 + /* An already allocated block goes right to the atom's delete set. */
69378 + ret =
69379 + reiser4_dealloc_block(znode_get_block(node), 0,
69380 + BA_DEFER | BA_FORMATTED);
69381 + if (ret)
69382 + warning("zam-942",
69383 + "can\'t add a block (%llu) number to atom's delete set\n",
69384 + (unsigned long long)(*znode_get_block(node)));
69385 +
69386 + spin_lock_znode(node);
69387 + /* Here we return flush reserved block which was reserved at the
69388 + * moment when this allocated node was marked dirty and still
69389 + * not used by flush in node relocation procedure. */
69390 + if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
69391 + txn_atom *atom;
69392 +
69393 + atom = jnode_get_atom(ZJNODE(node));
69394 + assert("zam-939", atom != NULL);
69395 + spin_unlock_znode(node);
69396 + flush_reserved2grabbed(atom, (__u64) 1);
69397 + spin_unlock_atom(atom);
69398 + } else
69399 + spin_unlock_znode(node);
69400 + } else {
69401 + /* znode has assigned block which is counted as "fake
69402 + allocated". Return it back to "free blocks") */
69403 + fake_allocated2free((__u64) 1, BA_FORMATTED);
69404 + }
69405 +
69406 + /*
69407 + * uncapture page from transaction. There is a possibility of a race
69408 + * with ->releasepage(): reiser4_releasepage() detaches page from this
69409 + * jnode and we have nothing to uncapture. To avoid this, get
69410 + * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
69411 + * will deal with released page itself.
69412 + */
69413 + spin_lock_znode(node);
69414 + page = znode_page(node);
69415 + if (likely(page != NULL)) {
69416 + /*
69417 + * reiser4_uncapture_page() can only be called when we are sure
69418 + * that znode is pinned in memory, which we are, because
69419 + * forget_znode() is only called from longterm_unlock_znode().
69420 + */
69421 + page_cache_get(page);
69422 + spin_unlock_znode(node);
69423 + lock_page(page);
69424 + reiser4_uncapture_page(page);
69425 + unlock_page(page);
69426 + page_cache_release(page);
69427 + } else {
69428 + txn_atom *atom;
69429 +
69430 + /* handle "flush queued" znodes */
69431 + while (1) {
69432 + atom = jnode_get_atom(ZJNODE(node));
69433 + assert("zam-943", atom != NULL);
69434 +
69435 + if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
69436 + || !atom->nr_running_queues)
69437 + break;
69438 +
69439 + spin_unlock_znode(node);
69440 + reiser4_atom_wait_event(atom);
69441 + spin_lock_znode(node);
69442 + }
69443 +
69444 + reiser4_uncapture_block(ZJNODE(node));
69445 + spin_unlock_atom(atom);
69446 + zput(node);
69447 + }
69448 +}
69449 +
69450 +/* This is called from longterm_unlock_znode() when last lock is released from
69451 + the node that has been removed from the tree. At this point node is removed
69452 + from sibling list and its lock is invalidated. */
69453 +void forget_znode(lock_handle * handle)
69454 +{
69455 + znode *node;
69456 + reiser4_tree *tree;
69457 +
69458 + assert("umka-319", handle != NULL);
69459 +
69460 + node = handle->node;
69461 + tree = znode_get_tree(node);
69462 +
69463 + assert("vs-164", znode_is_write_locked(node));
69464 + assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69465 + assert_rw_locked(&(node->lock.guard));
69466 +
69467 + /* We assume that this node was detached from its parent before
69468 + * unlocking, it gives no way to reach this node from parent through a
69469 + * down link. The node should have no children and, thereby, can't be
69470 + * reached from them by their parent pointers. The only way to obtain a
69471 + * reference to the node is to use sibling pointers from its left and
69472 + * right neighbors. In the next several lines we remove the node from
69473 + * the sibling list. */
69474 +
69475 + write_lock_tree(tree);
69476 + sibling_list_remove(node);
69477 + znode_remove(node, tree);
69478 + write_unlock_tree(tree);
69479 +
69480 + /* Here we set JNODE_DYING and cancel all pending lock requests. It
69481 + * forces all lock requestor threads to repeat iterations of getting
69482 + * lock on a child, neighbor or parent node. But, those threads can't
69483 + * come to this node again, because this node is no longer a child,
69484 + * neighbor or parent of any other node. This order of znode
69485 + * invalidation does not allow other threads to waste cpu time is a busy
69486 + * loop, trying to lock dying object. The exception is in the flush
69487 + * code when we take node directly from atom's capture list.*/
69488 + reiser4_invalidate_lock(handle);
69489 + uncapture_znode(node);
69490 +}
69491 +
69492 +/* Check that internal item at @pointer really contains pointer to @child. */
69493 +int check_tree_pointer(const coord_t * pointer /* would-be pointer to
69494 + * @child */ ,
69495 + const znode * child /* child znode */ )
69496 +{
69497 + assert("nikita-1016", pointer != NULL);
69498 + assert("nikita-1017", child != NULL);
69499 + assert("nikita-1018", pointer->node != NULL);
69500 +
69501 + assert("nikita-1325", znode_is_any_locked(pointer->node));
69502 +
69503 + assert("nikita-2985",
69504 + znode_get_level(pointer->node) == znode_get_level(child) + 1);
69505 +
69506 + coord_clear_iplug((coord_t *) pointer);
69507 +
69508 + if (coord_is_existing_unit(pointer)) {
69509 + item_plugin *iplug;
69510 + reiser4_block_nr addr;
69511 +
69512 + if (item_is_internal(pointer)) {
69513 + iplug = item_plugin_by_coord(pointer);
69514 + assert("vs-513", iplug->s.internal.down_link);
69515 + iplug->s.internal.down_link(pointer, NULL, &addr);
69516 + /* check that cached value is correct */
69517 + if (disk_addr_eq(&addr, znode_get_block(child))) {
69518 + return NS_FOUND;
69519 + }
69520 + }
69521 + }
69522 + /* warning ("jmacd-1002", "tree pointer incorrect"); */
69523 + return NS_NOT_FOUND;
69524 +}
69525 +
69526 +/* find coord of pointer to new @child in @parent.
69527 +
69528 + Find the &coord_t in the @parent where pointer to a given @child will
69529 + be in.
69530 +
69531 +*/
69532 +int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
69533 + znode *
69534 + child UNUSED_ARG /* child znode, passed locked */ ,
69535 + znode * left /* left brother of new node */ ,
69536 + coord_t * result /* where result is stored in */ )
69537 +{
69538 + int ret;
69539 +
69540 + assert("nikita-1486", parent != NULL);
69541 + assert("nikita-1487", child != NULL);
69542 + assert("nikita-1488", result != NULL);
69543 +
69544 + ret = find_child_ptr(parent, left, result);
69545 + if (ret != NS_FOUND) {
69546 + warning("nikita-1489", "Cannot find brother position: %i", ret);
69547 + return RETERR(-EIO);
69548 + } else {
69549 + result->between = AFTER_UNIT;
69550 + return RETERR(NS_NOT_FOUND);
69551 + }
69552 +}
69553 +
69554 +/* find coord of pointer to @child in @parent.
69555 +
69556 + Find the &coord_t in the @parent where pointer to a given @child is in.
69557 +
69558 +*/
69559 +int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
69560 + znode * child /* child znode, passed locked */ ,
69561 + coord_t * result /* where result is stored in */ )
69562 +{
69563 + int lookup_res;
69564 + node_plugin *nplug;
69565 + /* left delimiting key of a child */
69566 + reiser4_key ld;
69567 + reiser4_tree *tree;
69568 +
69569 + assert("nikita-934", parent != NULL);
69570 + assert("nikita-935", child != NULL);
69571 + assert("nikita-936", result != NULL);
69572 + assert("zam-356", znode_is_loaded(parent));
69573 +
69574 + coord_init_zero(result);
69575 + result->node = parent;
69576 +
69577 + nplug = parent->nplug;
69578 + assert("nikita-939", nplug != NULL);
69579 +
69580 + tree = znode_get_tree(parent);
69581 + /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
69582 + * not aliased to ->in_parent of some znode. Otherwise,
69583 + * parent_coord_to_coord() below would modify data protected by tree
69584 + * lock. */
69585 + read_lock_tree(tree);
69586 + /* fast path. Try to use cached value. Lock tree to keep
69587 + node->pos_in_parent and pos->*_blocknr consistent. */
69588 + if (child->in_parent.item_pos + 1 != 0) {
69589 + parent_coord_to_coord(&child->in_parent, result);
69590 + if (check_tree_pointer(result, child) == NS_FOUND) {
69591 + read_unlock_tree(tree);
69592 + return NS_FOUND;
69593 + }
69594 +
69595 + child->in_parent.item_pos = (unsigned short)~0;
69596 + }
69597 + read_unlock_tree(tree);
69598 +
69599 + /* is above failed, find some key from @child. We are looking for the
69600 + least key in a child. */
69601 + read_lock_dk(tree);
69602 + ld = *znode_get_ld_key(child);
69603 + read_unlock_dk(tree);
69604 + /*
69605 + * now, lookup parent with key just found. Note, that left delimiting
69606 + * key doesn't identify node uniquely, because (in extremely rare
69607 + * case) two nodes can have equal left delimiting keys, if one of them
69608 + * is completely filled with directory entries that all happened to be
69609 + * hash collision. But, we check block number in check_tree_pointer()
69610 + * and, so, are safe.
69611 + */
69612 + lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
69613 + /* update cached pos_in_node */
69614 + if (lookup_res == NS_FOUND) {
69615 + write_lock_tree(tree);
69616 + coord_to_parent_coord(result, &child->in_parent);
69617 + write_unlock_tree(tree);
69618 + lookup_res = check_tree_pointer(result, child);
69619 + }
69620 + if (lookup_res == NS_NOT_FOUND)
69621 + lookup_res = find_child_by_addr(parent, child, result);
69622 + return lookup_res;
69623 +}
69624 +
69625 +/* find coord of pointer to @child in @parent by scanning
69626 +
69627 + Find the &coord_t in the @parent where pointer to a given @child
69628 + is in by scanning all internal items in @parent and comparing block
69629 + numbers in them with that of @child.
69630 +
69631 +*/
69632 +static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
69633 + znode * child /* child znode, passed locked */ ,
69634 + coord_t * result /* where result is stored in */ )
69635 +{
69636 + int ret;
69637 +
69638 + assert("nikita-1320", parent != NULL);
69639 + assert("nikita-1321", child != NULL);
69640 + assert("nikita-1322", result != NULL);
69641 +
69642 + ret = NS_NOT_FOUND;
69643 +
69644 + for_all_units(result, parent) {
69645 + if (check_tree_pointer(result, child) == NS_FOUND) {
69646 + write_lock_tree(znode_get_tree(parent));
69647 + coord_to_parent_coord(result, &child->in_parent);
69648 + write_unlock_tree(znode_get_tree(parent));
69649 + ret = NS_FOUND;
69650 + break;
69651 + }
69652 + }
69653 + return ret;
69654 +}
69655 +
69656 +/* true, if @addr is "unallocated block number", which is just address, with
69657 + highest bit set. */
69658 +int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
69659 + * check */ )
69660 +{
69661 + assert("nikita-1766", addr != NULL);
69662 + cassert(sizeof(reiser4_block_nr) == 8);
69663 + return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
69664 + REISER4_UNALLOCATED_STATUS_VALUE;
69665 +}
69666 +
69667 +/* returns true if removing bytes of given range of key [from_key, to_key]
69668 + causes removing of whole item @from */
69669 +static int
69670 +item_removed_completely(coord_t * from, const reiser4_key * from_key,
69671 + const reiser4_key * to_key)
69672 +{
69673 + item_plugin *iplug;
69674 + reiser4_key key_in_item;
69675 +
69676 + assert("umka-325", from != NULL);
69677 + assert("", item_is_extent(from));
69678 +
69679 + /* check first key just for case */
69680 + item_key_by_coord(from, &key_in_item);
69681 + if (keygt(from_key, &key_in_item))
69682 + return 0;
69683 +
69684 + /* check last key */
69685 + iplug = item_plugin_by_coord(from);
69686 + assert("vs-611", iplug && iplug->s.file.append_key);
69687 +
69688 + iplug->s.file.append_key(from, &key_in_item);
69689 + set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
69690 +
69691 + if (keylt(to_key, &key_in_item))
69692 + /* last byte is not removed */
69693 + return 0;
69694 + return 1;
69695 +}
69696 +
69697 +/* helper function for prepare_twig_kill(): @left and @right are formatted
69698 + * neighbors of extent item being completely removed. Load and lock neighbors
69699 + * and store lock handles into @cdata for later use by kill_hook_extent() */
69700 +static int
69701 +prepare_children(znode * left, znode * right, carry_kill_data * kdata)
69702 +{
69703 + int result;
69704 + int left_loaded;
69705 + int right_loaded;
69706 +
69707 + result = 0;
69708 + left_loaded = right_loaded = 0;
69709 +
69710 + if (left != NULL) {
69711 + result = zload(left);
69712 + if (result == 0) {
69713 + left_loaded = 1;
69714 + result = longterm_lock_znode(kdata->left, left,
69715 + ZNODE_READ_LOCK,
69716 + ZNODE_LOCK_LOPRI);
69717 + }
69718 + }
69719 + if (result == 0 && right != NULL) {
69720 + result = zload(right);
69721 + if (result == 0) {
69722 + right_loaded = 1;
69723 + result = longterm_lock_znode(kdata->right, right,
69724 + ZNODE_READ_LOCK,
69725 + ZNODE_LOCK_HIPRI |
69726 + ZNODE_LOCK_NONBLOCK);
69727 + }
69728 + }
69729 + if (result != 0) {
69730 + done_lh(kdata->left);
69731 + done_lh(kdata->right);
69732 + if (left_loaded != 0)
69733 + zrelse(left);
69734 + if (right_loaded != 0)
69735 + zrelse(right);
69736 + }
69737 + return result;
69738 +}
69739 +
69740 +static void done_children(carry_kill_data * kdata)
69741 +{
69742 + if (kdata->left != NULL && kdata->left->node != NULL) {
69743 + zrelse(kdata->left->node);
69744 + done_lh(kdata->left);
69745 + }
69746 + if (kdata->right != NULL && kdata->right->node != NULL) {
69747 + zrelse(kdata->right->node);
69748 + done_lh(kdata->right);
69749 + }
69750 +}
69751 +
69752 +/* part of cut_node. It is called when cut_node is called to remove or cut part
69753 + of extent item. When head of that item is removed - we have to update right
69754 + delimiting of left neighbor of extent. When item is removed completely - we
69755 + have to set sibling link between left and right neighbor of removed
69756 + extent. This may return -E_DEADLOCK because of trying to get left neighbor
69757 + locked. So, caller should repeat an attempt
69758 +*/
69759 +/* Audited by: umka (2002.06.16) */
69760 +static int
69761 +prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
69762 +{
69763 + int result;
69764 + reiser4_key key;
69765 + lock_handle left_lh;
69766 + lock_handle right_lh;
69767 + coord_t left_coord;
69768 + coord_t *from;
69769 + znode *left_child;
69770 + znode *right_child;
69771 + reiser4_tree *tree;
69772 + int left_zloaded_here, right_zloaded_here;
69773 +
69774 + from = kdata->params.from;
69775 + assert("umka-326", from != NULL);
69776 + assert("umka-327", kdata->params.to != NULL);
69777 +
69778 + /* for one extent item only yet */
69779 + assert("vs-591", item_is_extent(from));
69780 + assert("vs-592", from->item_pos == kdata->params.to->item_pos);
69781 +
69782 + if ((kdata->params.from_key
69783 + && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
69784 + || from->unit_pos != 0) {
69785 + /* head of item @from is not removed, there is nothing to
69786 + worry about */
69787 + return 0;
69788 + }
69789 +
69790 + result = 0;
69791 + left_zloaded_here = 0;
69792 + right_zloaded_here = 0;
69793 +
69794 + left_child = right_child = NULL;
69795 +
69796 + coord_dup(&left_coord, from);
69797 + init_lh(&left_lh);
69798 + init_lh(&right_lh);
69799 + if (coord_prev_unit(&left_coord)) {
69800 + /* @from is leftmost item in its node */
69801 + if (!locked_left_neighbor) {
69802 + result =
69803 + reiser4_get_left_neighbor(&left_lh, from->node,
69804 + ZNODE_READ_LOCK,
69805 + GN_CAN_USE_UPPER_LEVELS);
69806 + switch (result) {
69807 + case 0:
69808 + break;
69809 + case -E_NO_NEIGHBOR:
69810 + /* there is no formatted node to the left of
69811 + from->node */
69812 + warning("vs-605",
69813 + "extent item has smallest key in "
69814 + "the tree and it is about to be removed");
69815 + return 0;
69816 + case -E_DEADLOCK:
69817 + /* need to restart */
69818 + default:
69819 + return result;
69820 + }
69821 +
69822 + /* we have acquired left neighbor of from->node */
69823 + result = zload(left_lh.node);
69824 + if (result)
69825 + goto done;
69826 +
69827 + locked_left_neighbor = left_lh.node;
69828 + } else {
69829 + /* squalloc_right_twig_cut should have supplied locked
69830 + * left neighbor */
69831 + assert("vs-834",
69832 + znode_is_write_locked(locked_left_neighbor));
69833 + result = zload(locked_left_neighbor);
69834 + if (result)
69835 + return result;
69836 + }
69837 +
69838 + left_zloaded_here = 1;
69839 + coord_init_last_unit(&left_coord, locked_left_neighbor);
69840 + }
69841 +
69842 + if (!item_is_internal(&left_coord)) {
69843 + /* what else but extent can be on twig level */
69844 + assert("vs-606", item_is_extent(&left_coord));
69845 +
69846 + /* there is no left formatted child */
69847 + if (left_zloaded_here)
69848 + zrelse(locked_left_neighbor);
69849 + done_lh(&left_lh);
69850 + return 0;
69851 + }
69852 +
69853 + tree = znode_get_tree(left_coord.node);
69854 + left_child = child_znode(&left_coord, left_coord.node, 1, 0);
69855 +
69856 + if (IS_ERR(left_child)) {
69857 + result = PTR_ERR(left_child);
69858 + goto done;
69859 + }
69860 +
69861 + /* left child is acquired, calculate new right delimiting key for it
69862 + and get right child if it is necessary */
69863 + if (item_removed_completely
69864 + (from, kdata->params.from_key, kdata->params.to_key)) {
69865 + /* try to get right child of removed item */
69866 + coord_t right_coord;
69867 +
69868 + assert("vs-607",
69869 + kdata->params.to->unit_pos ==
69870 + coord_last_unit_pos(kdata->params.to));
69871 + coord_dup(&right_coord, kdata->params.to);
69872 + if (coord_next_unit(&right_coord)) {
69873 + /* @to is rightmost unit in the node */
69874 + result =
69875 + reiser4_get_right_neighbor(&right_lh, from->node,
69876 + ZNODE_READ_LOCK,
69877 + GN_CAN_USE_UPPER_LEVELS);
69878 + switch (result) {
69879 + case 0:
69880 + result = zload(right_lh.node);
69881 + if (result)
69882 + goto done;
69883 +
69884 + right_zloaded_here = 1;
69885 + coord_init_first_unit(&right_coord,
69886 + right_lh.node);
69887 + item_key_by_coord(&right_coord, &key);
69888 + break;
69889 +
69890 + case -E_NO_NEIGHBOR:
69891 + /* there is no formatted node to the right of
69892 + from->node */
69893 + read_lock_dk(tree);
69894 + key = *znode_get_rd_key(from->node);
69895 + read_unlock_dk(tree);
69896 + right_coord.node = NULL;
69897 + result = 0;
69898 + break;
69899 + default:
69900 + /* real error */
69901 + goto done;
69902 + }
69903 + } else {
69904 + /* there is an item to the right of @from - take its key */
69905 + item_key_by_coord(&right_coord, &key);
69906 + }
69907 +
69908 + /* try to get right child of @from */
69909 + if (right_coord.node && /* there is right neighbor of @from */
69910 + item_is_internal(&right_coord)) { /* it is internal item */
69911 + right_child = child_znode(&right_coord,
69912 + right_coord.node, 1, 0);
69913 +
69914 + if (IS_ERR(right_child)) {
69915 + result = PTR_ERR(right_child);
69916 + goto done;
69917 + }
69918 +
69919 + }
69920 + /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
69921 + update of right delimiting key of left_child */
69922 + result = prepare_children(left_child, right_child, kdata);
69923 + } else {
69924 + /* head of item @to is removed. left_child has to get right delimting key update. Prepare it for that */
69925 + result = prepare_children(left_child, NULL, kdata);
69926 + }
69927 +
69928 + done:
69929 + if (right_child)
69930 + zput(right_child);
69931 + if (right_zloaded_here)
69932 + zrelse(right_lh.node);
69933 + done_lh(&right_lh);
69934 +
69935 + if (left_child)
69936 + zput(left_child);
69937 + if (left_zloaded_here)
69938 + zrelse(locked_left_neighbor);
69939 + done_lh(&left_lh);
69940 + return result;
69941 +}
69942 +
69943 +/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
69944 + are to be cut completely */
69945 +/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
69946 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
69947 + const reiser4_key * to_key, /* last key to be removed */
69948 + reiser4_key *
69949 + smallest_removed /* smallest key actually removed */ )
69950 +{
69951 + int result;
69952 + carry_pool *pool;
69953 + carry_level *lowest_level;
69954 + carry_cut_data *cut_data;
69955 + carry_op *op;
69956 +
69957 + assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
69958 +
69959 + pool =
69960 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69961 + sizeof(*cut_data));
69962 + if (IS_ERR(pool))
69963 + return PTR_ERR(pool);
69964 + lowest_level = (carry_level *) (pool + 1);
69965 + init_carry_level(lowest_level, pool);
69966 +
69967 + op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
69968 + assert("vs-1509", op != 0);
69969 + if (IS_ERR(op)) {
69970 + done_carry_pool(pool);
69971 + return PTR_ERR(op);
69972 + }
69973 +
69974 + cut_data = (carry_cut_data *) (lowest_level + 3);
69975 + cut_data->params.from = from;
69976 + cut_data->params.to = to;
69977 + cut_data->params.from_key = from_key;
69978 + cut_data->params.to_key = to_key;
69979 + cut_data->params.smallest_removed = smallest_removed;
69980 +
69981 + op->u.cut_or_kill.is_cut = 1;
69982 + op->u.cut_or_kill.u.cut = cut_data;
69983 +
69984 + result = reiser4_carry(lowest_level, NULL);
69985 + done_carry_pool(pool);
69986 +
69987 + return result;
69988 +}
69989 +
69990 +/* cut part of the node
69991 +
69992 + Cut part or whole content of node.
69993 +
69994 + cut data between @from and @to of @from->node and call carry() to make
69995 + corresponding changes in the tree. @from->node may become empty. If so -
69996 + pointer to it will be removed. Neighboring nodes are not changed. Smallest
69997 + removed key is stored in @smallest_removed
69998 +
69999 +*/
70000 +int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
70001 + coord_t * to, /* coord of the last unit/item that will be eliminated */
70002 + const reiser4_key * from_key, /* first key to be removed */
70003 + const reiser4_key * to_key, /* last key to be removed */
70004 + reiser4_key * smallest_removed, /* smallest key actually removed */
70005 + znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
70006 + * locked (in squalloc_right_twig_cut, namely) */
70007 + struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
70008 + invalidate pages together with item pointing to them */
70009 + int truncate)
70010 +{ /* this call is made for file truncate) */
70011 + int result;
70012 + carry_pool *pool;
70013 + carry_level *lowest_level;
70014 + carry_kill_data *kdata;
70015 + lock_handle *left_child;
70016 + lock_handle *right_child;
70017 + carry_op *op;
70018 +
70019 + assert("umka-328", from != NULL);
70020 + assert("vs-316", !node_is_empty(from->node));
70021 + assert("nikita-1812", coord_is_existing_unit(from)
70022 + && coord_is_existing_unit(to));
70023 +
70024 + /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
70025 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
70026 + sizeof(carry_kill_data) +
70027 + 2 * sizeof(lock_handle) +
70028 + 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
70029 + if (IS_ERR(pool))
70030 + return PTR_ERR(pool);
70031 +
70032 + lowest_level = (carry_level *) (pool + 1);
70033 + init_carry_level(lowest_level, pool);
70034 +
70035 + kdata = (carry_kill_data *) (lowest_level + 3);
70036 + left_child = (lock_handle *) (kdata + 1);
70037 + right_child = left_child + 1;
70038 +
70039 + init_lh(left_child);
70040 + init_lh(right_child);
70041 +
70042 + kdata->params.from = from;
70043 + kdata->params.to = to;
70044 + kdata->params.from_key = from_key;
70045 + kdata->params.to_key = to_key;
70046 + kdata->params.smallest_removed = smallest_removed;
70047 + kdata->params.truncate = truncate;
70048 + kdata->flags = 0;
70049 + kdata->inode = inode;
70050 + kdata->left = left_child;
70051 + kdata->right = right_child;
70052 + /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
70053 + kdata->buf = (char *)(right_child + 1);
70054 +
70055 + if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
70056 + /* left child of extent item may have to get updated right
70057 + delimiting key and to get linked with right child of extent
70058 + @from if it will be removed completely */
70059 + result = prepare_twig_kill(kdata, locked_left_neighbor);
70060 + if (result) {
70061 + done_children(kdata);
70062 + done_carry_pool(pool);
70063 + return result;
70064 + }
70065 + }
70066 +
70067 + op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
70068 + if (IS_ERR(op) || (op == NULL)) {
70069 + done_children(kdata);
70070 + done_carry_pool(pool);
70071 + return RETERR(op ? PTR_ERR(op) : -EIO);
70072 + }
70073 +
70074 + op->u.cut_or_kill.is_cut = 0;
70075 + op->u.cut_or_kill.u.kill = kdata;
70076 +
70077 + result = reiser4_carry(lowest_level, NULL);
70078 +
70079 + done_children(kdata);
70080 + done_carry_pool(pool);
70081 + return result;
70082 +}
70083 +
70084 +void
70085 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
70086 +{
70087 + if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
70088 + pgoff_t start_pg, end_pg;
70089 +
70090 + start_pg = start >> PAGE_CACHE_SHIFT;
70091 + end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
70092 +
70093 + if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
70094 + /*
70095 + * kill up to the page boundary.
70096 + */
70097 + assert("vs-123456", start_pg == end_pg);
70098 + reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
70099 + truncate);
70100 + } else if (start_pg != end_pg) {
70101 + /*
70102 + * page boundary is within killed portion of node.
70103 + */
70104 + assert("vs-654321", end_pg - start_pg == 1);
70105 + reiser4_invalidate_pages(inode->i_mapping, end_pg,
70106 + end_pg - start_pg, 1);
70107 + }
70108 + }
70109 + inode_sub_bytes(inode, end - start);
70110 +}
70111 +
70112 +/**
70113 + * Delete whole @node from the reiser4 tree without loading it.
70114 + *
70115 + * @left: locked left neighbor,
70116 + * @node: node to be deleted,
70117 + * @smallest_removed: leftmost key of deleted node,
70118 + * @object: inode pointer, if we truncate a file body.
70119 + * @truncate: true if called for file truncate.
70120 + *
70121 + * @return: 0 if success, error code otherwise.
70122 + *
70123 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
70124 + * contains the right value of the smallest removed key from the previous
70125 + * cut_worker() iteration. This is needed for proper accounting of
70126 + * "i_blocks" and "i_bytes" fields of the @object.
70127 + */
70128 +int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
70129 + struct inode *object, int truncate)
70130 +{
70131 + lock_handle parent_lock;
70132 + coord_t cut_from;
70133 + coord_t cut_to;
70134 + reiser4_tree *tree;
70135 + int ret;
70136 +
70137 + assert("zam-937", node != NULL);
70138 + assert("zam-933", znode_is_write_locked(node));
70139 + assert("zam-999", smallest_removed != NULL);
70140 +
70141 + init_lh(&parent_lock);
70142 +
70143 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
70144 + if (ret)
70145 + return ret;
70146 +
70147 + assert("zam-934", !znode_above_root(parent_lock.node));
70148 +
70149 + ret = zload(parent_lock.node);
70150 + if (ret)
70151 + goto failed_nozrelse;
70152 +
70153 + ret = find_child_ptr(parent_lock.node, node, &cut_from);
70154 + if (ret)
70155 + goto failed;
70156 +
70157 + /* decrement child counter and set parent pointer to NULL before
70158 + deleting the list from parent node because of checks in
70159 + internal_kill_item_hook (we can delete the last item from the parent
70160 + node, the parent node is going to be deleted and its c_count should
70161 + be zero). */
70162 +
70163 + tree = znode_get_tree(node);
70164 + write_lock_tree(tree);
70165 + init_parent_coord(&node->in_parent, NULL);
70166 + --parent_lock.node->c_count;
70167 + write_unlock_tree(tree);
70168 +
70169 + assert("zam-989", item_is_internal(&cut_from));
70170 +
70171 + /* @node should be deleted after unlocking. */
70172 + ZF_SET(node, JNODE_HEARD_BANSHEE);
70173 +
70174 + /* remove a pointer from the parent node to the node being deleted. */
70175 + coord_dup(&cut_to, &cut_from);
70176 + /* FIXME: shouldn't this be kill_node_content */
70177 + ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
70178 + if (ret)
70179 + /* FIXME(Zam): Should we re-connect the node to its parent if
70180 + * cut_node fails? */
70181 + goto failed;
70182 +
70183 + {
70184 + reiser4_tree *tree = current_tree;
70185 + __u64 start_offset = 0, end_offset = 0;
70186 +
70187 + read_lock_tree(tree);
70188 + write_lock_dk(tree);
70189 + if (object) {
70190 + /* We use @smallest_removed and the left delimiting of
70191 + * the current node for @object->i_blocks, i_bytes
70192 + * calculation. We assume that the items after the
70193 + * *@smallest_removed key have been deleted from the
70194 + * file body. */
70195 + start_offset = get_key_offset(znode_get_ld_key(node));
70196 + end_offset = get_key_offset(smallest_removed);
70197 + }
70198 +
70199 + assert("zam-1021", znode_is_connected(node));
70200 + if (node->left)
70201 + znode_set_rd_key(node->left, znode_get_rd_key(node));
70202 +
70203 + *smallest_removed = *znode_get_ld_key(node);
70204 +
70205 + write_unlock_dk(tree);
70206 + read_unlock_tree(tree);
70207 +
70208 + if (object) {
70209 + /* we used to perform actions which are to be performed on items on their removal from tree in
70210 + special item method - kill_hook. Here for optimization reasons we avoid reading node
70211 + containing item we remove and can not call item's kill hook. Instead we call function which
70212 + does exactly the same things as tail kill hook in assumption that node we avoid reading
70213 + contains only one item and that item is a tail one. */
70214 + fake_kill_hook_tail(object, start_offset, end_offset,
70215 + truncate);
70216 + }
70217 + }
70218 + failed:
70219 + zrelse(parent_lock.node);
70220 + failed_nozrelse:
70221 + done_lh(&parent_lock);
70222 +
70223 + return ret;
70224 +}
70225 +
70226 +static int can_delete(const reiser4_key *key, znode *node)
70227 +{
70228 + int result;
70229 +
70230 + read_lock_dk(current_tree);
70231 + result = keyle(key, znode_get_ld_key(node));
70232 + read_unlock_dk(current_tree);
70233 + return result;
70234 +}
70235 +
70236 +/**
70237 + * This subroutine is not optimal but implementation seems to
70238 + * be easier).
70239 + *
70240 + * @tap: the point deletion process begins from,
70241 + * @from_key: the beginning of the deleted key range,
70242 + * @to_key: the end of the deleted key range,
70243 + * @smallest_removed: the smallest removed key,
70244 + * @truncate: true if called for file truncate.
70245 + * @progress: return true if a progress in file items deletions was made,
70246 + * @smallest_removed value is actual in that case.
70247 + *
70248 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long
70249 + * reiser4_cut_tree operation was interrupted for allowing atom commit.
70250 + */
70251 +int
70252 +cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
70253 + const reiser4_key * to_key,
70254 + reiser4_key * smallest_removed, struct inode *object,
70255 + int truncate, int *progress)
70256 +{
70257 + lock_handle next_node_lock;
70258 + coord_t left_coord;
70259 + int result;
70260 +
70261 + assert("zam-931", tap->coord->node != NULL);
70262 + assert("zam-932", znode_is_write_locked(tap->coord->node));
70263 +
70264 + *progress = 0;
70265 + init_lh(&next_node_lock);
70266 +
70267 + while (1) {
70268 + znode *node; /* node from which items are cut */
70269 + node_plugin *nplug; /* node plugin for @node */
70270 +
70271 + node = tap->coord->node;
70272 +
70273 + /* Move next_node_lock to the next node on the left. */
70274 + result =
70275 + reiser4_get_left_neighbor(&next_node_lock, node,
70276 + ZNODE_WRITE_LOCK,
70277 + GN_CAN_USE_UPPER_LEVELS);
70278 + if (result != 0 && result != -E_NO_NEIGHBOR)
70279 + break;
70280 + /* Check can we delete the node as a whole. */
70281 + if (*progress && znode_get_level(node) == LEAF_LEVEL &&
70282 + can_delete(from_key, node)) {
70283 + result = reiser4_delete_node(node, smallest_removed,
70284 + object, truncate);
70285 + } else {
70286 + result = reiser4_tap_load(tap);
70287 + if (result)
70288 + return result;
70289 +
70290 + /* Prepare the second (right) point for cut_node() */
70291 + if (*progress)
70292 + coord_init_last_unit(tap->coord, node);
70293 +
70294 + else if (item_plugin_by_coord(tap->coord)->b.lookup ==
70295 + NULL)
70296 + /* set rightmost unit for the items without lookup method */
70297 + tap->coord->unit_pos =
70298 + coord_last_unit_pos(tap->coord);
70299 +
70300 + nplug = node->nplug;
70301 +
70302 + assert("vs-686", nplug);
70303 + assert("vs-687", nplug->lookup);
70304 +
70305 + /* left_coord is leftmost unit cut from @node */
70306 + result = nplug->lookup(node, from_key,
70307 + FIND_MAX_NOT_MORE_THAN,
70308 + &left_coord);
70309 +
70310 + if (IS_CBKERR(result))
70311 + break;
70312 +
70313 + /* adjust coordinates so that they are set to existing units */
70314 + if (coord_set_to_right(&left_coord)
70315 + || coord_set_to_left(tap->coord)) {
70316 + result = 0;
70317 + break;
70318 + }
70319 +
70320 + if (coord_compare(&left_coord, tap->coord) ==
70321 + COORD_CMP_ON_RIGHT) {
70322 + /* keys from @from_key to @to_key are not in the tree */
70323 + result = 0;
70324 + break;
70325 + }
70326 +
70327 + if (left_coord.item_pos != tap->coord->item_pos) {
70328 + /* do not allow to cut more than one item. It is added to solve problem of truncating
70329 + partially converted files. If file is partially converted there may exist a twig node
70330 + containing both internal item or items pointing to leaf nodes with formatting items
70331 + and extent item. We do not want to kill internal items being at twig node here
70332 + because cut_tree_worker assumes killing them from level level */
70333 + coord_dup(&left_coord, tap->coord);
70334 + assert("vs-1652",
70335 + coord_is_existing_unit(&left_coord));
70336 + left_coord.unit_pos = 0;
70337 + }
70338 +
70339 + /* cut data from one node */
70340 + // *smallest_removed = *reiser4_min_key();
70341 + result =
70342 + kill_node_content(&left_coord, tap->coord, from_key,
70343 + to_key, smallest_removed,
70344 + next_node_lock.node, object,
70345 + truncate);
70346 + reiser4_tap_relse(tap);
70347 + }
70348 + if (result)
70349 + break;
70350 +
70351 + ++(*progress);
70352 +
70353 + /* Check whether all items with keys >= from_key were removed
70354 + * from the tree. */
70355 + if (keyle(smallest_removed, from_key))
70356 + /* result = 0; */
70357 + break;
70358 +
70359 + if (next_node_lock.node == NULL)
70360 + break;
70361 +
70362 + result = reiser4_tap_move(tap, &next_node_lock);
70363 + done_lh(&next_node_lock);
70364 + if (result)
70365 + break;
70366 +
70367 + /* Break long reiser4_cut_tree operation (deletion of a large
70368 + file) if atom requires commit. */
70369 + if (*progress > CUT_TREE_MIN_ITERATIONS
70370 + && current_atom_should_commit()) {
70371 + result = -E_REPEAT;
70372 + break;
70373 + }
70374 + }
70375 + done_lh(&next_node_lock);
70376 + // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
70377 + return result;
70378 +}
70379 +
70380 +/* there is a fundamental problem with optimizing deletes: VFS does it
70381 + one file at a time. Another problem is that if an item can be
70382 + anything, then deleting items must be done one at a time. It just
70383 + seems clean to writes this to specify a from and a to key, and cut
70384 + everything between them though. */
70385 +
70386 +/* use this function with care if deleting more than what is part of a single file. */
70387 +/* do not use this when cutting a single item, it is suboptimal for that */
70388 +
70389 +/* You are encouraged to write plugin specific versions of this. It
70390 + cannot be optimal for all plugins because it works item at a time,
70391 + and some plugins could sometimes work node at a time. Regular files
70392 + however are not optimizable to work node at a time because of
70393 + extents needing to free the blocks they point to.
70394 +
70395 + Optimizations compared to v3 code:
70396 +
70397 + It does not balance (that task is left to memory pressure code).
70398 +
70399 + Nodes are deleted only if empty.
70400 +
70401 + Uses extents.
70402 +
70403 + Performs read-ahead of formatted nodes whose contents are part of
70404 + the deletion.
70405 +*/
70406 +
70407 +/**
70408 + * Delete everything from the reiser4 tree between two keys: @from_key and
70409 + * @to_key.
70410 + *
70411 + * @from_key: the beginning of the deleted key range,
70412 + * @to_key: the end of the deleted key range,
70413 + * @smallest_removed: the smallest removed key,
70414 + * @object: owner of cutting items.
70415 + * @truncate: true if called for file truncate.
70416 + * @progress: return true if a progress in file items deletions was made,
70417 + * @smallest_removed value is actual in that case.
70418 + *
70419 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
70420 + * operation was interrupted for allowing atom commit .
70421 + */
70422 +
70423 +int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
70424 + const reiser4_key * to_key,
70425 + reiser4_key * smallest_removed_p,
70426 + struct inode *object, int truncate, int *progress)
70427 +{
70428 + lock_handle lock;
70429 + int result;
70430 + tap_t tap;
70431 + coord_t right_coord;
70432 + reiser4_key smallest_removed;
70433 + int (*cut_tree_worker) (tap_t *, const reiser4_key *,
70434 + const reiser4_key *, reiser4_key *,
70435 + struct inode *, int, int *);
70436 + STORE_COUNTERS;
70437 +
70438 + assert("umka-329", tree != NULL);
70439 + assert("umka-330", from_key != NULL);
70440 + assert("umka-331", to_key != NULL);
70441 + assert("zam-936", keyle(from_key, to_key));
70442 +
70443 + if (smallest_removed_p == NULL)
70444 + smallest_removed_p = &smallest_removed;
70445 +
70446 + init_lh(&lock);
70447 +
70448 + do {
70449 + /* Find rightmost item to cut away from the tree. */
70450 + result = reiser4_object_lookup(object, to_key, &right_coord,
70451 + &lock, ZNODE_WRITE_LOCK,
70452 + FIND_MAX_NOT_MORE_THAN,
70453 + TWIG_LEVEL, LEAF_LEVEL,
70454 + CBK_UNIQUE, NULL /*ra_info */);
70455 + if (result != CBK_COORD_FOUND)
70456 + break;
70457 + if (object == NULL
70458 + || inode_file_plugin(object)->cut_tree_worker == NULL)
70459 + cut_tree_worker = cut_tree_worker_common;
70460 + else
70461 + cut_tree_worker =
70462 + inode_file_plugin(object)->cut_tree_worker;
70463 + reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
70464 + result =
70465 + cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
70466 + object, truncate, progress);
70467 + reiser4_tap_done(&tap);
70468 +
70469 + reiser4_preempt_point();
70470 +
70471 + } while (0);
70472 +
70473 + done_lh(&lock);
70474 +
70475 + if (result) {
70476 + switch (result) {
70477 + case -E_NO_NEIGHBOR:
70478 + result = 0;
70479 + break;
70480 + case -E_DEADLOCK:
70481 + result = -E_REPEAT;
70482 + case -E_REPEAT:
70483 + case -ENOMEM:
70484 + case -ENOENT:
70485 + break;
70486 + default:
70487 + warning("nikita-2861", "failure: %i", result);
70488 + }
70489 + }
70490 +
70491 + CHECK_COUNTERS;
70492 + return result;
70493 +}
70494 +
70495 +/* repeat reiser4_cut_tree_object until everything is deleted.
70496 + * unlike cut_file_items, it does not end current transaction if -E_REPEAT
70497 + * is returned by cut_tree_object. */
70498 +int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
70499 + const reiser4_key * to, struct inode *inode, int truncate)
70500 +{
70501 + int result;
70502 + int progress;
70503 +
70504 + do {
70505 + result = reiser4_cut_tree_object(tree, from, to, NULL,
70506 + inode, truncate, &progress);
70507 + } while (result == -E_REPEAT);
70508 +
70509 + return result;
70510 +}
70511 +
70512 +/* finishing reiser4 initialization */
70513 +int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
70514 + * initialized */ ,
70515 + const reiser4_block_nr * root_block /* address of a root block
70516 + * on a disk */ ,
70517 + tree_level height /* height of a tree */ ,
70518 + node_plugin * nplug /* default node plugin */ )
70519 +{
70520 + int result;
70521 +
70522 + assert("nikita-306", tree != NULL);
70523 + assert("nikita-307", root_block != NULL);
70524 + assert("nikita-308", height > 0);
70525 + assert("nikita-309", nplug != NULL);
70526 + assert("zam-587", tree->super != NULL);
70527 +
70528 + tree->root_block = *root_block;
70529 + tree->height = height;
70530 + tree->estimate_one_insert = calc_estimate_one_insert(height);
70531 + tree->nplug = nplug;
70532 +
70533 + tree->znode_epoch = 1ull;
70534 +
70535 + cbk_cache_init(&tree->cbk_cache);
70536 +
70537 + result = znodes_tree_init(tree);
70538 + if (result == 0)
70539 + result = jnodes_tree_init(tree);
70540 + if (result == 0) {
70541 + tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
70542 + reiser4_ctx_gfp_mask_get());
70543 + if (IS_ERR(tree->uber)) {
70544 + result = PTR_ERR(tree->uber);
70545 + tree->uber = NULL;
70546 + }
70547 + }
70548 + return result;
70549 +}
70550 +
70551 +/* release resources associated with @tree */
70552 +void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
70553 +{
70554 + if (tree == NULL)
70555 + return;
70556 +
70557 + if (tree->uber != NULL) {
70558 + zput(tree->uber);
70559 + tree->uber = NULL;
70560 + }
70561 + znodes_tree_done(tree);
70562 + jnodes_tree_done(tree);
70563 + cbk_cache_done(&tree->cbk_cache);
70564 +}
70565 +
70566 +/* Make Linus happy.
70567 + Local variables:
70568 + c-indentation-style: "K&R"
70569 + mode-name: "LC"
70570 + c-basic-offset: 8
70571 + tab-width: 8
70572 + fill-column: 120
70573 + scroll-step: 1
70574 + End:
70575 +*/
70576 diff -urN linux-2.6.20.orig/fs/reiser4/tree.h linux-2.6.20/fs/reiser4/tree.h
70577 --- linux-2.6.20.orig/fs/reiser4/tree.h 1970-01-01 03:00:00.000000000 +0300
70578 +++ linux-2.6.20/fs/reiser4/tree.h 2007-05-06 14:50:43.883033217 +0400
70579 @@ -0,0 +1,577 @@
70580 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70581 + * reiser4/README */
70582 +
70583 +/* Tree operations. See fs/reiser4/tree.c for comments */
70584 +
70585 +#if !defined( __REISER4_TREE_H__ )
70586 +#define __REISER4_TREE_H__
70587 +
70588 +#include "forward.h"
70589 +#include "debug.h"
70590 +#include "dformat.h"
70591 +#include "plugin/node/node.h"
70592 +#include "plugin/plugin.h"
70593 +#include "znode.h"
70594 +#include "tap.h"
70595 +
70596 +#include <linux/types.h> /* for __u?? */
70597 +#include <linux/fs.h> /* for struct super_block */
70598 +#include <linux/spinlock.h>
70599 +#include <linux/sched.h> /* for struct task_struct */
70600 +
70601 +/* fictive block number never actually used */
70602 +extern const reiser4_block_nr UBER_TREE_ADDR;
70603 +
70604 +/* &cbk_cache_slot - entry in a coord cache.
70605 +
70606 + This is entry in a coord_by_key (cbk) cache, represented by
70607 + &cbk_cache.
70608 +
70609 +*/
70610 +typedef struct cbk_cache_slot {
70611 + /* cached node */
70612 + znode *node;
70613 + /* linkage to the next cbk cache slot in a LRU order */
70614 + struct list_head lru;
70615 +} cbk_cache_slot;
70616 +
70617 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
70618 +
70619 + cbk_cache is supposed to speed up tree lookups by caching results of recent
70620 + successful lookups (we don't cache negative results as dentry cache
70621 + does). Cache consists of relatively small number of entries kept in a LRU
70622 + order. Each entry (&cbk_cache_slot) contains a pointer to znode, from
70623 + which we can obtain a range of keys that covered by this znode. Before
70624 + embarking into real tree traversal we scan cbk_cache slot by slot and for
70625 + each slot check whether key we are looking for is between minimal and
70626 + maximal keys for node pointed to by this slot. If no match is found, real
70627 + tree traversal is performed and if result is successful, appropriate entry
70628 + is inserted into cache, possibly pulling least recently used entry out of
70629 + it.
70630 +
70631 + Tree spin lock is used to protect coord cache. If contention for this
70632 + lock proves to be too high, more finer grained locking can be added.
70633 +
70634 + Invariants involving parts of this data-type:
70635 +
70636 + [cbk-cache-invariant]
70637 +*/
70638 +typedef struct cbk_cache {
70639 + /* serializator */
70640 + rwlock_t guard;
70641 + int nr_slots;
70642 + /* head of LRU list of cache slots */
70643 + struct list_head lru;
70644 + /* actual array of slots */
70645 + cbk_cache_slot *slot;
70646 +} cbk_cache;
70647 +
70648 +/* level_lookup_result - possible outcome of looking up key at some level.
70649 + This is used by coord_by_key when traversing tree downward. */
70650 +typedef enum {
70651 + /* continue to the next level */
70652 + LOOKUP_CONT,
70653 + /* done. Either required item was found, or we can prove it
70654 + doesn't exist, or some error occurred. */
70655 + LOOKUP_DONE,
70656 + /* restart traversal from the root. Infamous "repetition". */
70657 + LOOKUP_REST
70658 +} level_lookup_result;
70659 +
70660 +/* This is representation of internal reiser4 tree where all file-system
70661 + data and meta-data are stored. This structure is passed to all tree
70662 + manipulation functions. It's different from the super block because:
70663 + we don't want to limit ourselves to strictly one to one mapping
70664 + between super blocks and trees, and, because they are logically
70665 + different: there are things in a super block that have no relation to
70666 + the tree (bitmaps, journalling area, mount options, etc.) and there
70667 + are things in a tree that bear no relation to the super block, like
70668 + tree of znodes.
70669 +
70670 + At this time, there is only one tree
70671 + per filesystem, and this struct is part of the super block. We only
70672 + call the super block the super block for historical reasons (most
70673 + other filesystems call the per filesystem metadata the super block).
70674 +*/
70675 +
70676 +struct reiser4_tree {
70677 + /* block_nr == 0 is fake znode. Write lock it, while changing
70678 + tree height. */
70679 + /* disk address of root node of a tree */
70680 + reiser4_block_nr root_block;
70681 +
70682 + /* level of the root node. If this is 1, tree consists of root
70683 + node only */
70684 + tree_level height;
70685 +
70686 + /*
70687 + * this is cached here avoid calling plugins through function
70688 + * dereference all the time.
70689 + */
70690 + __u64 estimate_one_insert;
70691 +
70692 + /* cache of recent tree lookup results */
70693 + cbk_cache cbk_cache;
70694 +
70695 + /* hash table to look up znodes by block number. */
70696 + z_hash_table zhash_table;
70697 + z_hash_table zfake_table;
70698 + /* hash table to look up jnodes by inode and offset. */
70699 + j_hash_table jhash_table;
70700 +
70701 + /* lock protecting:
70702 + - parent pointers,
70703 + - sibling pointers,
70704 + - znode hash table
70705 + - coord cache
70706 + */
70707 + /* NOTE: The "giant" tree lock can be replaced by more spin locks,
70708 + hoping they will be less contented. We can use one spin lock per one
70709 + znode hash bucket. With adding of some code complexity, sibling
70710 + pointers can be protected by both znode spin locks. However it looks
70711 + more SMP scalable we should test this locking change on n-ways (n >
70712 + 4) SMP machines. Current 4-ways machine test does not show that tree
70713 + lock is contented and it is a bottleneck (2003.07.25). */
70714 +
70715 + rwlock_t tree_lock;
70716 +
70717 + /* lock protecting delimiting keys */
70718 + rwlock_t dk_lock;
70719 +
70720 + /* spin lock protecting znode_epoch */
70721 + spinlock_t epoch_lock;
70722 + /* version stamp used to mark znode updates. See seal.[ch] for more
70723 + * information. */
70724 + __u64 znode_epoch;
70725 +
70726 + znode *uber;
70727 + node_plugin *nplug;
70728 + struct super_block *super;
70729 + struct {
70730 + /* carry flags used for insertion of new nodes */
70731 + __u32 new_node_flags;
70732 + /* carry flags used for insertion of new extents */
70733 + __u32 new_extent_flags;
70734 + /* carry flags used for paste operations */
70735 + __u32 paste_flags;
70736 + /* carry flags used for insert operations */
70737 + __u32 insert_flags;
70738 + } carry;
70739 +};
70740 +
70741 +extern int reiser4_init_tree(reiser4_tree * tree,
70742 + const reiser4_block_nr * root_block,
70743 + tree_level height, node_plugin * default_plugin);
70744 +extern void reiser4_done_tree(reiser4_tree * tree);
70745 +
70746 +/* cbk flags: options for coord_by_key() */
70747 +typedef enum {
70748 + /* coord_by_key() is called for insertion. This is necessary because
70749 + of extents being located at the twig level. For explanation, see
70750 + comment just above is_next_item_internal().
70751 + */
70752 + CBK_FOR_INSERT = (1 << 0),
70753 + /* coord_by_key() is called with key that is known to be unique */
70754 + CBK_UNIQUE = (1 << 1),
70755 + /* coord_by_key() can trust delimiting keys. This options is not user
70756 + accessible. coord_by_key() will set it automatically. It will be
70757 + only cleared by special-case in extents-on-the-twig-level handling
70758 + where it is necessary to insert item with a key smaller than
70759 + leftmost key in a node. This is necessary because of extents being
70760 + located at the twig level. For explanation, see comment just above
70761 + is_next_item_internal().
70762 + */
70763 + CBK_TRUST_DK = (1 << 2),
70764 + CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
70765 + CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
70766 + CBK_DKSET = (1 << 5),
70767 + CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
70768 + CBK_IN_CACHE = (1 << 7), /* node is already in cache */
70769 + CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock in stead of long term
70770 + * lock */
70771 +} cbk_flags;
70772 +
70773 +/* insertion outcome. IBK = insert by key */
70774 +typedef enum {
70775 + IBK_INSERT_OK = 0,
70776 + IBK_ALREADY_EXISTS = -EEXIST,
70777 + IBK_IO_ERROR = -EIO,
70778 + IBK_NO_SPACE = -E_NODE_FULL,
70779 + IBK_OOM = -ENOMEM
70780 +} insert_result;
70781 +
70782 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
70783 +
70784 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
70785 + lock_handle * lh, void *arg);
70786 +extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
70787 + lock_handle * lh,
70788 + tree_iterate_actor_t actor, void *arg,
70789 + znode_lock_mode mode, int through_units_p);
70790 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
70791 + znode_lock_request pri, lock_handle * lh);
70792 +
70793 +/* return node plugin of @node */
70794 +static inline node_plugin *node_plugin_by_node(const znode *
70795 + node /* node to query */ )
70796 +{
70797 + assert("vs-213", node != NULL);
70798 + assert("vs-214", znode_is_loaded(node));
70799 +
70800 + return node->nplug;
70801 +}
70802 +
70803 +/* number of items in @node */
70804 +static inline pos_in_node_t node_num_items(const znode * node)
70805 +{
70806 + assert("nikita-2754", znode_is_loaded(node));
70807 + assert("nikita-2468",
70808 + node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
70809 +
70810 + return node->nr_items;
70811 +}
70812 +
70813 +/* Return the number of items at the present node. Asserts coord->node !=
70814 + NULL. */
70815 +static inline unsigned coord_num_items(const coord_t * coord)
70816 +{
70817 + assert("jmacd-9805", coord->node != NULL);
70818 +
70819 + return node_num_items(coord->node);
70820 +}
70821 +
70822 +/* true if @node is empty */
70823 +static inline int node_is_empty(const znode * node)
70824 +{
70825 + return node_num_items(node) == 0;
70826 +}
70827 +
70828 +typedef enum {
70829 + SHIFTED_SOMETHING = 0,
70830 + SHIFT_NO_SPACE = -E_NODE_FULL,
70831 + SHIFT_IO_ERROR = -EIO,
70832 + SHIFT_OOM = -ENOMEM,
70833 +} shift_result;
70834 +
70835 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
70836 +extern int is_coord_in_node(const coord_t * coord);
70837 +extern int key_in_node(const reiser4_key *, const coord_t *);
70838 +extern void coord_item_move_to(coord_t * coord, int items);
70839 +extern void coord_unit_move_to(coord_t * coord, int units);
70840 +
70841 +/* there are two types of repetitive accesses (ra): intra-syscall
70842 + (local) and inter-syscall (global). Local ra is used when
70843 + during single syscall we add/delete several items and units in the
70844 + same place in a tree. Note that plan-A fragments local ra by
70845 + separating stat-data and file body in key-space. Global ra is
70846 + used when user does repetitive modifications in the same place in a
70847 + tree.
70848 +
70849 + Our ra implementation serves following purposes:
70850 + 1 it affects balancing decisions so that next operation in a row
70851 + can be performed faster;
70852 + 2 it affects lower-level read-ahead in page-cache;
70853 + 3 it allows to avoid unnecessary lookups by maintaining some state
70854 + across several operations (this is only for local ra);
70855 + 4 it leaves room for lazy-micro-balancing: when we start a sequence of
70856 + operations they are performed without actually doing any intra-node
70857 + shifts, until we finish sequence or scope of sequence leaves
70858 + current node, only then we really pack node (local ra only).
70859 +*/
70860 +
70861 +/* another thing that can be useful is to keep per-tree and/or
70862 + per-process cache of recent lookups. This cache can be organised as a
70863 + list of block numbers of formatted nodes sorted by starting key in
70864 + this node. Balancings should invalidate appropriate parts of this
70865 + cache.
70866 +*/
70867 +
70868 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
70869 + coord_t * coord, lock_handle * handle,
70870 + znode_lock_mode lock, lookup_bias bias,
70871 + tree_level lock_level, tree_level stop_level,
70872 + __u32 flags, ra_info_t *);
70873 +
70874 +lookup_result reiser4_object_lookup(struct inode *object,
70875 + const reiser4_key * key,
70876 + coord_t * coord,
70877 + lock_handle * lh,
70878 + znode_lock_mode lock_mode,
70879 + lookup_bias bias,
70880 + tree_level lock_level,
70881 + tree_level stop_level,
70882 + __u32 flags, ra_info_t * info);
70883 +
70884 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
70885 + reiser4_item_data * data, coord_t * coord,
70886 + lock_handle * lh,
70887 + tree_level stop_level, __u32 flags);
70888 +insert_result insert_by_coord(coord_t * coord,
70889 + reiser4_item_data * data, const reiser4_key * key,
70890 + lock_handle * lh, __u32);
70891 +insert_result insert_extent_by_coord(coord_t * coord,
70892 + reiser4_item_data * data,
70893 + const reiser4_key * key, lock_handle * lh);
70894 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
70895 + const reiser4_key * to_key,
70896 + reiser4_key * smallest_removed);
70897 +int kill_node_content(coord_t * from, coord_t * to,
70898 + const reiser4_key * from_key, const reiser4_key * to_key,
70899 + reiser4_key * smallest_removed,
70900 + znode * locked_left_neighbor, struct inode *inode,
70901 + int truncate);
70902 +
70903 +int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
70904 + reiser4_key * key, lock_handle * lh, cop_insert_flag);
70905 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
70906 + reiser4_item_data * data, unsigned);
70907 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
70908 +int find_new_child_ptr(znode * parent, znode * child, znode * left,
70909 + coord_t * result);
70910 +
70911 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
70912 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
70913 +
70914 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
70915 +
70916 +extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
70917 + const reiser4_key *, reiser4_key *,
70918 + struct inode *, int, int *);
70919 +extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
70920 + const reiser4_key *, reiser4_key *,
70921 + struct inode *, int, int *);
70922 +extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
70923 + const reiser4_key * to, struct inode *, int);
70924 +
70925 +extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
70926 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
70927 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
70928 + znode * left, coord_t * result);
70929 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
70930 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
70931 + znode * child);
70932 +extern znode *child_znode(const coord_t * in_parent, znode * parent,
70933 + int incore_p, int setup_dkeys_p);
70934 +
70935 +extern int cbk_cache_init(cbk_cache * cache);
70936 +extern void cbk_cache_done(cbk_cache * cache);
70937 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
70938 +
70939 +extern char *sprint_address(const reiser4_block_nr * block);
70940 +
70941 +#if REISER4_DEBUG
70942 +extern void print_coord_content(const char *prefix, coord_t * p);
70943 +extern void reiser4_print_address(const char *prefix,
70944 + const reiser4_block_nr * block);
70945 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
70946 + __u32 flags);
70947 +extern void check_dkeys(znode *node);
70948 +#else
70949 +#define print_coord_content(p, c) noop
70950 +#define reiser4_print_address(p, b) noop
70951 +#endif
70952 +
70953 +extern void forget_znode(lock_handle * handle);
70954 +extern int deallocate_znode(znode * node);
70955 +
70956 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
70957 +
70958 +/* struct used internally to pack all numerous arguments of tree lookup.
70959 + Used to avoid passing a lot of arguments to helper functions. */
70960 +typedef struct cbk_handle {
70961 + /* tree we are in */
70962 + reiser4_tree *tree;
70963 + /* key we are going after */
70964 + const reiser4_key *key;
70965 + /* coord we will store result in */
70966 + coord_t *coord;
70967 + /* type of lock to take on target node */
70968 + znode_lock_mode lock_mode;
70969 + /* lookup bias. See comments at the declaration of lookup_bias */
70970 + lookup_bias bias;
70971 + /* lock level: level starting from which tree traversal starts taking
70972 + * write locks. */
70973 + tree_level lock_level;
70974 + /* level where search will stop. Either item will be found between
70975 + lock_level and stop_level, or CBK_COORD_NOTFOUND will be
70976 + returned.
70977 + */
70978 + tree_level stop_level;
70979 + /* level we are currently at */
70980 + tree_level level;
70981 + /* block number of @active node. Tree traversal operates on two
70982 + nodes: active and parent. */
70983 + reiser4_block_nr block;
70984 + /* put here error message to be printed by caller */
70985 + const char *error;
70986 + /* result passed back to caller */
70987 + lookup_result result;
70988 + /* lock handles for active and parent */
70989 + lock_handle *parent_lh;
70990 + lock_handle *active_lh;
70991 + reiser4_key ld_key;
70992 + reiser4_key rd_key;
70993 + /* flags, passed to the cbk routine. Bits of this bitmask are defined
70994 + in tree.h:cbk_flags enum. */
70995 + __u32 flags;
70996 + ra_info_t *ra_info;
70997 + struct inode *object;
70998 +} cbk_handle;
70999 +
71000 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
71001 +
71002 +/* eottl.c */
71003 +extern int handle_eottl(cbk_handle *h, int *outcome);
71004 +
71005 +int lookup_multikey(cbk_handle * handle, int nr_keys);
71006 +int lookup_couple(reiser4_tree * tree,
71007 + const reiser4_key * key1, const reiser4_key * key2,
71008 + coord_t * coord1, coord_t * coord2,
71009 + lock_handle * lh1, lock_handle * lh2,
71010 + znode_lock_mode lock_mode, lookup_bias bias,
71011 + tree_level lock_level, tree_level stop_level, __u32 flags,
71012 + int *result1, int *result2);
71013 +
71014 +static inline void read_lock_tree(reiser4_tree *tree)
71015 +{
71016 + /* check that tree is not locked */
71017 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
71018 + LOCK_CNT_NIL(read_locked_tree) &&
71019 + LOCK_CNT_NIL(write_locked_tree)));
71020 + /* check that spinlocks of lower priorities are not held */
71021 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
71022 + LOCK_CNT_NIL(rw_locked_dk) &&
71023 + LOCK_CNT_NIL(spin_locked_stack)));
71024 +
71025 + read_lock(&(tree->tree_lock));
71026 +
71027 + LOCK_CNT_INC(read_locked_tree);
71028 + LOCK_CNT_INC(rw_locked_tree);
71029 + LOCK_CNT_INC(spin_locked);
71030 +}
71031 +
71032 +static inline void read_unlock_tree(reiser4_tree *tree)
71033 +{
71034 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
71035 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
71036 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71037 +
71038 + LOCK_CNT_DEC(read_locked_tree);
71039 + LOCK_CNT_DEC(rw_locked_tree);
71040 + LOCK_CNT_DEC(spin_locked);
71041 +
71042 + read_unlock(&(tree->tree_lock));
71043 +}
71044 +
71045 +static inline void write_lock_tree(reiser4_tree *tree)
71046 +{
71047 + /* check that tree is not locked */
71048 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
71049 + LOCK_CNT_NIL(read_locked_tree) &&
71050 + LOCK_CNT_NIL(write_locked_tree)));
71051 + /* check that spinlocks of lower priorities are not held */
71052 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
71053 + LOCK_CNT_NIL(rw_locked_dk) &&
71054 + LOCK_CNT_NIL(spin_locked_stack)));
71055 +
71056 + write_lock(&(tree->tree_lock));
71057 +
71058 + LOCK_CNT_INC(write_locked_tree);
71059 + LOCK_CNT_INC(rw_locked_tree);
71060 + LOCK_CNT_INC(spin_locked);
71061 +}
71062 +
71063 +static inline void write_unlock_tree(reiser4_tree *tree)
71064 +{
71065 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
71066 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
71067 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71068 +
71069 + LOCK_CNT_DEC(write_locked_tree);
71070 + LOCK_CNT_DEC(rw_locked_tree);
71071 + LOCK_CNT_DEC(spin_locked);
71072 +
71073 + write_unlock(&(tree->tree_lock));
71074 +}
71075 +
71076 +static inline void read_lock_dk(reiser4_tree *tree)
71077 +{
71078 + /* check that dk is not locked */
71079 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71080 + LOCK_CNT_NIL(read_locked_dk) &&
71081 + LOCK_CNT_NIL(write_locked_dk)));
71082 + /* check that spinlocks of lower priorities are not held */
71083 + assert("", LOCK_CNT_NIL(spin_locked_stack));
71084 +
71085 + read_lock(&((tree)->dk_lock));
71086 +
71087 + LOCK_CNT_INC(read_locked_dk);
71088 + LOCK_CNT_INC(rw_locked_dk);
71089 + LOCK_CNT_INC(spin_locked);
71090 +}
71091 +
71092 +static inline void read_unlock_dk(reiser4_tree *tree)
71093 +{
71094 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
71095 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71096 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71097 +
71098 + LOCK_CNT_DEC(read_locked_dk);
71099 + LOCK_CNT_DEC(rw_locked_dk);
71100 + LOCK_CNT_DEC(spin_locked);
71101 +
71102 + read_unlock(&(tree->dk_lock));
71103 +}
71104 +
71105 +static inline void write_lock_dk(reiser4_tree *tree)
71106 +{
71107 + /* check that dk is not locked */
71108 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71109 + LOCK_CNT_NIL(read_locked_dk) &&
71110 + LOCK_CNT_NIL(write_locked_dk)));
71111 + /* check that spinlocks of lower priorities are not held */
71112 + assert("", LOCK_CNT_NIL(spin_locked_stack));
71113 +
71114 + write_lock(&((tree)->dk_lock));
71115 +
71116 + LOCK_CNT_INC(write_locked_dk);
71117 + LOCK_CNT_INC(rw_locked_dk);
71118 + LOCK_CNT_INC(spin_locked);
71119 +}
71120 +
71121 +static inline void write_unlock_dk(reiser4_tree *tree)
71122 +{
71123 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
71124 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71125 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71126 +
71127 + LOCK_CNT_DEC(write_locked_dk);
71128 + LOCK_CNT_DEC(rw_locked_dk);
71129 + LOCK_CNT_DEC(spin_locked);
71130 +
71131 + write_unlock(&(tree->dk_lock));
71132 +}
71133 +
71134 +/* estimate api. Implementation is in estimate.c */
71135 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
71136 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
71137 +reiser4_block_nr estimate_insert_flow(tree_level);
71138 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
71139 +reiser4_block_nr calc_estimate_one_insert(tree_level);
71140 +reiser4_block_nr estimate_dirty_cluster(struct inode *);
71141 +reiser4_block_nr estimate_insert_cluster(struct inode *);
71142 +reiser4_block_nr estimate_update_cluster(struct inode *);
71143 +
71144 +/* __REISER4_TREE_H__ */
71145 +#endif
71146 +
71147 +/* Make Linus happy.
71148 + Local variables:
71149 + c-indentation-style: "K&R"
71150 + mode-name: "LC"
71151 + c-basic-offset: 8
71152 + tab-width: 8
71153 + fill-column: 120
71154 + scroll-step: 1
71155 + End:
71156 +*/
71157 diff -urN linux-2.6.20.orig/fs/reiser4/tree_mod.c linux-2.6.20/fs/reiser4/tree_mod.c
71158 --- linux-2.6.20.orig/fs/reiser4/tree_mod.c 1970-01-01 03:00:00.000000000 +0300
71159 +++ linux-2.6.20/fs/reiser4/tree_mod.c 2007-05-06 14:50:43.887034467 +0400
71160 @@ -0,0 +1,386 @@
71161 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71162 + * reiser4/README */
71163 +
71164 +/*
71165 + * Functions to add/delete new nodes to/from the tree.
71166 + *
71167 + * Functions from this file are used by carry (see carry*) to handle:
71168 + *
71169 + * . insertion of new formatted node into tree
71170 + *
71171 + * . addition of new tree root, increasing tree height
71172 + *
71173 + * . removing tree root, decreasing tree height
71174 + *
71175 + */
71176 +
71177 +#include "forward.h"
71178 +#include "debug.h"
71179 +#include "dformat.h"
71180 +#include "key.h"
71181 +#include "coord.h"
71182 +#include "plugin/plugin.h"
71183 +#include "jnode.h"
71184 +#include "znode.h"
71185 +#include "tree_mod.h"
71186 +#include "block_alloc.h"
71187 +#include "tree_walk.h"
71188 +#include "tree.h"
71189 +#include "super.h"
71190 +
71191 +#include <linux/err.h>
71192 +
71193 +static int add_child_ptr(znode * parent, znode * child);
71194 +/* warning only issued if error is not -E_REPEAT */
71195 +#define ewarning( error, ... ) \
71196 + if( ( error ) != -E_REPEAT ) \
71197 + warning( __VA_ARGS__ )
71198 +
71199 +/* allocate new node on the @level and immediately on the right of @brother. */
71200 +znode * reiser4_new_node(znode * brother /* existing left neighbor
71201 + * of new node */,
71202 + tree_level level /* tree level at which new node is to
71203 + * be allocated */)
71204 +{
71205 + znode *result;
71206 + int retcode;
71207 + reiser4_block_nr blocknr;
71208 +
71209 + assert("nikita-930", brother != NULL);
71210 + assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
71211 +
71212 + retcode = assign_fake_blocknr_formatted(&blocknr);
71213 + if (retcode == 0) {
71214 + result =
71215 + zget(znode_get_tree(brother), &blocknr, NULL, level,
71216 + reiser4_ctx_gfp_mask_get());
71217 + if (IS_ERR(result)) {
71218 + ewarning(PTR_ERR(result), "nikita-929",
71219 + "Cannot allocate znode for carry: %li",
71220 + PTR_ERR(result));
71221 + return result;
71222 + }
71223 + /* cheap test, can be executed even when debugging is off */
71224 + if (!znode_just_created(result)) {
71225 + warning("nikita-2213",
71226 + "Allocated already existing block: %llu",
71227 + (unsigned long long)blocknr);
71228 + zput(result);
71229 + return ERR_PTR(RETERR(-EIO));
71230 + }
71231 +
71232 + assert("nikita-931", result != NULL);
71233 + result->nplug = znode_get_tree(brother)->nplug;
71234 + assert("nikita-933", result->nplug != NULL);
71235 +
71236 + retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
71237 + if (retcode == 0) {
71238 + ZF_SET(result, JNODE_CREATED);
71239 + zrelse(result);
71240 + } else {
71241 + zput(result);
71242 + result = ERR_PTR(retcode);
71243 + }
71244 + } else {
71245 + /* failure to allocate new node during balancing.
71246 + This should never happen. Ever. Returning -E_REPEAT
71247 + is not viable solution, because "out of disk space"
71248 + is not transient error that will go away by itself.
71249 + */
71250 + ewarning(retcode, "nikita-928",
71251 + "Cannot allocate block for carry: %i", retcode);
71252 + result = ERR_PTR(retcode);
71253 + }
71254 + assert("nikita-1071", result != NULL);
71255 + return result;
71256 +}
71257 +
71258 +/* allocate new root and add it to the tree
71259 +
71260 + This helper function is called by add_new_root().
71261 +
71262 +*/
71263 +znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
71264 + znode * fake /* "fake" znode */ )
71265 +{
71266 + reiser4_tree *tree = znode_get_tree(old_root);
71267 + znode *new_root = NULL; /* to shut gcc up */
71268 + int result;
71269 +
71270 + assert("nikita-1069", old_root != NULL);
71271 + assert("umka-262", fake != NULL);
71272 + assert("umka-263", tree != NULL);
71273 +
71274 + /* "fake" znode---one always hanging just above current root. This
71275 + node is locked when new root is created or existing root is
71276 + deleted. Downward tree traversal takes lock on it before taking
71277 + lock on a root node. This avoids race conditions with root
71278 + manipulations.
71279 +
71280 + */
71281 + assert("nikita-1348", znode_above_root(fake));
71282 + assert("nikita-1211", znode_is_root(old_root));
71283 +
71284 + result = 0;
71285 + if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
71286 + warning("nikita-1344", "Tree is too tall: %i", tree->height);
71287 + /* ext2 returns -ENOSPC when it runs out of free inodes with a
71288 + following comment (fs/ext2/ialloc.c:441): Is it really
71289 + ENOSPC?
71290 +
71291 + -EXFULL? -EINVAL?
71292 + */
71293 + result = RETERR(-ENOSPC);
71294 + } else {
71295 + /* Allocate block for new root. It's not that
71296 + important where it will be allocated, as root is
71297 + almost always in memory. Moreover, allocate on
71298 + flush can be going here.
71299 + */
71300 + assert("nikita-1448", znode_is_root(old_root));
71301 + new_root = reiser4_new_node(fake, tree->height + 1);
71302 + if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
71303 + lock_handle rlh;
71304 +
71305 + init_lh(&rlh);
71306 + result =
71307 + longterm_lock_znode(&rlh, new_root,
71308 + ZNODE_WRITE_LOCK,
71309 + ZNODE_LOCK_LOPRI);
71310 + if (result == 0) {
71311 + parent_coord_t *in_parent;
71312 +
71313 + znode_make_dirty(fake);
71314 +
71315 + /* new root is a child of "fake" node */
71316 + write_lock_tree(tree);
71317 +
71318 + ++tree->height;
71319 +
71320 + /* recalculate max balance overhead */
71321 + tree->estimate_one_insert =
71322 + estimate_one_insert_item(tree);
71323 +
71324 + tree->root_block = *znode_get_block(new_root);
71325 + in_parent = &new_root->in_parent;
71326 + init_parent_coord(in_parent, fake);
71327 + /* manually insert new root into sibling
71328 + * list. With this all nodes involved into
71329 + * balancing are connected after balancing is
71330 + * done---useful invariant to check. */
71331 + sibling_list_insert_nolock(new_root, NULL);
71332 + write_unlock_tree(tree);
71333 +
71334 + /* insert into new root pointer to the
71335 + @old_root. */
71336 + assert("nikita-1110",
71337 + WITH_DATA(new_root,
71338 + node_is_empty(new_root)));
71339 + write_lock_dk(tree);
71340 + znode_set_ld_key(new_root, reiser4_min_key());
71341 + znode_set_rd_key(new_root, reiser4_max_key());
71342 + write_unlock_dk(tree);
71343 + if (REISER4_DEBUG) {
71344 + ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
71345 + ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
71346 + ZF_SET(old_root, JNODE_ORPHAN);
71347 + }
71348 + result = add_child_ptr(new_root, old_root);
71349 + done_lh(&rlh);
71350 + }
71351 + zrelse(new_root);
71352 + }
71353 + }
71354 + if (result != 0)
71355 + new_root = ERR_PTR(result);
71356 + return new_root;
71357 +}
71358 +
71359 +/* build &reiser4_item_data for inserting child pointer
71360 +
71361 + Build &reiser4_item_data that can be later used to insert pointer to @child
71362 + in its parent.
71363 +
71364 +*/
71365 +void build_child_ptr_data(znode * child /* node pointer to which will be
71366 + * inserted */ ,
71367 + reiser4_item_data * data /* where to store result */ )
71368 +{
71369 + assert("nikita-1116", child != NULL);
71370 + assert("nikita-1117", data != NULL);
71371 +
71372 + /*
71373 + * NOTE: use address of child's blocknr as address of data to be
71374 + * inserted. As result of this data gets into on-disk structure in cpu
71375 + * byte order. internal's create_hook converts it to little endian byte
71376 + * order.
71377 + */
71378 + data->data = (char *)znode_get_block(child);
71379 + /* data -> data is kernel space */
71380 + data->user = 0;
71381 + data->length = sizeof(reiser4_block_nr);
71382 + /* FIXME-VS: hardcoded internal item? */
71383 +
71384 + /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
71385 + data->iplug = item_plugin_by_id(NODE_POINTER_ID);
71386 +}
71387 +
71388 +/* add pointer to @child into empty @parent.
71389 +
71390 + This is used when pointer to old root is inserted into new root which is
71391 + empty.
71392 +*/
71393 +static int add_child_ptr(znode * parent, znode * child)
71394 +{
71395 + coord_t coord;
71396 + reiser4_item_data data;
71397 + int result;
71398 + reiser4_key key;
71399 +
71400 + assert("nikita-1111", parent != NULL);
71401 + assert("nikita-1112", child != NULL);
71402 + assert("nikita-1115",
71403 + znode_get_level(parent) == znode_get_level(child) + 1);
71404 +
71405 + result = zload(parent);
71406 + if (result != 0)
71407 + return result;
71408 + assert("nikita-1113", node_is_empty(parent));
71409 + coord_init_first_unit(&coord, parent);
71410 +
71411 + build_child_ptr_data(child, &data);
71412 + data.arg = NULL;
71413 +
71414 + read_lock_dk(znode_get_tree(parent));
71415 + key = *znode_get_ld_key(child);
71416 + read_unlock_dk(znode_get_tree(parent));
71417 +
71418 + result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
71419 + NULL);
71420 + znode_make_dirty(parent);
71421 + zrelse(parent);
71422 + return result;
71423 +}
71424 +
71425 +/* actually remove tree root */
71426 +static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
71427 + * being removed */,
71428 + znode * old_root /* root node that is being
71429 + * removed */ ,
71430 + znode * new_root /* new root---sole child of
71431 + * @old_root */,
71432 + const reiser4_block_nr * new_root_blk /* disk address of
71433 + * @new_root */)
71434 +{
71435 + znode *uber;
71436 + int result;
71437 + lock_handle handle_for_uber;
71438 +
71439 + assert("umka-265", tree != NULL);
71440 + assert("nikita-1198", new_root != NULL);
71441 + assert("nikita-1199",
71442 + znode_get_level(new_root) + 1 == znode_get_level(old_root));
71443 +
71444 + assert("nikita-1201", znode_is_write_locked(old_root));
71445 +
71446 + assert("nikita-1203",
71447 + disk_addr_eq(new_root_blk, znode_get_block(new_root)));
71448 +
71449 + init_lh(&handle_for_uber);
71450 + /* obtain and lock "fake" znode protecting changes in tree height. */
71451 + result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
71452 + &handle_for_uber);
71453 + if (result == 0) {
71454 + uber = handle_for_uber.node;
71455 +
71456 + znode_make_dirty(uber);
71457 +
71458 + /* don't take long term lock a @new_root. Take spinlock. */
71459 +
71460 + write_lock_tree(tree);
71461 +
71462 + tree->root_block = *new_root_blk;
71463 + --tree->height;
71464 +
71465 + /* recalculate max balance overhead */
71466 + tree->estimate_one_insert = estimate_one_insert_item(tree);
71467 +
71468 + assert("nikita-1202",
71469 + tree->height == znode_get_level(new_root));
71470 +
71471 + /* new root is child on "fake" node */
71472 + init_parent_coord(&new_root->in_parent, uber);
71473 + ++uber->c_count;
71474 +
71475 + /* sibling_list_insert_nolock(new_root, NULL); */
71476 + write_unlock_tree(tree);
71477 +
71478 + /* reinitialise old root. */
71479 + result = node_plugin_by_node(old_root)->init(old_root);
71480 + znode_make_dirty(old_root);
71481 + if (result == 0) {
71482 + assert("nikita-1279", node_is_empty(old_root));
71483 + ZF_SET(old_root, JNODE_HEARD_BANSHEE);
71484 + old_root->c_count = 0;
71485 + }
71486 + }
71487 + done_lh(&handle_for_uber);
71488 +
71489 + return result;
71490 +}
71491 +
71492 +/* remove tree root
71493 +
71494 + This function removes tree root, decreasing tree height by one. Tree root
71495 + and its only child (that is going to become new tree root) are write locked
71496 + at the entry.
71497 +
71498 + To remove tree root we need to take lock on special "fake" znode that
71499 + protects changes of tree height. See comments in reiser4_add_tree_root() for
71500 + more on this.
71501 +
71502 + Also parent pointers have to be updated in
71503 + old and new root. To simplify code, function is split into two parts: outer
71504 + reiser4_kill_tree_root() collects all necessary arguments and calls
71505 + reiser4_kill_root() to do the actual job.
71506 +
71507 +*/
71508 +int reiser4_kill_tree_root(znode * old_root /* tree root that we are
71509 + removing*/)
71510 +{
71511 + int result;
71512 + coord_t down_link;
71513 + znode *new_root;
71514 + reiser4_tree *tree;
71515 +
71516 + assert("umka-266", current_tree != NULL);
71517 + assert("nikita-1194", old_root != NULL);
71518 + assert("nikita-1196", znode_is_root(old_root));
71519 + assert("nikita-1200", node_num_items(old_root) == 1);
71520 + assert("nikita-1401", znode_is_write_locked(old_root));
71521 +
71522 + coord_init_first_unit(&down_link, old_root);
71523 +
71524 + tree = znode_get_tree(old_root);
71525 + new_root = child_znode(&down_link, old_root, 0, 1);
71526 + if (!IS_ERR(new_root)) {
71527 + result =
71528 + reiser4_kill_root(tree, old_root, new_root,
71529 + znode_get_block(new_root));
71530 + zput(new_root);
71531 + } else
71532 + result = PTR_ERR(new_root);
71533 +
71534 + return result;
71535 +}
71536 +
71537 +/* Make Linus happy.
71538 + Local variables:
71539 + c-indentation-style: "K&R"
71540 + mode-name: "LC"
71541 + c-basic-offset: 8
71542 + tab-width: 8
71543 + fill-column: 120
71544 + scroll-step: 1
71545 + End:
71546 +*/
71547 diff -urN linux-2.6.20.orig/fs/reiser4/tree_mod.h linux-2.6.20/fs/reiser4/tree_mod.h
71548 --- linux-2.6.20.orig/fs/reiser4/tree_mod.h 1970-01-01 03:00:00.000000000 +0300
71549 +++ linux-2.6.20/fs/reiser4/tree_mod.h 2007-05-06 14:50:43.887034467 +0400
71550 @@ -0,0 +1,29 @@
71551 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71552 + * reiser4/README */
71553 +
71554 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
71555 + * comments. */
71556 +
71557 +#if !defined( __REISER4_TREE_MOD_H__ )
71558 +#define __REISER4_TREE_MOD_H__
71559 +
71560 +#include "forward.h"
71561 +
71562 +znode *reiser4_new_node(znode * brother, tree_level level);
71563 +znode *reiser4_add_tree_root(znode * old_root, znode * fake);
71564 +int reiser4_kill_tree_root(znode * old_root);
71565 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
71566 +
71567 +/* __REISER4_TREE_MOD_H__ */
71568 +#endif
71569 +
71570 +/* Make Linus happy.
71571 + Local variables:
71572 + c-indentation-style: "K&R"
71573 + mode-name: "LC"
71574 + c-basic-offset: 8
71575 + tab-width: 8
71576 + fill-column: 120
71577 + scroll-step: 1
71578 + End:
71579 +*/
71580 diff -urN linux-2.6.20.orig/fs/reiser4/tree_walk.c linux-2.6.20/fs/reiser4/tree_walk.c
71581 --- linux-2.6.20.orig/fs/reiser4/tree_walk.c 1970-01-01 03:00:00.000000000 +0300
71582 +++ linux-2.6.20/fs/reiser4/tree_walk.c 2007-05-06 14:50:43.887034467 +0400
71583 @@ -0,0 +1,927 @@
71584 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71585 + * reiser4/README */
71586 +
71587 +/* Routines and macros to:
71588 +
71589 + get_left_neighbor()
71590 +
71591 + get_right_neighbor()
71592 +
71593 + get_parent()
71594 +
71595 + get_first_child()
71596 +
71597 + get_last_child()
71598 +
71599 + various routines to walk the whole tree and do things to it like
71600 + repack it, or move it to tertiary storage. Please make them as
71601 + generic as is reasonable.
71602 +
71603 +*/
71604 +
71605 +#include "forward.h"
71606 +#include "debug.h"
71607 +#include "dformat.h"
71608 +#include "coord.h"
71609 +#include "plugin/item/item.h"
71610 +#include "jnode.h"
71611 +#include "znode.h"
71612 +#include "tree_walk.h"
71613 +#include "tree.h"
71614 +#include "super.h"
71615 +
71616 +/* These macros are used internally in tree_walk.c in attempt to make
71617 + lock_neighbor() code usable to build lock_parent(), lock_right_neighbor,
71618 + lock_left_neighbor */
71619 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
71620 +#define FIELD_OFFSET(name) offsetof(znode, name)
71621 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
71622 +#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
71623 +#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
71624 +
71625 +/* This is the generic procedure to get and lock `generic' neighbor (left or
71626 + right neighbor or parent). It implements common algorithm for all cases of
71627 + getting lock on neighbor node, only znode structure field is different in
71628 + each case. This is parameterized by ptr_offset argument, which is byte
71629 + offset for the pointer to the desired neighbor within the current node's
71630 + znode structure. This function should be called with the tree lock held */
71631 +static int lock_neighbor(
71632 + /* resulting lock handle */
71633 + lock_handle * result,
71634 + /* znode to lock */
71635 + znode * node,
71636 + /* pointer to neighbor (or parent) znode field offset, in bytes from
71637 + the base address of znode structure */
71638 + int ptr_offset,
71639 + /* lock mode for longterm_lock_znode call */
71640 + znode_lock_mode mode,
71641 + /* lock request for longterm_lock_znode call */
71642 + znode_lock_request req,
71643 + /* GN_* flags */
71644 + int flags, int rlocked)
71645 +{
71646 + reiser4_tree *tree = znode_get_tree(node);
71647 + znode *neighbor;
71648 + int ret;
71649 +
71650 + assert("umka-236", node != NULL);
71651 + assert("umka-237", tree != NULL);
71652 + assert_rw_locked(&(tree->tree_lock));
71653 +
71654 + if (flags & GN_TRY_LOCK)
71655 + req |= ZNODE_LOCK_NONBLOCK;
71656 + if (flags & GN_SAME_ATOM)
71657 + req |= ZNODE_LOCK_DONT_FUSE;
71658 +
71659 + /* get neighbor's address by using of sibling link, quit while loop
71660 + (and return) if link is not available. */
71661 + while (1) {
71662 + neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
71663 +
71664 + /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
71665 + * node pointed by it is not connected.
71666 + *
71667 + * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
71668 + * check and allows passing reference to not connected znode to
71669 + * subsequent longterm_lock_znode() call. This kills possible
71670 + * busy loop if we are trying to get longterm lock on locked but
71671 + * not yet connected parent node. */
71672 + if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
71673 + || znode_is_connected(neighbor))) {
71674 + return RETERR(-E_NO_NEIGHBOR);
71675 + }
71676 +
71677 + /* protect it from deletion. */
71678 + zref(neighbor);
71679 +
71680 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71681 +
71682 + ret = longterm_lock_znode(result, neighbor, mode, req);
71683 +
71684 + /* The lock handle obtains its own reference, release the one from above. */
71685 + zput(neighbor);
71686 +
71687 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71688 +
71689 + /* restart if node we got reference to is being
71690 + invalidated. we should not get reference to this node
71691 + again. */
71692 + if (ret == -EINVAL)
71693 + continue;
71694 + if (ret)
71695 + return ret;
71696 +
71697 + /* check if neighbor link still points to just locked znode;
71698 + the link could have been changed while the process slept. */
71699 + if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
71700 + return 0;
71701 +
71702 + /* znode was locked by mistake; unlock it and restart locking
71703 + process from beginning. */
71704 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71705 + longterm_unlock_znode(result);
71706 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71707 + }
71708 +}
71709 +
71710 +/* get parent node with longterm lock, accepts GN* flags. */
71711 +int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
71712 + znode * node /* child node */ ,
71713 + znode_lock_mode mode
71714 + /* type of lock: read or write */ ,
71715 + int flags /* GN_* flags */ )
71716 +{
71717 + int result;
71718 +
71719 + read_lock_tree(znode_get_tree(node));
71720 + result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
71721 + ZNODE_LOCK_HIPRI, flags, 1);
71722 + read_unlock_tree(znode_get_tree(node));
71723 + return result;
71724 +}
71725 +
71726 +/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
71727 + bit in @flags parameter */
71728 +/* Audited by: umka (2002.06.14) */
71729 +static inline int
71730 +lock_side_neighbor(lock_handle * result,
71731 + znode * node, znode_lock_mode mode, int flags, int rlocked)
71732 +{
71733 + int ret;
71734 + int ptr_offset;
71735 + znode_lock_request req;
71736 +
71737 + if (flags & GN_GO_LEFT) {
71738 + ptr_offset = LEFT_PTR_OFFSET;
71739 + req = ZNODE_LOCK_LOPRI;
71740 + } else {
71741 + ptr_offset = RIGHT_PTR_OFFSET;
71742 + req = ZNODE_LOCK_HIPRI;
71743 + }
71744 +
71745 + ret =
71746 + lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
71747 +
71748 + if (ret == -E_NO_NEIGHBOR) /* if we walk left or right -E_NO_NEIGHBOR does not
71749 + * guarantee that neighbor is absent in the
71750 + * tree; in this case we return -ENOENT --
71751 + * means neighbor at least not found in
71752 + * cache */
71753 + return RETERR(-ENOENT);
71754 +
71755 + return ret;
71756 +}
71757 +
71758 +#if REISER4_DEBUG
71759 +
71760 +int check_sibling_list(znode * node)
71761 +{
71762 + znode *scan;
71763 + znode *next;
71764 +
71765 + assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
71766 +
71767 + if (node == NULL)
71768 + return 1;
71769 +
71770 + if (ZF_ISSET(node, JNODE_RIP))
71771 + return 1;
71772 +
71773 + assert("nikita-3270", node != NULL);
71774 + assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
71775 +
71776 + for (scan = node; znode_is_left_connected(scan); scan = next) {
71777 + next = scan->left;
71778 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71779 + assert("nikita-3271", znode_is_right_connected(next));
71780 + assert("nikita-3272", next->right == scan);
71781 + } else
71782 + break;
71783 + }
71784 + for (scan = node; znode_is_right_connected(scan); scan = next) {
71785 + next = scan->right;
71786 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71787 + assert("nikita-3273", znode_is_left_connected(next));
71788 + assert("nikita-3274", next->left == scan);
71789 + } else
71790 + break;
71791 + }
71792 + return 1;
71793 +}
71794 +
71795 +#endif
71796 +
71797 +/* Znode sibling pointers maintenence. */
71798 +
71799 +/* Znode sibling pointers are established between any neighbored nodes which are
71800 + in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
71801 + JNODE_RIGHT_CONNECTED), if left or right sibling pointer contains actual
71802 + value (even NULL), corresponded JNODE_*_CONNECTED bit is set.
71803 +
71804 + Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
71805 + take care about searching (hash table lookup may be required) of znode
71806 + neighbors, establishing sibling pointers between them and setting
71807 + JNODE_*_CONNECTED state bits. */
71808 +
71809 +/* adjusting of sibling pointers and `connected' states for two
71810 + neighbors; works if one neighbor is NULL (was not found). */
71811 +
71812 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
71813 +void link_left_and_right(znode * left, znode * right)
71814 +{
71815 + assert("nikita-3275", check_sibling_list(left));
71816 + assert("nikita-3275", check_sibling_list(right));
71817 +
71818 + if (left != NULL) {
71819 + if (left->right == NULL) {
71820 + left->right = right;
71821 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
71822 +
71823 + ON_DEBUG(left->right_version =
71824 + atomic_inc_return(&delim_key_version);
71825 + );
71826 +
71827 + } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
71828 + && left->right != right) {
71829 +
71830 + ON_DEBUG(left->right->left_version =
71831 + atomic_inc_return(&delim_key_version);
71832 + left->right_version =
71833 + atomic_inc_return(&delim_key_version););
71834 +
71835 + left->right->left = NULL;
71836 + left->right = right;
71837 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
71838 + } else
71839 + /*
71840 + * there is a race condition in renew_sibling_link()
71841 + * and assertions below check that it is only one
71842 + * there. Thread T1 calls renew_sibling_link() without
71843 + * GN_NO_ALLOC flag. zlook() doesn't find neighbor
71844 + * node, but before T1 gets to the
71845 + * link_left_and_right(), another thread T2 creates
71846 + * neighbor node and connects it. check for
71847 + * left->right == NULL above protects T1 from
71848 + * overwriting correct left->right pointer installed
71849 + * by T2.
71850 + */
71851 + assert("nikita-3302",
71852 + right == NULL || left->right == right);
71853 + }
71854 + if (right != NULL) {
71855 + if (right->left == NULL) {
71856 + right->left = left;
71857 + ZF_SET(right, JNODE_LEFT_CONNECTED);
71858 +
71859 + ON_DEBUG(right->left_version =
71860 + atomic_inc_return(&delim_key_version);
71861 + );
71862 +
71863 + } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
71864 + && right->left != left) {
71865 +
71866 + ON_DEBUG(right->left->right_version =
71867 + atomic_inc_return(&delim_key_version);
71868 + right->left_version =
71869 + atomic_inc_return(&delim_key_version););
71870 +
71871 + right->left->right = NULL;
71872 + right->left = left;
71873 + ZF_SET(right, JNODE_LEFT_CONNECTED);
71874 +
71875 + } else
71876 + assert("nikita-3303",
71877 + left == NULL || right->left == left);
71878 + }
71879 + assert("nikita-3275", check_sibling_list(left));
71880 + assert("nikita-3275", check_sibling_list(right));
71881 +}
71882 +
71883 +/* Audited by: umka (2002.06.14) */
71884 +static void link_znodes(znode * first, znode * second, int to_left)
71885 +{
71886 + if (to_left)
71887 + link_left_and_right(second, first);
71888 + else
71889 + link_left_and_right(first, second);
71890 +}
71891 +
71892 +/* getting of next (to left or to right, depend on gn_to_left bit in flags)
71893 + coord's unit position in horizontal direction, even across node
71894 + boundary. Should be called under tree lock, it protects nonexistence of
71895 + sibling link on parent level, if lock_side_neighbor() fails with
71896 + -ENOENT. */
71897 +static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
71898 +{
71899 + int ret;
71900 + znode *node;
71901 + reiser4_tree *tree;
71902 +
71903 + assert("umka-243", coord != NULL);
71904 + assert("umka-244", handle != NULL);
71905 + assert("zam-1069", handle->node == NULL);
71906 +
71907 + ret =
71908 + (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
71909 + coord_next_unit(coord);
71910 + if (!ret)
71911 + return 0;
71912 +
71913 + ret =
71914 + lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
71915 + if (ret)
71916 + return ret;
71917 +
71918 + node = handle->node;
71919 + tree = znode_get_tree(node);
71920 + write_unlock_tree(tree);
71921 +
71922 + coord_init_zero(coord);
71923 +
71924 + /* We avoid synchronous read here if it is specified by flag. */
71925 + if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
71926 + ret = jstartio(ZJNODE(handle->node));
71927 + if (!ret)
71928 + ret = -E_REPEAT;
71929 + goto error_locked;
71930 + }
71931 +
71932 + /* corresponded zrelse() should be called by the clients of
71933 + far_next_coord(), in place when this node gets unlocked. */
71934 + ret = zload(handle->node);
71935 + if (ret)
71936 + goto error_locked;
71937 +
71938 + if (flags & GN_GO_LEFT)
71939 + coord_init_last_unit(coord, node);
71940 + else
71941 + coord_init_first_unit(coord, node);
71942 +
71943 + if (0) {
71944 + error_locked:
71945 + longterm_unlock_znode(handle);
71946 + }
71947 + write_lock_tree(tree);
71948 + return ret;
71949 +}
71950 +
71951 +/* Very significant function which performs a step in horizontal direction
71952 + when sibling pointer is not available. Actually, it is only function which
71953 + does it.
71954 + Note: this function does not restore locking status at exit,
71955 + caller should does care about proper unlocking and zrelsing */
71956 +static int
71957 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
71958 + tree_level level, int flags, int *nr_locked)
71959 +{
71960 + int ret;
71961 + int to_left = flags & GN_GO_LEFT;
71962 + reiser4_block_nr da;
71963 + /* parent of the neighbor node; we set it to parent until not sharing
71964 + of one parent between child and neighbor node is detected */
71965 + znode *side_parent = coord->node;
71966 + reiser4_tree *tree = znode_get_tree(child);
71967 + znode *neighbor = NULL;
71968 +
71969 + assert("umka-245", coord != NULL);
71970 + assert("umka-246", handle != NULL);
71971 + assert("umka-247", child != NULL);
71972 + assert("umka-303", tree != NULL);
71973 +
71974 + init_lh(handle);
71975 + write_lock_tree(tree);
71976 + ret = far_next_coord(coord, handle, flags);
71977 +
71978 + if (ret) {
71979 + if (ret != -ENOENT) {
71980 + write_unlock_tree(tree);
71981 + return ret;
71982 + }
71983 + } else {
71984 + item_plugin *iplug;
71985 +
71986 + if (handle->node != NULL) {
71987 + (*nr_locked)++;
71988 + side_parent = handle->node;
71989 + }
71990 +
71991 + /* does coord object points to internal item? We do not
71992 + support sibling pointers between znode for formatted and
71993 + unformatted nodes and return -E_NO_NEIGHBOR in that case. */
71994 + iplug = item_plugin_by_coord(coord);
71995 + if (!item_is_internal(coord)) {
71996 + link_znodes(child, NULL, to_left);
71997 + write_unlock_tree(tree);
71998 + /* we know there can't be formatted neighbor */
71999 + return RETERR(-E_NO_NEIGHBOR);
72000 + }
72001 + write_unlock_tree(tree);
72002 +
72003 + iplug->s.internal.down_link(coord, NULL, &da);
72004 +
72005 + if (flags & GN_NO_ALLOC) {
72006 + neighbor = zlook(tree, &da);
72007 + } else {
72008 + neighbor =
72009 + zget(tree, &da, side_parent, level,
72010 + reiser4_ctx_gfp_mask_get());
72011 + }
72012 +
72013 + if (IS_ERR(neighbor)) {
72014 + ret = PTR_ERR(neighbor);
72015 + return ret;
72016 + }
72017 +
72018 + if (neighbor)
72019 + /* update delimiting keys */
72020 + set_child_delimiting_keys(coord->node, coord, neighbor);
72021 +
72022 + write_lock_tree(tree);
72023 + }
72024 +
72025 + if (likely(neighbor == NULL ||
72026 + (znode_get_level(child) == znode_get_level(neighbor)
72027 + && child != neighbor)))
72028 + link_znodes(child, neighbor, to_left);
72029 + else {
72030 + warning("nikita-3532",
72031 + "Sibling nodes on the different levels: %i != %i\n",
72032 + znode_get_level(child), znode_get_level(neighbor));
72033 + ret = RETERR(-EIO);
72034 + }
72035 +
72036 + write_unlock_tree(tree);
72037 +
72038 + /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
72039 + if (neighbor != NULL && (flags & GN_NO_ALLOC))
72040 + /* atomic_dec(&ZJNODE(neighbor)->x_count); */
72041 + zput(neighbor);
72042 +
72043 + return ret;
72044 +}
72045 +
72046 +/* This function is for establishing of one side relation. */
72047 +/* Audited by: umka (2002.06.14) */
72048 +static int connect_one_side(coord_t * coord, znode * node, int flags)
72049 +{
72050 + coord_t local;
72051 + lock_handle handle;
72052 + int nr_locked;
72053 + int ret;
72054 +
72055 + assert("umka-248", coord != NULL);
72056 + assert("umka-249", node != NULL);
72057 +
72058 + coord_dup_nocheck(&local, coord);
72059 +
72060 + init_lh(&handle);
72061 +
72062 + ret =
72063 + renew_sibling_link(&local, &handle, node, znode_get_level(node),
72064 + flags | GN_NO_ALLOC, &nr_locked);
72065 +
72066 + if (handle.node != NULL) {
72067 + /* complementary operations for zload() and lock() in far_next_coord() */
72068 + zrelse(handle.node);
72069 + longterm_unlock_znode(&handle);
72070 + }
72071 +
72072 + /* we catch error codes which are not interesting for us because we
72073 + run renew_sibling_link() only for znode connection. */
72074 + if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
72075 + return 0;
72076 +
72077 + return ret;
72078 +}
72079 +
72080 +/* if @child is not in `connected' state, performs hash searches for left and
72081 + right neighbor nodes and establishes horizontal sibling links */
72082 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72083 +int connect_znode(coord_t * parent_coord, znode * child)
72084 +{
72085 + reiser4_tree *tree = znode_get_tree(child);
72086 + int ret = 0;
72087 +
72088 + assert("zam-330", parent_coord != NULL);
72089 + assert("zam-331", child != NULL);
72090 + assert("zam-332", parent_coord->node != NULL);
72091 + assert("umka-305", tree != NULL);
72092 +
72093 + /* it is trivial to `connect' root znode because it can't have
72094 + neighbors */
72095 + if (znode_above_root(parent_coord->node)) {
72096 + child->left = NULL;
72097 + child->right = NULL;
72098 + ZF_SET(child, JNODE_LEFT_CONNECTED);
72099 + ZF_SET(child, JNODE_RIGHT_CONNECTED);
72100 +
72101 + ON_DEBUG(child->left_version =
72102 + atomic_inc_return(&delim_key_version);
72103 + child->right_version =
72104 + atomic_inc_return(&delim_key_version););
72105 +
72106 + return 0;
72107 + }
72108 +
72109 + /* load parent node */
72110 + coord_clear_iplug(parent_coord);
72111 + ret = zload(parent_coord->node);
72112 +
72113 + if (ret != 0)
72114 + return ret;
72115 +
72116 + /* protect `connected' state check by tree_lock */
72117 + read_lock_tree(tree);
72118 +
72119 + if (!znode_is_right_connected(child)) {
72120 + read_unlock_tree(tree);
72121 + /* connect right (default is right) */
72122 + ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
72123 + if (ret)
72124 + goto zrelse_and_ret;
72125 +
72126 + read_lock_tree(tree);
72127 + }
72128 +
72129 + ret = znode_is_left_connected(child);
72130 +
72131 + read_unlock_tree(tree);
72132 +
72133 + if (!ret) {
72134 + ret =
72135 + connect_one_side(parent_coord, child,
72136 + GN_NO_ALLOC | GN_GO_LEFT);
72137 + } else
72138 + ret = 0;
72139 +
72140 + zrelse_and_ret:
72141 + zrelse(parent_coord->node);
72142 +
72143 + return ret;
72144 +}
72145 +
72146 +/* this function is like renew_sibling_link() but allocates neighbor node if
72147 + it doesn't exist and `connects' it. It may require making two steps in
72148 + horizontal direction, first one for neighbor node finding/allocation,
72149 + second one is for finding neighbor of neighbor to connect freshly allocated
72150 + znode. */
72151 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72152 +static int
72153 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
72154 +{
72155 + coord_t local;
72156 + lock_handle empty[2];
72157 + reiser4_tree *tree = znode_get_tree(node);
72158 + znode *neighbor = NULL;
72159 + int nr_locked = 0;
72160 + int ret;
72161 +
72162 + assert("umka-250", coord != NULL);
72163 + assert("umka-251", node != NULL);
72164 + assert("umka-307", tree != NULL);
72165 + assert("umka-308", level <= tree->height);
72166 +
72167 + /* umka (2002.06.14)
72168 + Here probably should be a check for given "level" validness.
72169 + Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
72170 + */
72171 +
72172 + coord_dup(&local, coord);
72173 +
72174 + ret =
72175 + renew_sibling_link(&local, &empty[0], node, level,
72176 + flags & ~GN_NO_ALLOC, &nr_locked);
72177 + if (ret)
72178 + goto out;
72179 +
72180 + /* tree lock is not needed here because we keep parent node(s) locked
72181 + and reference to neighbor znode incremented */
72182 + neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
72183 +
72184 + read_lock_tree(tree);
72185 + ret = znode_is_connected(neighbor);
72186 + read_unlock_tree(tree);
72187 + if (ret) {
72188 + ret = 0;
72189 + goto out;
72190 + }
72191 +
72192 + ret =
72193 + renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
72194 + flags | GN_NO_ALLOC, &nr_locked);
72195 + /* second renew_sibling_link() call is used for znode connection only,
72196 + so we can live with these errors */
72197 + if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
72198 + ret = 0;
72199 +
72200 + out:
72201 +
72202 + for (--nr_locked; nr_locked >= 0; --nr_locked) {
72203 + zrelse(empty[nr_locked].node);
72204 + longterm_unlock_znode(&empty[nr_locked]);
72205 + }
72206 +
72207 + if (neighbor != NULL)
72208 + /* decrement znode reference counter without actually
72209 + releasing it. */
72210 + atomic_dec(&ZJNODE(neighbor)->x_count);
72211 +
72212 + return ret;
72213 +}
72214 +
72215 +/*
72216 + reiser4_get_neighbor() -- lock node's neighbor.
72217 +
72218 + reiser4_get_neighbor() locks node's neighbor (left or right one, depends on
72219 + given parameter) using sibling link to it. If sibling link is not available
72220 + (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one
72221 + level up for information about neighbor's disk address. We lock node's
72222 + parent, if it is common parent for both 'node' and its neighbor, neighbor's
72223 + disk address is in next (to left or to right) down link from link that points
72224 + to original node. If not, we need to lock parent's neighbor, read its content
72225 + and take first(last) downlink with neighbor's disk address. That locking
72226 + could be done by using sibling link and lock_neighbor() function, if sibling
72227 + link exists. In another case we have to go level up again until we find
72228 + common parent or valid sibling link. Then go down
72229 + allocating/connecting/locking/reading nodes until neighbor of first one is
72230 + locked.
72231 +
72232 + @neighbor: result lock handle,
72233 + @node: a node which we lock neighbor of,
72234 + @lock_mode: lock mode {LM_READ, LM_WRITE},
72235 + @flags: logical OR of {GN_*} (see description above) subset.
72236 +
72237 + @return: 0 if success, negative value if lock was impossible due to an error
72238 + or lack of neighbor node.
72239 +*/
72240 +
72241 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72242 +int
72243 +reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72244 + znode_lock_mode lock_mode, int flags)
72245 +{
72246 + reiser4_tree *tree = znode_get_tree(node);
72247 + lock_handle path[REAL_MAX_ZTREE_HEIGHT];
72248 +
72249 + coord_t coord;
72250 +
72251 + tree_level base_level;
72252 + tree_level h = 0;
72253 + int ret;
72254 +
72255 + assert("umka-252", tree != NULL);
72256 + assert("umka-253", neighbor != NULL);
72257 + assert("umka-254", node != NULL);
72258 +
72259 + base_level = znode_get_level(node);
72260 +
72261 + assert("umka-310", base_level <= tree->height);
72262 +
72263 + coord_init_zero(&coord);
72264 +
72265 + again:
72266 + /* first, we try to use simple lock_neighbor() which requires sibling
72267 + link existence */
72268 + read_lock_tree(tree);
72269 + ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
72270 + read_unlock_tree(tree);
72271 + if (!ret) {
72272 + /* load znode content if it was specified */
72273 + if (flags & GN_LOAD_NEIGHBOR) {
72274 + ret = zload(node);
72275 + if (ret)
72276 + longterm_unlock_znode(neighbor);
72277 + }
72278 + return ret;
72279 + }
72280 +
72281 + /* only -ENOENT means we may look upward and try to connect
72282 + @node with its neighbor (if @flags allow us to do it) */
72283 + if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
72284 + return ret;
72285 +
72286 + /* before establishing of sibling link we lock parent node; it is
72287 + required by renew_neighbor() to work. */
72288 + init_lh(&path[0]);
72289 + ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
72290 + if (ret)
72291 + return ret;
72292 + if (znode_above_root(path[0].node)) {
72293 + longterm_unlock_znode(&path[0]);
72294 + return RETERR(-E_NO_NEIGHBOR);
72295 + }
72296 +
72297 + while (1) {
72298 + znode *child = (h == 0) ? node : path[h - 1].node;
72299 + znode *parent = path[h].node;
72300 +
72301 + ret = zload(parent);
72302 + if (ret)
72303 + break;
72304 +
72305 + ret = find_child_ptr(parent, child, &coord);
72306 +
72307 + if (ret) {
72308 + zrelse(parent);
72309 + break;
72310 + }
72311 +
72312 + /* try to establish missing sibling link */
72313 + ret = renew_neighbor(&coord, child, h + base_level, flags);
72314 +
72315 + zrelse(parent);
72316 +
72317 + switch (ret) {
72318 + case 0:
72319 + /* unlocking of parent znode prevents simple
72320 + deadlock situation */
72321 + done_lh(&path[h]);
72322 +
72323 + /* depend on tree level we stay on we repeat first
72324 + locking attempt ... */
72325 + if (h == 0)
72326 + goto again;
72327 +
72328 + /* ... or repeat establishing of sibling link at
72329 + one level below. */
72330 + --h;
72331 + break;
72332 +
72333 + case -ENOENT:
72334 + /* sibling link is not available -- we go
72335 + upward. */
72336 + init_lh(&path[h + 1]);
72337 + ret =
72338 + reiser4_get_parent(&path[h + 1], parent,
72339 + ZNODE_READ_LOCK);
72340 + if (ret)
72341 + goto fail;
72342 + ++h;
72343 + if (znode_above_root(path[h].node)) {
72344 + ret = RETERR(-E_NO_NEIGHBOR);
72345 + goto fail;
72346 + }
72347 + break;
72348 +
72349 + case -E_DEADLOCK:
72350 + /* there was lock request from hi-pri locker. if
72351 + it is possible we unlock last parent node and
72352 + re-lock it again. */
72353 + for (; reiser4_check_deadlock(); h--) {
72354 + done_lh(&path[h]);
72355 + if (h == 0)
72356 + goto fail;
72357 + }
72358 +
72359 + break;
72360 +
72361 + default: /* other errors. */
72362 + goto fail;
72363 + }
72364 + }
72365 + fail:
72366 + ON_DEBUG(check_lock_node_data(node));
72367 + ON_DEBUG(check_lock_data());
72368 +
72369 + /* unlock path */
72370 + do {
72371 + /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
72372 + fail; path[0] is already done_lh-ed, therefore
72373 + longterm_unlock_znode(&path[h]); is not applicable */
72374 + done_lh(&path[h]);
72375 + --h;
72376 + } while (h + 1 != 0);
72377 +
72378 + return ret;
72379 +}
72380 +
72381 +/* remove node from sibling list */
72382 +/* Audited by: umka (2002.06.14) */
72383 +void sibling_list_remove(znode * node)
72384 +{
72385 + reiser4_tree *tree;
72386 +
72387 + tree = znode_get_tree(node);
72388 + assert("umka-255", node != NULL);
72389 + assert_rw_write_locked(&(tree->tree_lock));
72390 + assert("nikita-3275", check_sibling_list(node));
72391 +
72392 + write_lock_dk(tree);
72393 + if (znode_is_right_connected(node) && node->right != NULL &&
72394 + znode_is_left_connected(node) && node->left != NULL) {
72395 + assert("zam-32245",
72396 + keyeq(znode_get_rd_key(node),
72397 + znode_get_ld_key(node->right)));
72398 + znode_set_rd_key(node->left, znode_get_ld_key(node->right));
72399 + }
72400 + write_unlock_dk(tree);
72401 +
72402 + if (znode_is_right_connected(node) && node->right != NULL) {
72403 + assert("zam-322", znode_is_left_connected(node->right));
72404 + node->right->left = node->left;
72405 + ON_DEBUG(node->right->left_version =
72406 + atomic_inc_return(&delim_key_version);
72407 + );
72408 + }
72409 + if (znode_is_left_connected(node) && node->left != NULL) {
72410 + assert("zam-323", znode_is_right_connected(node->left));
72411 + node->left->right = node->right;
72412 + ON_DEBUG(node->left->right_version =
72413 + atomic_inc_return(&delim_key_version);
72414 + );
72415 + }
72416 +
72417 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
72418 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72419 + ON_DEBUG(node->left = node->right = NULL;
72420 + node->left_version = atomic_inc_return(&delim_key_version);
72421 + node->right_version = atomic_inc_return(&delim_key_version););
72422 + assert("nikita-3276", check_sibling_list(node));
72423 +}
72424 +
72425 +/* disconnect node from sibling list */
72426 +void sibling_list_drop(znode * node)
72427 +{
72428 + znode *right;
72429 + znode *left;
72430 +
72431 + assert("nikita-2464", node != NULL);
72432 + assert("nikita-3277", check_sibling_list(node));
72433 +
72434 + right = node->right;
72435 + if (right != NULL) {
72436 + assert("nikita-2465", znode_is_left_connected(right));
72437 + right->left = NULL;
72438 + ON_DEBUG(right->left_version =
72439 + atomic_inc_return(&delim_key_version);
72440 + );
72441 + }
72442 + left = node->left;
72443 + if (left != NULL) {
72444 + assert("zam-323", znode_is_right_connected(left));
72445 + left->right = NULL;
72446 + ON_DEBUG(left->right_version =
72447 + atomic_inc_return(&delim_key_version);
72448 + );
72449 + }
72450 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
72451 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72452 + ON_DEBUG(node->left = node->right = NULL;
72453 + node->left_version = atomic_inc_return(&delim_key_version);
72454 + node->right_version = atomic_inc_return(&delim_key_version););
72455 +}
72456 +
72457 +/* Insert new node into sibling list. Regular balancing inserts new node
72458 + after (at right side) existing and locked node (@before), except one case
72459 + of adding new tree root node. @before should be NULL in that case. */
72460 +void sibling_list_insert_nolock(znode * new, znode * before)
72461 +{
72462 + assert("zam-334", new != NULL);
72463 + assert("nikita-3298", !znode_is_left_connected(new));
72464 + assert("nikita-3299", !znode_is_right_connected(new));
72465 + assert("nikita-3300", new->left == NULL);
72466 + assert("nikita-3301", new->right == NULL);
72467 + assert("nikita-3278", check_sibling_list(new));
72468 + assert("nikita-3279", check_sibling_list(before));
72469 +
72470 + if (before != NULL) {
72471 + assert("zam-333", znode_is_connected(before));
72472 + new->right = before->right;
72473 + new->left = before;
72474 + ON_DEBUG(new->right_version =
72475 + atomic_inc_return(&delim_key_version);
72476 + new->left_version =
72477 + atomic_inc_return(&delim_key_version););
72478 + if (before->right != NULL) {
72479 + before->right->left = new;
72480 + ON_DEBUG(before->right->left_version =
72481 + atomic_inc_return(&delim_key_version);
72482 + );
72483 + }
72484 + before->right = new;
72485 + ON_DEBUG(before->right_version =
72486 + atomic_inc_return(&delim_key_version);
72487 + );
72488 + } else {
72489 + new->right = NULL;
72490 + new->left = NULL;
72491 + ON_DEBUG(new->right_version =
72492 + atomic_inc_return(&delim_key_version);
72493 + new->left_version =
72494 + atomic_inc_return(&delim_key_version););
72495 + }
72496 + ZF_SET(new, JNODE_LEFT_CONNECTED);
72497 + ZF_SET(new, JNODE_RIGHT_CONNECTED);
72498 + assert("nikita-3280", check_sibling_list(new));
72499 + assert("nikita-3281", check_sibling_list(before));
72500 +}
72501 +
72502 +/*
72503 + Local variables:
72504 + c-indentation-style: "K&R"
72505 + mode-name: "LC"
72506 + c-basic-offset: 8
72507 + tab-width: 8
72508 + fill-column: 80
72509 + End:
72510 +*/
72511 diff -urN linux-2.6.20.orig/fs/reiser4/tree_walk.h linux-2.6.20/fs/reiser4/tree_walk.h
72512 --- linux-2.6.20.orig/fs/reiser4/tree_walk.h 1970-01-01 03:00:00.000000000 +0300
72513 +++ linux-2.6.20/fs/reiser4/tree_walk.h 2007-05-06 14:50:43.887034467 +0400
72514 @@ -0,0 +1,125 @@
72515 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
72516 +
72517 +/* definitions of reiser4 tree walk functions */
72518 +
72519 +#ifndef __FS_REISER4_TREE_WALK_H__
72520 +#define __FS_REISER4_TREE_WALK_H__
72521 +
72522 +#include "debug.h"
72523 +#include "forward.h"
72524 +
72525 +/* establishes horizontal links between cached znodes */
72526 +int connect_znode(coord_t * coord, znode * node);
72527 +
72528 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
72529 + have the following common arguments:
72530 +
72531 + return codes:
72532 +
72533 + @return : 0 - OK,
72534 +
72535 +ZAM-FIXME-HANS: wrong return code name. Change them all.
72536 + -ENOENT - neighbor is not in cache, what is detected by sibling
72537 + link absence.
72538 +
72539 + -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
72540 + found (because we are left-/right- most node of the
72541 + tree, for example). Also, this return code is for
72542 + reiser4_get_parent() when we see no parent link -- it
72543 + means that our node is root node.
72544 +
72545 + -E_DEADLOCK - deadlock detected (request from high-priority process
72546 + received), other error codes are conformed to
72547 + /usr/include/asm/errno.h .
72548 +*/
72549 +
72550 +int
72551 +reiser4_get_parent_flags(lock_handle * result, znode * node,
72552 + znode_lock_mode mode, int flags);
72553 +
72554 +/* bits definition for reiser4_get_neighbor function `flags' arg. */
72555 +typedef enum {
72556 + /* If sibling pointer is NULL, this flag allows get_neighbor() to try to
72557 + * find not allocated not connected neigbor by going though upper
72558 + * levels */
72559 + GN_CAN_USE_UPPER_LEVELS = 0x1,
72560 + /* locking left neighbor instead of right one */
72561 + GN_GO_LEFT = 0x2,
72562 + /* automatically load neighbor node content */
72563 + GN_LOAD_NEIGHBOR = 0x4,
72564 + /* return -E_REPEAT if can't lock */
72565 + GN_TRY_LOCK = 0x8,
72566 + /* used internally in tree_walk.c, causes renew_sibling to not
72567 + allocate neighbor znode, but only search for it in znode cache */
72568 + GN_NO_ALLOC = 0x10,
72569 + /* do not go across atom boundaries */
72570 + GN_SAME_ATOM = 0x20,
72571 + /* allow to lock not connected nodes */
72572 + GN_ALLOW_NOT_CONNECTED = 0x40,
72573 + /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
72574 + GN_ASYNC = 0x80
72575 +} znode_get_neigbor_flags;
72576 +
72577 +/* A commonly used wrapper for reiser4_get_parent_flags(). */
72578 +static inline int reiser4_get_parent(lock_handle * result, znode * node,
72579 + znode_lock_mode mode)
72580 +{
72581 + return reiser4_get_parent_flags(result, node, mode,
72582 + GN_ALLOW_NOT_CONNECTED);
72583 +}
72584 +
72585 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72586 + znode_lock_mode lock_mode, int flags);
72587 +
72588 +/* there are wrappers for most common usages of reiser4_get_neighbor() */
72589 +static inline int
72590 +reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
72591 + int flags)
72592 +{
72593 + return reiser4_get_neighbor(result, node, lock_mode,
72594 + flags | GN_GO_LEFT);
72595 +}
72596 +
72597 +static inline int
72598 +reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
72599 + int flags)
72600 +{
72601 + ON_DEBUG(check_lock_node_data(node));
72602 + ON_DEBUG(check_lock_data());
72603 + return reiser4_get_neighbor(result, node, lock_mode,
72604 + flags & (~GN_GO_LEFT));
72605 +}
72606 +
72607 +extern void sibling_list_remove(znode * node);
72608 +extern void sibling_list_drop(znode * node);
72609 +extern void sibling_list_insert_nolock(znode * new, znode * before);
72610 +extern void link_left_and_right(znode * left, znode * right);
72611 +
72612 +/* Functions called by tree_walk() when tree_walk() ... */
72613 +struct tree_walk_actor {
72614 + /* ... meets a formatted node, */
72615 + int (*process_znode) (tap_t *, void *);
72616 + /* ... meets an extent, */
72617 + int (*process_extent) (tap_t *, void *);
72618 + /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
72619 + * node or extent processing functions. */
72620 + int (*before) (void *);
72621 +};
72622 +
72623 +#if REISER4_DEBUG
72624 +int check_sibling_list(znode * node);
72625 +#else
72626 +#define check_sibling_list(n) (1)
72627 +#endif
72628 +
72629 +#endif /* __FS_REISER4_TREE_WALK_H__ */
72630 +
72631 +/*
72632 + Local variables:
72633 + c-indentation-style: "K&R"
72634 + mode-name: "LC"
72635 + c-basic-offset: 8
72636 + tab-width: 8
72637 + fill-column: 120
72638 + End:
72639 +*/
72640 diff -urN linux-2.6.20.orig/fs/reiser4/txnmgr.c linux-2.6.20/fs/reiser4/txnmgr.c
72641 --- linux-2.6.20.orig/fs/reiser4/txnmgr.c 1970-01-01 03:00:00.000000000 +0300
72642 +++ linux-2.6.20/fs/reiser4/txnmgr.c 2007-05-06 14:50:43.895036966 +0400
72643 @@ -0,0 +1,3164 @@
72644 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72645 + * reiser4/README */
72646 +
72647 +/* Joshua MacDonald wrote the first draft of this code. */
72648 +
72649 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
72650 +filesystem scales only as well as its worst locking design. You need to
72651 +substantially restructure this code. Josh was not as experienced a programmer
72652 +as you. Particularly review how the locking style differs from what you did
72653 +for znodes usingt hi-lo priority locking, and present to me an opinion on
72654 +whether the differences are well founded. */
72655 +
72656 +/* I cannot help but to disagree with the sentiment above. Locking of
72657 + * transaction manager is _not_ badly designed, and, at the very least, is not
72658 + * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
72659 + * locking on znodes, especially on the root node of the tree. --nikita,
72660 + * 2003.10.13 */
72661 +
72662 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
72663 + txnmgr processes capture_block requests and manages the relationship between jnodes and
72664 + atoms through the various stages of a transcrash, and it also oversees the fusion and
72665 + capture-on-copy processes. The main difficulty with this task is maintaining a
72666 + deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
72667 + difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
72668 + must be broken. The main requirement is that atom-fusion be deadlock free, so once you
72669 + hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
72670 + that any time you check the atom-pointer of a jnode or handle and then try to lock that
72671 + atom, you must use trylock() and possibly reverse the order.
72672 +
72673 + This code implements the design documented at:
72674 +
72675 + http://namesys.com/txn-doc.html
72676 +
72677 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
72678 +above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
72679 +topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
72680 +year old --- define all technical terms used.
72681 +
72682 +*/
72683 +
72684 +/* Thoughts on the external transaction interface:
72685 +
72686 + In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
72687 + creates state that lasts for the duration of a system call and is called at the start
72688 + of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
72689 + occupying the scope of a single system call. We wish to give certain applications an
72690 + interface to begin and close (commit) transactions. Since our implementation of
72691 + transactions does not yet support isolation, allowing an application to open a
72692 + transaction implies trusting it to later close the transaction. Part of the
72693 + transaction interface will be aimed at enabling that trust, but the interface for
72694 + actually using transactions is fairly narrow.
72695 +
72696 + BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
72697 + this identifier into a string that a shell-script could use, allowing you to start a
72698 + transaction by issuing a command. Once open, the transcrash should be set in the task
72699 + structure, and there should be options (I suppose) to allow it to be carried across
72700 + fork/exec. A transcrash has several options:
72701 +
72702 + - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
72703 + on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
72704 + capture on reads as well, it should set READ_FUSING.
72705 +
72706 + - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
72707 + eventually close (or else the machine must crash). If the application dies an
72708 + unexpected death with an open transcrash, for example, or if it hangs for a long
72709 + duration, one solution (to avoid crashing the machine) is to simply close it anyway.
72710 + This is a dangerous option, but it is one way to solve the problem until isolated
72711 + transcrashes are available for untrusted applications.
72712 +
72713 + It seems to be what databases do, though it is unclear how one avoids a DoS attack
72714 + creating a vulnerability based on resource starvation. Guaranteeing that some
72715 + minimum amount of computational resources are made available would seem more correct
72716 + than guaranteeing some amount of time. When we again have someone to code the work,
72717 + this issue should be considered carefully. -Hans
72718 +
72719 + RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
72720 + many dirty blocks it expects. The reserve_blocks interface should be called at a point
72721 + where it is safe for the application to fail, because the system may not be able to
72722 + grant the allocation and the application must be able to back-out. For this reason,
72723 + the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
72724 + the application may also wish to extend the allocation after beginning its transcrash.
72725 +
72726 + CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
72727 + modifications that require transaction protection. When isolated transactions are
72728 + supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
72729 + RESERVE_BLOCKS call fails for the application, it should "abort" by calling
72730 + CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
72731 + why, for safety, the application should call RESERVE_BLOCKS before making any changes).
72732 +
72733 + For actually implementing these out-of-system-call-scopped transcrashes, the
72734 + reiser4_context has a "txn_handle *trans" pointer that may be set to an open
72735 + transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
72736 + "struct kmem_cache *_txnh_slab" created for that purpose in this file.
72737 +*/
72738 +
72739 +/* Extending the other system call interfaces for future transaction features:
72740 +
72741 + Specialized applications may benefit from passing flags to the ordinary system call
72742 + interface such as read(), write(), or stat(). For example, the application specifies
72743 + WRITE_FUSING by default but wishes to add that a certain read() command should be
72744 + treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
72745 + read, or the file-data read? These issues are straight-forward, but there are a lot of
72746 + them and adding the necessary flags-passing code will be tedious.
72747 +
72748 + When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
72749 + flag, which specifies that although it is a read operation being requested, a
72750 + write-lock should be taken. The reason is that read-locks are shared while write-locks
72751 + are exclusive, so taking a read-lock when a later-write is known in advance will often
72752 + leads to deadlock. If a reader knows it will write later, it should issue read
72753 + requests with the RMW flag set.
72754 +*/
72755 +
72756 +/*
72757 + The znode/atom deadlock avoidance.
72758 +
72759 + FIXME(Zam): writing of this comment is in progress.
72760 +
72761 + The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of atom's
72762 + long-term locking, which makes reiser4 locking scheme more complex. It had
72763 + deadlocks until we implement deadlock avoidance algorithms. That deadlocks
72764 + looked as the following: one stopped thread waits for a long-term lock on
72765 + znode, the thread who owns that lock waits when fusion with another atom will
72766 + be allowed.
72767 +
72768 + The source of the deadlocks is an optimization of not capturing index nodes
72769 + for read. Let's prove it. Suppose we have dumb node capturing scheme which
72770 + unconditionally captures each block before locking it.
72771 +
72772 + That scheme has no deadlocks. Let's begin with the thread which stage is
72773 + ASTAGE_CAPTURE_WAIT and it waits for a znode lock. The thread can't wait for
72774 + a capture because it's stage allows fusion with any atom except which are
72775 + being committed currently. A process of atom commit can't deadlock because
72776 + atom commit procedure does not acquire locks and does not fuse with other
72777 + atoms. Reiser4 does capturing right before going to sleep inside the
72778 + longtertm_lock_znode() function, it means the znode which we want to lock is
72779 + already captured and its atom is in ASTAGE_CAPTURE_WAIT stage. If we
72780 + continue the analysis we understand that no one process in the sequence may
72781 + waits atom fusion. Thereby there are no deadlocks of described kind.
72782 +
72783 + The capturing optimization makes the deadlocks possible. A thread can wait a
72784 + lock which owner did not captured that node. The lock owner's current atom
72785 + is not fused with the first atom and it does not get a ASTAGE_CAPTURE_WAIT
72786 + state. A deadlock is possible when that atom meets another one which is in
72787 + ASTAGE_CAPTURE_WAIT already.
72788 +
72789 + The deadlock avoidance scheme includes two algorithms:
72790 +
72791 + First algorithm is used when a thread captures a node which is locked but not
72792 + captured by another thread. Those nodes are marked MISSED_IN_CAPTURE at the
72793 + moment we skip their capturing. If such a node (marked MISSED_IN_CAPTURE) is
72794 + being captured by a thread with current atom is in ASTAGE_CAPTURE_WAIT, the
72795 + routine which forces all lock owners to join with current atom is executed.
72796 +
72797 + Second algorithm does not allow to skip capturing of already captured nodes.
72798 +
72799 + Both algorithms together prevent waiting a longterm lock without atom fusion
72800 + with atoms of all lock owners, which is a key thing for getting atom/znode
72801 + locking deadlocks.
72802 +*/
72803 +
72804 +/*
72805 + * Transactions and mmap(2).
72806 + *
72807 + * 1. Transactions are not supported for accesses through mmap(2), because
72808 + * this would effectively amount to user-level transactions whose duration
72809 + * is beyond control of the kernel.
72810 + *
72811 + * 2. That said, we still want to preserve some decency with regard to
72812 + * mmap(2). During normal write(2) call, following sequence of events
72813 + * happens:
72814 + *
72815 + * 1. page is created;
72816 + *
72817 + * 2. jnode is created, dirtied and captured into current atom.
72818 + *
72819 + * 3. extent is inserted and modified.
72820 + *
72821 + * Steps (2) and (3) take place under long term lock on the twig node.
72822 + *
72823 + * When file is accessed through mmap(2) page is always created during
72824 + * page fault.
72825 + * After this (in reiser4_readpage()->reiser4_readpage_extent()):
72826 + *
72827 + * 1. if access is made to non-hole page new jnode is created, (if
72828 + * necessary)
72829 + *
72830 + * 2. if access is made to the hole page, jnode is not created (XXX
72831 + * not clear why).
72832 + *
72833 + * Also, even if page is created by write page fault it is not marked
72834 + * dirty immediately by handle_mm_fault(). Probably this is to avoid races
72835 + * with page write-out.
72836 + *
72837 + * Dirty bit installed by hardware is only transferred to the struct page
72838 + * later, when page is unmapped (in zap_pte_range(), or
72839 + * try_to_unmap_one()).
72840 + *
72841 + * So, with mmap(2) we have to handle following irksome situations:
72842 + *
72843 + * 1. there exists modified page (clean or dirty) without jnode
72844 + *
72845 + * 2. there exists modified page (clean or dirty) with clean jnode
72846 + *
72847 + * 3. clean page which is a part of atom can be transparently modified
72848 + * at any moment through mapping without becoming dirty.
72849 + *
72850 + * (1) and (2) can lead to the out-of-memory situation: ->writepage()
72851 + * doesn't know what to do with such pages and ->sync_sb()/->writepages()
72852 + * don't see them, because these methods operate on atoms.
72853 + *
72854 + * (3) can lead to the loss of data: suppose we have dirty page with dirty
72855 + * captured jnode captured by some atom. As part of early flush (for
72856 + * example) page was written out. Dirty bit was cleared on both page and
72857 + * jnode. After this page is modified through mapping, but kernel doesn't
72858 + * notice and just discards page and jnode as part of commit. (XXX
72859 + * actually it doesn't, because to reclaim page ->releasepage() has to be
72860 + * called and before this dirty bit will be transferred to the struct
72861 + * page).
72862 + *
72863 + */
72864 +
72865 +#include "debug.h"
72866 +#include "txnmgr.h"
72867 +#include "jnode.h"
72868 +#include "znode.h"
72869 +#include "block_alloc.h"
72870 +#include "tree.h"
72871 +#include "wander.h"
72872 +#include "ktxnmgrd.h"
72873 +#include "super.h"
72874 +#include "page_cache.h"
72875 +#include "reiser4.h"
72876 +#include "vfs_ops.h"
72877 +#include "inode.h"
72878 +#include "flush.h"
72879 +
72880 +#include <asm/atomic.h>
72881 +#include <linux/types.h>
72882 +#include <linux/fs.h>
72883 +#include <linux/mm.h>
72884 +#include <linux/slab.h>
72885 +#include <linux/pagemap.h>
72886 +#include <linux/writeback.h>
72887 +#include <linux/swap.h> /* for totalram_pages */
72888 +
72889 +static void atom_free(txn_atom * atom);
72890 +
72891 +static int commit_txnh(txn_handle * txnh);
72892 +
72893 +static void wakeup_atom_waitfor_list(txn_atom * atom);
72894 +static void wakeup_atom_waiting_list(txn_atom * atom);
72895 +
72896 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
72897 +
72898 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
72899 +
72900 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
72901 +
72902 +static int capture_init_fusion(jnode * node, txn_handle * txnh,
72903 + txn_capture mode);
72904 +
72905 +static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
72906 +
72907 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
72908 +
72909 +void reiser4_invalidate_list(struct list_head *);
72910 +
72911 +/* GENERIC STRUCTURES */
72912 +
72913 +typedef struct _txn_wait_links txn_wait_links;
72914 +
72915 +struct _txn_wait_links {
72916 + lock_stack *_lock_stack;
72917 + struct list_head _fwaitfor_link;
72918 + struct list_head _fwaiting_link;
72919 + int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72920 + int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72921 +};
72922 +
72923 +/* FIXME: In theory, we should be using the slab cache init & destructor
72924 + methods instead of, e.g., jnode_init, etc. */
72925 +static struct kmem_cache *_atom_slab = NULL;
72926 +/* this is for user-visible, cross system-call transactions. */
72927 +static struct kmem_cache *_txnh_slab = NULL;
72928 +
72929 +/**
72930 + * init_txnmgr_static - create transaction manager slab caches
72931 + *
72932 + * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
72933 + * initialization.
72934 + */
72935 +int init_txnmgr_static(void)
72936 +{
72937 + assert("jmacd-600", _atom_slab == NULL);
72938 + assert("jmacd-601", _txnh_slab == NULL);
72939 +
72940 + ON_DEBUG(atomic_set(&flush_cnt, 0));
72941 +
72942 + _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
72943 + SLAB_HWCACHE_ALIGN |
72944 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
72945 + if (_atom_slab == NULL)
72946 + return RETERR(-ENOMEM);
72947 +
72948 + _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
72949 + SLAB_HWCACHE_ALIGN, NULL, NULL);
72950 + if (_txnh_slab == NULL) {
72951 + kmem_cache_destroy(_atom_slab);
72952 + _atom_slab = NULL;
72953 + return RETERR(-ENOMEM);
72954 + }
72955 +
72956 + return 0;
72957 +}
72958 +
72959 +/**
72960 + * done_txnmgr_static - delete txn_atom and txn_handle caches
72961 + *
72962 + * This is called on reiser4 module unloading or system shutdown.
72963 + */
72964 +void done_txnmgr_static(void)
72965 +{
72966 + destroy_reiser4_cache(&_atom_slab);
72967 + destroy_reiser4_cache(&_txnh_slab);
72968 +}
72969 +
72970 +/**
72971 + * init_txnmgr - initialize a new transaction manager
72972 + * @mgr: pointer to transaction manager embedded in reiser4 super block
72973 + *
72974 + * This is called on mount. Makes necessary initializations.
72975 + */
72976 +void reiser4_init_txnmgr(txn_mgr *mgr)
72977 +{
72978 + assert("umka-169", mgr != NULL);
72979 +
72980 + mgr->atom_count = 0;
72981 + mgr->id_count = 1;
72982 + INIT_LIST_HEAD(&mgr->atoms_list);
72983 + spin_lock_init(&mgr->tmgr_lock);
72984 + mutex_init(&mgr->commit_mutex);
72985 +}
72986 +
72987 +/**
72988 + * reiser4_done_txnmgr - stop transaction manager
72989 + * @mgr: pointer to transaction manager embedded in reiser4 super block
72990 + *
72991 + * This is called on umount. Does sanity checks.
72992 + */
72993 +void reiser4_done_txnmgr(txn_mgr *mgr)
72994 +{
72995 + assert("umka-170", mgr != NULL);
72996 + assert("umka-1701", list_empty_careful(&mgr->atoms_list));
72997 + assert("umka-1702", mgr->atom_count == 0);
72998 +}
72999 +
73000 +/* Initialize a transaction handle. */
73001 +/* Audited by: umka (2002.06.13) */
73002 +static void txnh_init(txn_handle * txnh, txn_mode mode)
73003 +{
73004 + assert("umka-171", txnh != NULL);
73005 +
73006 + txnh->mode = mode;
73007 + txnh->atom = NULL;
73008 + reiser4_ctx_gfp_mask_set();
73009 + txnh->flags = 0;
73010 + spin_lock_init(&txnh->hlock);
73011 + INIT_LIST_HEAD(&txnh->txnh_link);
73012 +}
73013 +
73014 +#if REISER4_DEBUG
73015 +/* Check if a transaction handle is clean. */
73016 +static int txnh_isclean(txn_handle * txnh)
73017 +{
73018 + assert("umka-172", txnh != NULL);
73019 + return txnh->atom == NULL &&
73020 + LOCK_CNT_NIL(spin_locked_txnh);
73021 +}
73022 +#endif
73023 +
73024 +/* Initialize an atom. */
73025 +static void atom_init(txn_atom * atom)
73026 +{
73027 + int level;
73028 +
73029 + assert("umka-173", atom != NULL);
73030 +
73031 + memset(atom, 0, sizeof(txn_atom));
73032 +
73033 + atom->stage = ASTAGE_FREE;
73034 + atom->start_time = jiffies;
73035 +
73036 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
73037 + INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
73038 +
73039 + INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
73040 + INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
73041 + INIT_LIST_HEAD(ATOM_WB_LIST(atom));
73042 + INIT_LIST_HEAD(&atom->inodes);
73043 + spin_lock_init(&(atom->alock));
73044 + /* list of transaction handles */
73045 + INIT_LIST_HEAD(&atom->txnh_list);
73046 + /* link to transaction manager's list of atoms */
73047 + INIT_LIST_HEAD(&atom->atom_link);
73048 + INIT_LIST_HEAD(&atom->fwaitfor_list);
73049 + INIT_LIST_HEAD(&atom->fwaiting_list);
73050 + blocknr_set_init(&atom->delete_set);
73051 + blocknr_set_init(&atom->wandered_map);
73052 +
73053 + init_atom_fq_parts(atom);
73054 +}
73055 +
73056 +#if REISER4_DEBUG
73057 +/* Check if an atom is clean. */
73058 +static int atom_isclean(txn_atom * atom)
73059 +{
73060 + int level;
73061 +
73062 + assert("umka-174", atom != NULL);
73063 +
73064 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73065 + if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
73066 + return 0;
73067 + }
73068 + }
73069 +
73070 + return atom->stage == ASTAGE_FREE &&
73071 + atom->txnh_count == 0 &&
73072 + atom->capture_count == 0 &&
73073 + atomic_read(&atom->refcount) == 0 &&
73074 + (&atom->atom_link == atom->atom_link.next &&
73075 + &atom->atom_link == atom->atom_link.prev) &&
73076 + list_empty_careful(&atom->txnh_list) &&
73077 + list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
73078 + list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
73079 + list_empty_careful(ATOM_WB_LIST(atom)) &&
73080 + list_empty_careful(&atom->fwaitfor_list) &&
73081 + list_empty_careful(&atom->fwaiting_list) &&
73082 + atom_fq_parts_are_clean(atom);
73083 +}
73084 +#endif
73085 +
73086 +/* Begin a transaction in this context. Currently this uses the reiser4_context's
73087 + trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
73088 + this will be extended to allow transaction handles to span several contexts. */
73089 +/* Audited by: umka (2002.06.13) */
73090 +void reiser4_txn_begin(reiser4_context * context)
73091 +{
73092 + assert("jmacd-544", context->trans == NULL);
73093 +
73094 + context->trans = &context->trans_in_ctx;
73095 +
73096 + /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
73097 + transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
73098 + stack allocated right now, but we would like to allow for dynamically allocated
73099 + transcrashes that span multiple system calls.
73100 + */
73101 + txnh_init(context->trans, TXN_WRITE_FUSING);
73102 +}
73103 +
73104 +/* Finish a transaction handle context. */
73105 +int reiser4_txn_end(reiser4_context * context)
73106 +{
73107 + long ret = 0;
73108 + txn_handle *txnh;
73109 +
73110 + assert("umka-283", context != NULL);
73111 + assert("nikita-3012", reiser4_schedulable());
73112 + assert("vs-24", context == get_current_context());
73113 + assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
73114 +
73115 + txnh = context->trans;
73116 + if (txnh != NULL) {
73117 + if (txnh->atom != NULL)
73118 + ret = commit_txnh(txnh);
73119 + assert("jmacd-633", txnh_isclean(txnh));
73120 + context->trans = NULL;
73121 + }
73122 + return ret;
73123 +}
73124 +
73125 +void reiser4_txn_restart(reiser4_context * context)
73126 +{
73127 + reiser4_txn_end(context);
73128 + reiser4_preempt_point();
73129 + reiser4_txn_begin(context);
73130 +}
73131 +
73132 +void reiser4_txn_restart_current(void)
73133 +{
73134 + reiser4_txn_restart(get_current_context());
73135 +}
73136 +
73137 +/* TXN_ATOM */
73138 +
73139 +/* Get the atom belonging to a txnh, which is not locked. Return txnh locked. Locks atom, if atom
73140 + is not NULL. This performs the necessary spin_trylock to break the lock-ordering cycle. May
73141 + return NULL. */
73142 +static txn_atom *txnh_get_atom(txn_handle * txnh)
73143 +{
73144 + txn_atom *atom;
73145 +
73146 + assert("umka-180", txnh != NULL);
73147 + assert_spin_not_locked(&(txnh->hlock));
73148 +
73149 + while (1) {
73150 + spin_lock_txnh(txnh);
73151 + atom = txnh->atom;
73152 +
73153 + if (atom == NULL)
73154 + break;
73155 +
73156 + if (spin_trylock_atom(atom))
73157 + break;
73158 +
73159 + atomic_inc(&atom->refcount);
73160 +
73161 + spin_unlock_txnh(txnh);
73162 + spin_lock_atom(atom);
73163 + spin_lock_txnh(txnh);
73164 +
73165 + if (txnh->atom == atom) {
73166 + atomic_dec(&atom->refcount);
73167 + break;
73168 + }
73169 +
73170 + spin_unlock_txnh(txnh);
73171 + atom_dec_and_unlock(atom);
73172 + }
73173 +
73174 + return atom;
73175 +}
73176 +
73177 +/* Get the current atom and spinlock it if current atom present. May return NULL */
73178 +txn_atom *get_current_atom_locked_nocheck(void)
73179 +{
73180 + reiser4_context *cx;
73181 + txn_atom *atom;
73182 + txn_handle *txnh;
73183 +
73184 + cx = get_current_context();
73185 + assert("zam-437", cx != NULL);
73186 +
73187 + txnh = cx->trans;
73188 + assert("zam-435", txnh != NULL);
73189 +
73190 + atom = txnh_get_atom(txnh);
73191 +
73192 + spin_unlock_txnh(txnh);
73193 + return atom;
73194 +}
73195 +
73196 +/* Get the atom belonging to a jnode, which is initially locked. Return with
73197 + both jnode and atom locked. This performs the necessary spin_trylock to
73198 + break the lock-ordering cycle. Assumes the jnode is already locked, and
73199 + returns NULL if atom is not set. */
73200 +txn_atom *jnode_get_atom(jnode * node)
73201 +{
73202 + txn_atom *atom;
73203 +
73204 + assert("umka-181", node != NULL);
73205 +
73206 + while (1) {
73207 + assert_spin_locked(&(node->guard));
73208 +
73209 + atom = node->atom;
73210 + /* node is not in any atom */
73211 + if (atom == NULL)
73212 + break;
73213 +
73214 + /* If atom is not locked, grab the lock and return */
73215 + if (spin_trylock_atom(atom))
73216 + break;
73217 +
73218 + /* At least one jnode belongs to this atom it guarantees that
73219 + * atom->refcount > 0, we can safely increment refcount. */
73220 + atomic_inc(&atom->refcount);
73221 + spin_unlock_jnode(node);
73222 +
73223 + /* re-acquire spin locks in the right order */
73224 + spin_lock_atom(atom);
73225 + spin_lock_jnode(node);
73226 +
73227 + /* check if node still points to the same atom. */
73228 + if (node->atom == atom) {
73229 + atomic_dec(&atom->refcount);
73230 + break;
73231 + }
73232 +
73233 + /* releasing of atom lock and reference requires not holding
73234 + * locks on jnodes. */
73235 + spin_unlock_jnode(node);
73236 +
73237 + /* We do not sure that this atom has extra references except our
73238 + * one, so we should call proper function which may free atom if
73239 + * last reference is released. */
73240 + atom_dec_and_unlock(atom);
73241 +
73242 + /* lock jnode again for getting valid node->atom pointer
73243 + * value. */
73244 + spin_lock_jnode(node);
73245 + }
73246 +
73247 + return atom;
73248 +}
73249 +
73250 +/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
73251 + by flush code to indicate whether the next node (in some direction) is suitable for
73252 + flushing. */
73253 +int
73254 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
73255 +{
73256 + int compat;
73257 + txn_atom *atom;
73258 +
73259 + assert("umka-182", node != NULL);
73260 + assert("umka-183", check != NULL);
73261 +
73262 + /* Not sure what this function is supposed to do if supplied with @check that is
73263 + neither formatted nor unformatted (bitmap or so). */
73264 + assert("nikita-2373", jnode_is_znode(check)
73265 + || jnode_is_unformatted(check));
73266 +
73267 + /* Need a lock on CHECK to get its atom and to check various state bits.
73268 + Don't need a lock on NODE once we get the atom lock. */
73269 + /* It is not enough to lock two nodes and check (node->atom ==
73270 + check->atom) because atom could be locked and being fused at that
73271 + moment, jnodes of the atom of that state (being fused) can point to
73272 + different objects, but the atom is the same. */
73273 + spin_lock_jnode(check);
73274 +
73275 + atom = jnode_get_atom(check);
73276 +
73277 + if (atom == NULL) {
73278 + compat = 0;
73279 + } else {
73280 + compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
73281 +
73282 + if (compat && jnode_is_znode(check)) {
73283 + compat &= znode_is_connected(JZNODE(check));
73284 + }
73285 +
73286 + if (compat && alloc_check) {
73287 + compat &= (alloc_value == jnode_is_flushprepped(check));
73288 + }
73289 +
73290 + spin_unlock_atom(atom);
73291 + }
73292 +
73293 + spin_unlock_jnode(check);
73294 +
73295 + return compat;
73296 +}
73297 +
73298 +/* Decrement the atom's reference count and if it falls to zero, free it. */
73299 +void atom_dec_and_unlock(txn_atom * atom)
73300 +{
73301 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73302 +
73303 + assert("umka-186", atom != NULL);
73304 + assert_spin_locked(&(atom->alock));
73305 + assert("zam-1039", atomic_read(&atom->refcount) > 0);
73306 +
73307 + if (atomic_dec_and_test(&atom->refcount)) {
73308 + /* take txnmgr lock and atom lock in proper order. */
73309 + if (!spin_trylock_txnmgr(mgr)) {
73310 + /* This atom should exist after we re-acquire its
73311 + * spinlock, so we increment its reference counter. */
73312 + atomic_inc(&atom->refcount);
73313 + spin_unlock_atom(atom);
73314 + spin_lock_txnmgr(mgr);
73315 + spin_lock_atom(atom);
73316 +
73317 + if (!atomic_dec_and_test(&atom->refcount)) {
73318 + spin_unlock_atom(atom);
73319 + spin_unlock_txnmgr(mgr);
73320 + return;
73321 + }
73322 + }
73323 + assert_spin_locked(&(mgr->tmgr_lock));
73324 + atom_free(atom);
73325 + spin_unlock_txnmgr(mgr);
73326 + } else
73327 + spin_unlock_atom(atom);
73328 +}
73329 +
73330 +/* Create new atom and connect it to given transaction handle. This adds the
73331 + atom to the transaction manager's list and sets its reference count to 1, an
73332 + artificial reference which is kept until it commits. We play strange games
73333 + to avoid allocation under jnode & txnh spinlocks.*/
73334 +
73335 +static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
73336 +{
73337 + txn_atom *atom;
73338 + txn_mgr *mgr;
73339 +
73340 + if (REISER4_DEBUG && rofs_tree(current_tree)) {
73341 + warning("nikita-3366", "Creating atom on rofs");
73342 + dump_stack();
73343 + }
73344 +
73345 + if (*atom_alloc == NULL) {
73346 + (*atom_alloc) = kmem_cache_alloc(_atom_slab,
73347 + reiser4_ctx_gfp_mask_get());
73348 +
73349 + if (*atom_alloc == NULL)
73350 + return RETERR(-ENOMEM);
73351 + }
73352 +
73353 + /* and, also, txnmgr spin lock should be taken before jnode and txnh
73354 + locks. */
73355 + mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73356 + spin_lock_txnmgr(mgr);
73357 + spin_lock_txnh(txnh);
73358 +
73359 + /* Check whether new atom still needed */
73360 + if (txnh->atom != NULL) {
73361 + /* NOTE-NIKITA probably it is rather better to free
73362 + * atom_alloc here than thread it up to reiser4_try_capture() */
73363 +
73364 + spin_unlock_txnh(txnh);
73365 + spin_unlock_txnmgr(mgr);
73366 +
73367 + return -E_REPEAT;
73368 + }
73369 +
73370 + atom = *atom_alloc;
73371 + *atom_alloc = NULL;
73372 +
73373 + atom_init(atom);
73374 +
73375 + assert("jmacd-17", atom_isclean(atom));
73376 +
73377 + /*
73378 + * lock ordering is broken here. It is ok, as long as @atom is new
73379 + * and inaccessible for others. We can't use spin_lock_atom or
73380 + * spin_lock(&atom->alock) because they care about locking
73381 + * dependencies. spin_trylock_lock doesn't.
73382 + */
73383 + check_me("", spin_trylock_atom(atom));
73384 +
73385 + /* add atom to the end of transaction manager's list of atoms */
73386 + list_add_tail(&atom->atom_link, &mgr->atoms_list);
73387 + atom->atom_id = mgr->id_count++;
73388 + mgr->atom_count += 1;
73389 +
73390 + /* Release txnmgr lock */
73391 + spin_unlock_txnmgr(mgr);
73392 +
73393 + /* One reference until it commits. */
73394 + atomic_inc(&atom->refcount);
73395 + atom->stage = ASTAGE_CAPTURE_FUSE;
73396 + atom->super = reiser4_get_current_sb();
73397 + capture_assign_txnh_nolock(atom, txnh);
73398 +
73399 + spin_unlock_atom(atom);
73400 + spin_unlock_txnh(txnh);
73401 +
73402 + return -E_REPEAT;
73403 +}
73404 +
73405 +/* Return true if an atom is currently "open". */
73406 +static int atom_isopen(const txn_atom * atom)
73407 +{
73408 + assert("umka-185", atom != NULL);
73409 +
73410 + return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
73411 +}
73412 +
73413 +/* Return the number of pointers to this atom that must be updated during fusion. This
73414 + approximates the amount of work to be done. Fusion chooses the atom with fewer
73415 + pointers to fuse into the atom with more pointers. */
73416 +static int atom_pointer_count(const txn_atom * atom)
73417 +{
73418 + assert("umka-187", atom != NULL);
73419 +
73420 + /* This is a measure of the amount of work needed to fuse this atom
73421 + * into another. */
73422 + return atom->txnh_count + atom->capture_count;
73423 +}
73424 +
73425 +/* Called holding the atom lock, this removes the atom from the transaction manager list
73426 + and frees it. */
73427 +static void atom_free(txn_atom * atom)
73428 +{
73429 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73430 +
73431 + assert("umka-188", atom != NULL);
73432 + assert_spin_locked(&(atom->alock));
73433 +
73434 + /* Remove from the txn_mgr's atom list */
73435 + assert_spin_locked(&(mgr->tmgr_lock));
73436 + mgr->atom_count -= 1;
73437 + list_del_init(&atom->atom_link);
73438 +
73439 + /* Clean the atom */
73440 + assert("jmacd-16",
73441 + (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
73442 + atom->stage = ASTAGE_FREE;
73443 +
73444 + blocknr_set_destroy(&atom->delete_set);
73445 + blocknr_set_destroy(&atom->wandered_map);
73446 +
73447 + assert("jmacd-16", atom_isclean(atom));
73448 +
73449 + spin_unlock_atom(atom);
73450 +
73451 + kmem_cache_free(_atom_slab, atom);
73452 +}
73453 +
73454 +static int atom_is_dotard(const txn_atom * atom)
73455 +{
73456 + return time_after(jiffies, atom->start_time +
73457 + get_current_super_private()->tmgr.atom_max_age);
73458 +}
73459 +
73460 +static int atom_can_be_committed(txn_atom * atom)
73461 +{
73462 + assert_spin_locked(&(atom->alock));
73463 + assert("zam-885", atom->txnh_count > atom->nr_waiters);
73464 + return atom->txnh_count == atom->nr_waiters + 1;
73465 +}
73466 +
73467 +/* Return true if an atom should commit now. This is determined by aging, atom
73468 + size or atom flags. */
73469 +static int atom_should_commit(const txn_atom * atom)
73470 +{
73471 + assert("umka-189", atom != NULL);
73472 + return
73473 + (atom->flags & ATOM_FORCE_COMMIT) ||
73474 + ((unsigned)atom_pointer_count(atom) >
73475 + get_current_super_private()->tmgr.atom_max_size)
73476 + || atom_is_dotard(atom);
73477 +}
73478 +
73479 +/* return 1 if current atom exists and requires commit. */
73480 +int current_atom_should_commit(void)
73481 +{
73482 + txn_atom *atom;
73483 + int result = 0;
73484 +
73485 + atom = get_current_atom_locked_nocheck();
73486 + if (atom) {
73487 + result = atom_should_commit(atom);
73488 + spin_unlock_atom(atom);
73489 + }
73490 + return result;
73491 +}
73492 +
73493 +static int atom_should_commit_asap(const txn_atom * atom)
73494 +{
73495 + unsigned int captured;
73496 + unsigned int pinnedpages;
73497 +
73498 + assert("nikita-3309", atom != NULL);
73499 +
73500 + captured = (unsigned)atom->capture_count;
73501 + pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
73502 +
73503 + return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
73504 +}
73505 +
73506 +static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
73507 +{
73508 + jnode *first_dirty;
73509 +
73510 + list_for_each_entry(first_dirty, head, capture_link) {
73511 + if (!(flags & JNODE_FLUSH_COMMIT)) {
73512 + /*
73513 + * skip jnodes which "heard banshee" or having active
73514 + * I/O
73515 + */
73516 + if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
73517 + JF_ISSET(first_dirty, JNODE_WRITEBACK))
73518 + continue;
73519 + }
73520 + return first_dirty;
73521 + }
73522 + return NULL;
73523 +}
73524 +
73525 +/* Get first dirty node from the atom's dirty_nodes[n] lists; return NULL if atom has no dirty
73526 + nodes on atom's lists */
73527 +jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
73528 +{
73529 + jnode *first_dirty;
73530 + tree_level level;
73531 +
73532 + assert_spin_locked(&(atom->alock));
73533 +
73534 + /* The flush starts from LEAF_LEVEL (=1). */
73535 + for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73536 + if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
73537 + continue;
73538 +
73539 + first_dirty =
73540 + find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
73541 + flags);
73542 + if (first_dirty)
73543 + return first_dirty;
73544 + }
73545 +
73546 + /* znode-above-root is on the list #0. */
73547 + return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
73548 +}
73549 +
73550 +static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
73551 +{
73552 + jnode *cur;
73553 +
73554 + assert("zam-905", atom_is_protected(atom));
73555 +
73556 + cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
73557 + while (ATOM_WB_LIST(atom) != &cur->capture_link) {
73558 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
73559 +
73560 + spin_lock_jnode(cur);
73561 + if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
73562 + if (JF_ISSET(cur, JNODE_DIRTY)) {
73563 + queue_jnode(fq, cur);
73564 + } else {
73565 + /* move jnode to atom's clean list */
73566 + list_move_tail(&cur->capture_link,
73567 + ATOM_CLEAN_LIST(atom));
73568 + }
73569 + }
73570 + spin_unlock_jnode(cur);
73571 +
73572 + cur = next;
73573 + }
73574 +}
73575 +
73576 +/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
73577 + * jnodes to disk. */
73578 +static int submit_wb_list(void)
73579 +{
73580 + int ret;
73581 + flush_queue_t *fq;
73582 +
73583 + fq = get_fq_for_current_atom();
73584 + if (IS_ERR(fq))
73585 + return PTR_ERR(fq);
73586 +
73587 + dispatch_wb_list(fq->atom, fq);
73588 + spin_unlock_atom(fq->atom);
73589 +
73590 + ret = reiser4_write_fq(fq, NULL, 1);
73591 + reiser4_fq_put(fq);
73592 +
73593 + return ret;
73594 +}
73595 +
73596 +/* Wait completion of all writes, re-submit atom writeback list if needed. */
73597 +static int current_atom_complete_writes(void)
73598 +{
73599 + int ret;
73600 +
73601 + /* Each jnode from that list was modified and dirtied when it had i/o
73602 + * request running already. After i/o completion we have to resubmit
73603 + * them to disk again.*/
73604 + ret = submit_wb_list();
73605 + if (ret < 0)
73606 + return ret;
73607 +
73608 + /* Wait all i/o completion */
73609 + ret = current_atom_finish_all_fq();
73610 + if (ret)
73611 + return ret;
73612 +
73613 + /* Scan wb list again; all i/o should be completed, we re-submit dirty
73614 + * nodes to disk */
73615 + ret = submit_wb_list();
73616 + if (ret < 0)
73617 + return ret;
73618 +
73619 + /* Wait all nodes we just submitted */
73620 + return current_atom_finish_all_fq();
73621 +}
73622 +
73623 +#if REISER4_DEBUG
73624 +
73625 +static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
73626 +{
73627 + if (atom == NULL) {
73628 + printk("%s: no atom\n", prefix);
73629 + return;
73630 + }
73631 +
73632 + printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
73633 + " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
73634 + atomic_read(&atom->refcount), atom->atom_id, atom->flags,
73635 + atom->txnh_count, atom->capture_count, atom->stage,
73636 + atom->start_time, atom->flushed);
73637 +}
73638 +
73639 +#else /* REISER4_DEBUG */
73640 +
73641 +static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
73642 +
73643 +#endif /* REISER4_DEBUG */
73644 +
73645 +#define TOOMANYFLUSHES (1 << 13)
73646 +
73647 +/* Called with the atom locked and no open "active" transaction handlers except
73648 + ours, this function calls flush_current_atom() until all dirty nodes are
73649 + processed. Then it initiates commit processing.
73650 +
73651 + Called by the single remaining open "active" txnh, which is closing. Other
73652 + open txnhs belong to processes which wait atom commit in commit_txnh()
73653 + routine. They are counted as "waiters" in atom->nr_waiters. Therefore as
73654 + long as we hold the atom lock none of the jnodes can be captured and/or
73655 + locked.
73656 +
73657 + Return value is an error code if commit fails.
73658 +*/
73659 +static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
73660 +{
73661 + reiser4_super_info_data *sbinfo = get_current_super_private();
73662 + long ret = 0;
73663 + /* how many times jnode_flush() was called as a part of attempt to
73664 + * commit this atom. */
73665 + int flushiters;
73666 +
73667 + assert("zam-888", atom != NULL && *atom != NULL);
73668 + assert_spin_locked(&((*atom)->alock));
73669 + assert("zam-887", get_current_context()->trans->atom == *atom);
73670 + assert("jmacd-151", atom_isopen(*atom));
73671 +
73672 + assert("nikita-3184",
73673 + get_current_super_private()->delete_mutex_owner != current);
73674 +
73675 + for (flushiters = 0;; ++flushiters) {
73676 + ret =
73677 + flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
73678 + JNODE_FLUSH_COMMIT,
73679 + LONG_MAX /* nr_to_write */ ,
73680 + nr_submitted, atom, NULL);
73681 + if (ret != -E_REPEAT)
73682 + break;
73683 +
73684 + /* if atom's dirty list contains one znode which is
73685 + HEARD_BANSHEE and is locked we have to allow lock owner to
73686 + continue and uncapture that znode */
73687 + reiser4_preempt_point();
73688 +
73689 + *atom = get_current_atom_locked();
73690 + if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
73691 + warning("nikita-3176",
73692 + "Flushing like mad: %i", flushiters);
73693 + reiser4_info_atom("atom", *atom);
73694 + DEBUGON(flushiters > (1 << 20));
73695 + }
73696 + }
73697 +
73698 + if (ret)
73699 + return ret;
73700 +
73701 + assert_spin_locked(&((*atom)->alock));
73702 +
73703 + if (!atom_can_be_committed(*atom)) {
73704 + spin_unlock_atom(*atom);
73705 + return RETERR(-E_REPEAT);
73706 + }
73707 +
73708 + if ((*atom)->capture_count == 0)
73709 + goto done;
73710 +
73711 + /* Up to this point we have been flushing and after flush is called we
73712 + return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
73713 + at this point, commit should be successful. */
73714 + reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
73715 + ON_DEBUG(((*atom)->committer = current));
73716 + spin_unlock_atom(*atom);
73717 +
73718 + ret = current_atom_complete_writes();
73719 + if (ret)
73720 + return ret;
73721 +
73722 + assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
73723 +
73724 + /* isolate critical code path which should be executed by only one
73725 + * thread using tmgr mutex */
73726 + mutex_lock(&sbinfo->tmgr.commit_mutex);
73727 +
73728 + ret = reiser4_write_logs(nr_submitted);
73729 + if (ret < 0)
73730 + reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
73731 +
73732 + /* The atom->ovrwr_nodes list is processed under commit mutex held
73733 + because of bitmap nodes which are captured by special way in
73734 + reiser4_pre_commit_hook_bitmap(), that way does not include
73735 + capture_fuse_wait() as a capturing of other nodes does -- the commit
73736 + mutex is used for transaction isolation instead. */
73737 + reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
73738 + mutex_unlock(&sbinfo->tmgr.commit_mutex);
73739 +
73740 + reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
73741 + reiser4_invalidate_list(ATOM_WB_LIST(*atom));
73742 + assert("zam-927", list_empty(&(*atom)->inodes));
73743 +
73744 + spin_lock_atom(*atom);
73745 + done:
73746 + reiser4_atom_set_stage(*atom, ASTAGE_DONE);
73747 + ON_DEBUG((*atom)->committer = NULL);
73748 +
73749 + /* Atom's state changes, so wake up everybody waiting for this
73750 + event. */
73751 + wakeup_atom_waiting_list(*atom);
73752 +
73753 + /* Decrement the "until commit" reference, at least one txnh (the caller) is
73754 + still open. */
73755 + atomic_dec(&(*atom)->refcount);
73756 +
73757 + assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
73758 + assert("jmacd-1062", (*atom)->capture_count == 0);
73759 + BUG_ON((*atom)->capture_count != 0);
73760 + assert_spin_locked(&((*atom)->alock));
73761 +
73762 + return ret;
73763 +}
73764 +
73765 +/* TXN_TXNH */
73766 +
73767 +/**
73768 + * force_commit_atom - commit current atom and wait commit completion
73769 + * @txnh:
73770 + *
73771 + * Commits current atom and wait commit completion; current atom and @txnh have
73772 + * to be spinlocked before call, this function unlocks them on exit.
73773 + */
73774 +int force_commit_atom(txn_handle *txnh)
73775 +{
73776 + txn_atom *atom;
73777 +
73778 + assert("zam-837", txnh != NULL);
73779 + assert_spin_locked(&(txnh->hlock));
73780 + assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
73781 +
73782 + atom = txnh->atom;
73783 +
73784 + assert("zam-834", atom != NULL);
73785 + assert_spin_locked(&(atom->alock));
73786 +
73787 + /*
73788 + * Set flags for atom and txnh: forcing atom commit and waiting for
73789 + * commit completion
73790 + */
73791 + txnh->flags |= TXNH_WAIT_COMMIT;
73792 + atom->flags |= ATOM_FORCE_COMMIT;
73793 +
73794 + spin_unlock_txnh(txnh);
73795 + spin_unlock_atom(atom);
73796 +
73797 + /* commit is here */
73798 + reiser4_txn_restart_current();
73799 + return 0;
73800 +}
73801 +
73802 +/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
73803 + * should we commit all atoms including new ones which are created after this
73804 + * functions is called. */
73805 +int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
73806 +{
73807 + int ret;
73808 + txn_atom *atom;
73809 + txn_mgr *mgr;
73810 + txn_handle *txnh;
73811 + unsigned long start_time = jiffies;
73812 + reiser4_context *ctx = get_current_context();
73813 +
73814 + assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
73815 + assert("nikita-3058", reiser4_commit_check_locks());
73816 +
73817 + reiser4_txn_restart_current();
73818 +
73819 + mgr = &get_super_private(super)->tmgr;
73820 +
73821 + txnh = ctx->trans;
73822 +
73823 + again:
73824 +
73825 + spin_lock_txnmgr(mgr);
73826 +
73827 + list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
73828 + spin_lock_atom(atom);
73829 +
73830 + /* Commit any atom which can be committed. If @commit_new_atoms
73831 + * is not set we commit only atoms which were created before
73832 + * this call is started. */
73833 + if (commit_all_atoms
73834 + || time_before_eq(atom->start_time, start_time)) {
73835 + if (atom->stage <= ASTAGE_POST_COMMIT) {
73836 + spin_unlock_txnmgr(mgr);
73837 +
73838 + if (atom->stage < ASTAGE_PRE_COMMIT) {
73839 + spin_lock_txnh(txnh);
73840 + /* Add force-context txnh */
73841 + capture_assign_txnh_nolock(atom, txnh);
73842 + ret = force_commit_atom(txnh);
73843 + if (ret)
73844 + return ret;
73845 + } else
73846 + /* wait atom commit */
73847 + reiser4_atom_wait_event(atom);
73848 +
73849 + goto again;
73850 + }
73851 + }
73852 +
73853 + spin_unlock_atom(atom);
73854 + }
73855 +
73856 +#if REISER4_DEBUG
73857 + if (commit_all_atoms) {
73858 + reiser4_super_info_data *sbinfo = get_super_private(super);
73859 + spin_lock_reiser4_super(sbinfo);
73860 + assert("zam-813",
73861 + sbinfo->blocks_fake_allocated_unformatted == 0);
73862 + assert("zam-812", sbinfo->blocks_fake_allocated == 0);
73863 + spin_unlock_reiser4_super(sbinfo);
73864 + }
73865 +#endif
73866 +
73867 + spin_unlock_txnmgr(mgr);
73868 +
73869 + return 0;
73870 +}
73871 +
73872 +/* check whether commit_some_atoms() can commit @atom. Locking is up to the
73873 + * caller */
73874 +static int atom_is_committable(txn_atom * atom)
73875 +{
73876 + return
73877 + atom->stage < ASTAGE_PRE_COMMIT &&
73878 + atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
73879 +}
73880 +
73881 +/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
73882 + * lock at exit */
73883 +int commit_some_atoms(txn_mgr * mgr)
73884 +{
73885 + int ret = 0;
73886 + txn_atom *atom;
73887 + txn_handle *txnh;
73888 + reiser4_context *ctx;
73889 + struct list_head *pos, *tmp;
73890 +
73891 + ctx = get_current_context();
73892 + assert("nikita-2444", ctx != NULL);
73893 +
73894 + txnh = ctx->trans;
73895 + spin_lock_txnmgr(mgr);
73896 +
73897 + /*
73898 + * this is to avoid gcc complain that atom might be used
73899 + * uninitialized
73900 + */
73901 + atom = NULL;
73902 +
73903 + /* look for atom to commit */
73904 + list_for_each_safe(pos, tmp, &mgr->atoms_list) {
73905 + atom = list_entry(pos, txn_atom, atom_link);
73906 + /*
73907 + * first test without taking atom spin lock, whether it is
73908 + * eligible for committing at all
73909 + */
73910 + if (atom_is_committable(atom)) {
73911 + /* now, take spin lock and re-check */
73912 + spin_lock_atom(atom);
73913 + if (atom_is_committable(atom))
73914 + break;
73915 + spin_unlock_atom(atom);
73916 + }
73917 + }
73918 +
73919 + ret = (&mgr->atoms_list == pos);
73920 + spin_unlock_txnmgr(mgr);
73921 +
73922 + if (ret) {
73923 + /* nothing found */
73924 + spin_unlock(&mgr->daemon->guard);
73925 + return 0;
73926 + }
73927 +
73928 + spin_lock_txnh(txnh);
73929 +
73930 + BUG_ON(atom == NULL);
73931 + /* Set the atom to force committing */
73932 + atom->flags |= ATOM_FORCE_COMMIT;
73933 +
73934 + /* Add force-context txnh */
73935 + capture_assign_txnh_nolock(atom, txnh);
73936 +
73937 + spin_unlock_txnh(txnh);
73938 + spin_unlock_atom(atom);
73939 +
73940 + /* we are about to release daemon spin lock, notify daemon it
73941 + has to rescan atoms */
73942 + mgr->daemon->rescan = 1;
73943 + spin_unlock(&mgr->daemon->guard);
73944 + reiser4_txn_restart_current();
73945 + return 0;
73946 +}
73947 +
73948 +static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
73949 +{
73950 + int atom_stage;
73951 + txn_atom *atom_2;
73952 + int repeat;
73953 +
73954 + assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
73955 +
73956 + atom_stage = atom->stage;
73957 + repeat = 0;
73958 +
73959 + if (!spin_trylock_txnmgr(tmgr)) {
73960 + atomic_inc(&atom->refcount);
73961 + spin_unlock_atom(atom);
73962 + spin_lock_txnmgr(tmgr);
73963 + spin_lock_atom(atom);
73964 + repeat = 1;
73965 + if (atom->stage != atom_stage) {
73966 + spin_unlock_txnmgr(tmgr);
73967 + atom_dec_and_unlock(atom);
73968 + return -E_REPEAT;
73969 + }
73970 + atomic_dec(&atom->refcount);
73971 + }
73972 +
73973 + list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
73974 + if (atom == atom_2)
73975 + continue;
73976 + /*
73977 + * if trylock does not succeed we just do not fuse with that
73978 + * atom.
73979 + */
73980 + if (spin_trylock_atom(atom_2)) {
73981 + if (atom_2->stage < ASTAGE_PRE_COMMIT) {
73982 + spin_unlock_txnmgr(tmgr);
73983 + capture_fuse_into(atom_2, atom);
73984 + /* all locks are lost we can only repeat here */
73985 + return -E_REPEAT;
73986 + }
73987 + spin_unlock_atom(atom_2);
73988 + }
73989 + }
73990 + atom->flags |= ATOM_CANCEL_FUSION;
73991 + spin_unlock_txnmgr(tmgr);
73992 + if (repeat) {
73993 + spin_unlock_atom(atom);
73994 + return -E_REPEAT;
73995 + }
73996 + return 0;
73997 +}
73998 +
73999 +/* Calls jnode_flush for current atom if it exists; if not, just take another
74000 + atom and call jnode_flush() for him. If current transaction handle has
74001 + already assigned atom (current atom) we have to close current transaction
74002 + prior to switch to another atom or do something with current atom. This
74003 + code tries to flush current atom.
74004 +
74005 + flush_some_atom() is called as part of memory clearing process. It is
74006 + invoked from balance_dirty_pages(), pdflushd, and entd.
74007 +
74008 + If we can flush no nodes, atom is committed, because this frees memory.
74009 +
74010 + If atom is too large or too old it is committed also.
74011 +*/
74012 +int
74013 +flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
74014 + int flags)
74015 +{
74016 + reiser4_context *ctx = get_current_context();
74017 + txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
74018 + txn_handle *txnh = ctx->trans;
74019 + txn_atom *atom;
74020 + int ret;
74021 +
74022 + BUG_ON(wbc->nr_to_write == 0);
74023 + BUG_ON(*nr_submitted != 0);
74024 + assert("zam-1042", txnh != NULL);
74025 + repeat:
74026 + if (txnh->atom == NULL) {
74027 + /* current atom is not available, take first from txnmgr */
74028 + spin_lock_txnmgr(tmgr);
74029 +
74030 + /* traverse the list of all atoms */
74031 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74032 + /* lock atom before checking its state */
74033 + spin_lock_atom(atom);
74034 +
74035 + /*
74036 + * we need an atom which is not being committed and
74037 + * which has no flushers (jnode_flush() add one flusher
74038 + * at the beginning and subtract one at the end).
74039 + */
74040 + if (atom->stage < ASTAGE_PRE_COMMIT &&
74041 + atom->nr_flushers == 0) {
74042 + spin_lock_txnh(txnh);
74043 + capture_assign_txnh_nolock(atom, txnh);
74044 + spin_unlock_txnh(txnh);
74045 +
74046 + goto found;
74047 + }
74048 +
74049 + spin_unlock_atom(atom);
74050 + }
74051 +
74052 + /*
74053 + * Write throttling is case of no one atom can be
74054 + * flushed/committed.
74055 + */
74056 + if (!current_is_pdflush() && !wbc->nonblocking) {
74057 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74058 + spin_lock_atom(atom);
74059 + /* Repeat the check from the above. */
74060 + if (atom->stage < ASTAGE_PRE_COMMIT
74061 + && atom->nr_flushers == 0) {
74062 + spin_lock_txnh(txnh);
74063 + capture_assign_txnh_nolock(atom, txnh);
74064 + spin_unlock_txnh(txnh);
74065 +
74066 + goto found;
74067 + }
74068 + if (atom->stage <= ASTAGE_POST_COMMIT) {
74069 + spin_unlock_txnmgr(tmgr);
74070 + /*
74071 + * we just wait until atom's flusher
74072 + * makes a progress in flushing or
74073 + * committing the atom
74074 + */
74075 + reiser4_atom_wait_event(atom);
74076 + goto repeat;
74077 + }
74078 + spin_unlock_atom(atom);
74079 + }
74080 + }
74081 + spin_unlock_txnmgr(tmgr);
74082 + return 0;
74083 + found:
74084 + spin_unlock_txnmgr(tmgr);
74085 + } else
74086 + atom = get_current_atom_locked();
74087 +
74088 + BUG_ON(atom->super != ctx->super);
74089 + assert("vs-35", atom->super == ctx->super);
74090 + if (start) {
74091 + spin_lock_jnode(start);
74092 + ret = (atom == start->atom) ? 1 : 0;
74093 + spin_unlock_jnode(start);
74094 + if (ret == 0)
74095 + start = NULL;
74096 + }
74097 + ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
74098 + if (ret == 0) {
74099 + /* flush_current_atom returns 0 only if it submitted for write
74100 + nothing */
74101 + BUG_ON(*nr_submitted != 0);
74102 + if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
74103 + if (atom->capture_count < tmgr->atom_min_size &&
74104 + !(atom->flags & ATOM_CANCEL_FUSION)) {
74105 + ret = txn_try_to_fuse_small_atom(tmgr, atom);
74106 + if (ret == -E_REPEAT) {
74107 + reiser4_preempt_point();
74108 + goto repeat;
74109 + }
74110 + }
74111 + /* if early flushing could not make more nodes clean,
74112 + * or atom is too old/large,
74113 + * we force current atom to commit */
74114 + /* wait for commit completion but only if this
74115 + * wouldn't stall pdflushd and ent thread. */
74116 + if (!wbc->nonblocking && !ctx->entd)
74117 + txnh->flags |= TXNH_WAIT_COMMIT;
74118 + atom->flags |= ATOM_FORCE_COMMIT;
74119 + }
74120 + spin_unlock_atom(atom);
74121 + } else if (ret == -E_REPEAT) {
74122 + if (*nr_submitted == 0) {
74123 + /* let others who hampers flushing (hold longterm locks,
74124 + for instance) to free the way for flush */
74125 + reiser4_preempt_point();
74126 + goto repeat;
74127 + }
74128 + ret = 0;
74129 + }
74130 +/*
74131 + if (*nr_submitted > wbc->nr_to_write)
74132 + warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
74133 +*/
74134 + reiser4_txn_restart(ctx);
74135 +
74136 + return ret;
74137 +}
74138 +
74139 +/* Remove processed nodes from atom's clean list (thereby remove them from transaction). */
74140 +void reiser4_invalidate_list(struct list_head *head)
74141 +{
74142 + while (!list_empty(head)) {
74143 + jnode *node;
74144 +
74145 + node = list_entry(head->next, jnode, capture_link);
74146 + spin_lock_jnode(node);
74147 + reiser4_uncapture_block(node);
74148 + jput(node);
74149 + }
74150 +}
74151 +
74152 +static void init_wlinks(txn_wait_links * wlinks)
74153 +{
74154 + wlinks->_lock_stack = get_current_lock_stack();
74155 + INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
74156 + INIT_LIST_HEAD(&wlinks->_fwaiting_link);
74157 + wlinks->waitfor_cb = NULL;
74158 + wlinks->waiting_cb = NULL;
74159 +}
74160 +
74161 +/* Add atom to the atom's waitfor list and wait for somebody to wake us up; */
74162 +void reiser4_atom_wait_event(txn_atom * atom)
74163 +{
74164 + txn_wait_links _wlinks;
74165 +
74166 + assert_spin_locked(&(atom->alock));
74167 + assert("nikita-3156",
74168 + lock_stack_isclean(get_current_lock_stack()) ||
74169 + atom->nr_running_queues > 0);
74170 +
74171 + init_wlinks(&_wlinks);
74172 + list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
74173 + atomic_inc(&atom->refcount);
74174 + spin_unlock_atom(atom);
74175 +
74176 + reiser4_prepare_to_sleep(_wlinks._lock_stack);
74177 + reiser4_go_to_sleep(_wlinks._lock_stack);
74178 +
74179 + spin_lock_atom(atom);
74180 + list_del(&_wlinks._fwaitfor_link);
74181 + atom_dec_and_unlock(atom);
74182 +}
74183 +
74184 +void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
74185 +{
74186 + assert("nikita-3535", atom != NULL);
74187 + assert_spin_locked(&(atom->alock));
74188 + assert("nikita-3536", stage <= ASTAGE_INVALID);
74189 + /* Excelsior! */
74190 + assert("nikita-3537", stage >= atom->stage);
74191 + if (atom->stage != stage) {
74192 + atom->stage = stage;
74193 + reiser4_atom_send_event(atom);
74194 + }
74195 +}
74196 +
74197 +/* wake all threads which wait for an event */
74198 +void reiser4_atom_send_event(txn_atom * atom)
74199 +{
74200 + assert_spin_locked(&(atom->alock));
74201 + wakeup_atom_waitfor_list(atom);
74202 +}
74203 +
74204 +/* Informs txn manager code that owner of this txn_handle should wait atom commit completion (for
74205 + example, because it does fsync(2)) */
74206 +static int should_wait_commit(txn_handle * h)
74207 +{
74208 + return h->flags & TXNH_WAIT_COMMIT;
74209 +}
74210 +
74211 +typedef struct commit_data {
74212 + txn_atom *atom;
74213 + txn_handle *txnh;
74214 + long nr_written;
74215 + /* as an optimization we start committing atom by first trying to
74216 + * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This
74217 + * allows to reduce stalls due to other threads waiting for atom in
74218 + * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these
74219 + * preliminary flushes. */
74220 + int preflush;
74221 + /* have we waited on atom. */
74222 + int wait;
74223 + int failed;
74224 + int wake_ktxnmgrd_up;
74225 +} commit_data;
74226 +
74227 +/*
74228 + * Called from commit_txnh() repeatedly, until either error happens, or atom
74229 + * commits successfully.
74230 + */
74231 +static int try_commit_txnh(commit_data * cd)
74232 +{
74233 + int result;
74234 +
74235 + assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
74236 +
74237 + /* Get the atom and txnh locked. */
74238 + cd->atom = txnh_get_atom(cd->txnh);
74239 + assert("jmacd-309", cd->atom != NULL);
74240 + spin_unlock_txnh(cd->txnh);
74241 +
74242 + if (cd->wait) {
74243 + cd->atom->nr_waiters--;
74244 + cd->wait = 0;
74245 + }
74246 +
74247 + if (cd->atom->stage == ASTAGE_DONE)
74248 + return 0;
74249 +
74250 + if (cd->failed)
74251 + return 0;
74252 +
74253 + if (atom_should_commit(cd->atom)) {
74254 + /* if atom is _very_ large schedule it for commit as soon as
74255 + * possible. */
74256 + if (atom_should_commit_asap(cd->atom)) {
74257 + /*
74258 + * When atom is in PRE_COMMIT or later stage following
74259 + * invariant (encoded in atom_can_be_committed())
74260 + * holds: there is exactly one non-waiter transaction
74261 + * handle opened on this atom. When thread wants to
74262 + * wait until atom commits (for example sync()) it
74263 + * waits on atom event after increasing
74264 + * atom->nr_waiters (see blow in this function). It
74265 + * cannot be guaranteed that atom is already committed
74266 + * after receiving event, so loop has to be
74267 + * re-started. But if atom switched into PRE_COMMIT
74268 + * stage and became too large, we cannot change its
74269 + * state back to CAPTURE_WAIT (atom stage can only
74270 + * increase monotonically), hence this check.
74271 + */
74272 + if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
74273 + reiser4_atom_set_stage(cd->atom,
74274 + ASTAGE_CAPTURE_WAIT);
74275 + cd->atom->flags |= ATOM_FORCE_COMMIT;
74276 + }
74277 + if (cd->txnh->flags & TXNH_DONT_COMMIT) {
74278 + /*
74279 + * this thread (transaction handle that is) doesn't
74280 + * want to commit atom. Notify waiters that handle is
74281 + * closed. This can happen, for example, when we are
74282 + * under VFS directory lock and don't want to commit
74283 + * atom right now to avoid stalling other threads
74284 + * working in the same directory.
74285 + */
74286 +
74287 + /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
74288 + * commit this atom: no atom waiters and only one
74289 + * (our) open transaction handle. */
74290 + cd->wake_ktxnmgrd_up =
74291 + cd->atom->txnh_count == 1 &&
74292 + cd->atom->nr_waiters == 0;
74293 + reiser4_atom_send_event(cd->atom);
74294 + result = 0;
74295 + } else if (!atom_can_be_committed(cd->atom)) {
74296 + if (should_wait_commit(cd->txnh)) {
74297 + /* sync(): wait for commit */
74298 + cd->atom->nr_waiters++;
74299 + cd->wait = 1;
74300 + reiser4_atom_wait_event(cd->atom);
74301 + result = RETERR(-E_REPEAT);
74302 + } else {
74303 + result = 0;
74304 + }
74305 + } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
74306 + /*
74307 + * optimization: flush atom without switching it into
74308 + * ASTAGE_CAPTURE_WAIT.
74309 + *
74310 + * But don't do this for ktxnmgrd, because ktxnmgrd
74311 + * should never block on atom fusion.
74312 + */
74313 + result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
74314 + LONG_MAX, &cd->nr_written,
74315 + &cd->atom, NULL);
74316 + if (result == 0) {
74317 + spin_unlock_atom(cd->atom);
74318 + cd->preflush = 0;
74319 + result = RETERR(-E_REPEAT);
74320 + } else /* Atoms wasn't flushed
74321 + * completely. Rinse. Repeat. */
74322 + --cd->preflush;
74323 + } else {
74324 + /* We change atom state to ASTAGE_CAPTURE_WAIT to
74325 + prevent atom fusion and count ourself as an active
74326 + flusher */
74327 + reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74328 + cd->atom->flags |= ATOM_FORCE_COMMIT;
74329 +
74330 + result =
74331 + commit_current_atom(&cd->nr_written, &cd->atom);
74332 + if (result != 0 && result != -E_REPEAT)
74333 + cd->failed = 1;
74334 + }
74335 + } else
74336 + result = 0;
74337 +
74338 +#if REISER4_DEBUG
74339 + if (result == 0)
74340 + assert_spin_locked(&(cd->atom->alock));
74341 +#endif
74342 +
74343 + /* perfectly valid assertion, except that when atom/txnh is not locked
74344 + * fusion can take place, and cd->atom points nowhere. */
74345 + /*
74346 + assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
74347 + */
74348 + return result;
74349 +}
74350 +
74351 +/* Called to commit a transaction handle. This decrements the atom's number of open
74352 + handles and if it is the last handle to commit and the atom should commit, initiates
74353 + atom commit. if commit does not fail, return number of written blocks */
74354 +static int commit_txnh(txn_handle * txnh)
74355 +{
74356 + commit_data cd;
74357 + assert("umka-192", txnh != NULL);
74358 +
74359 + memset(&cd, 0, sizeof cd);
74360 + cd.txnh = txnh;
74361 + cd.preflush = 10;
74362 +
74363 + /* calls try_commit_txnh() until either atom commits, or error
74364 + * happens */
74365 + while (try_commit_txnh(&cd) != 0)
74366 + reiser4_preempt_point();
74367 +
74368 + spin_lock_txnh(txnh);
74369 +
74370 + cd.atom->txnh_count -= 1;
74371 + txnh->atom = NULL;
74372 + /* remove transaction handle from atom's list of transaction handles */
74373 + list_del_init(&txnh->txnh_link);
74374 +
74375 + spin_unlock_txnh(txnh);
74376 + atom_dec_and_unlock(cd.atom);
74377 + /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably
74378 + * because it takes time) by current thread, we do that work
74379 + * asynchronously by ktxnmgrd daemon. */
74380 + if (cd.wake_ktxnmgrd_up)
74381 + ktxnmgrd_kick(&get_current_super_private()->tmgr);
74382 +
74383 + return 0;
74384 +}
74385 +
74386 +/* TRY_CAPTURE */
74387 +
74388 +/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
74389 + condition indicates that the request should be retried, and it may block if the
74390 + txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
74391 +
74392 + This routine encodes the basic logic of block capturing described by:
74393 +
74394 + http://namesys.com/v4/v4.html
74395 +
74396 + Our goal here is to ensure that any two blocks that contain dependent modifications
74397 + should commit at the same time. This function enforces this discipline by initiating
74398 + fusion whenever a transaction handle belonging to one atom requests to read or write a
74399 + block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
74400 +
74401 + In addition, this routine handles the initial assignment of atoms to blocks and
74402 + transaction handles. These are possible outcomes of this function:
74403 +
74404 + 1. The block and handle are already part of the same atom: return immediate success
74405 +
74406 + 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
74407 + the handle to the block's atom.
74408 +
74409 + 3. The handle is assigned but the block is not: call capture_assign_block to assign
74410 + the block to the handle's atom.
74411 +
74412 + 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
74413 + to fuse atoms.
74414 +
74415 + 5. Neither block nor handle are assigned: create a new atom and assign them both.
74416 +
74417 + 6. A read request for a non-captured block: return immediate success.
74418 +
74419 + This function acquires and releases the handle's spinlock. This function is called
74420 + under the jnode lock and if the return value is 0, it returns with the jnode lock still
74421 + held. If the return is -E_REPEAT or some other error condition, the jnode lock is
74422 + released. The external interface (reiser4_try_capture) manages re-aquiring the jnode
74423 + lock in the failure case.
74424 +*/
74425 +static int try_capture_block(
74426 + txn_handle * txnh, jnode * node, txn_capture mode,
74427 + txn_atom ** atom_alloc)
74428 +{
74429 + txn_atom *block_atom;
74430 + txn_atom *txnh_atom;
74431 +
74432 + /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
74433 + assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
74434 +
74435 + /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
74436 + * node->tree somewhere. */
74437 + assert("umka-194", txnh != NULL);
74438 + assert("umka-195", node != NULL);
74439 +
74440 + /* The jnode is already locked! Being called from reiser4_try_capture(). */
74441 + assert_spin_locked(&(node->guard));
74442 + block_atom = node->atom;
74443 +
74444 + /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
74445 + let us touch the atoms themselves. */
74446 + spin_lock_txnh(txnh);
74447 + txnh_atom = txnh->atom;
74448 + /* Process of capturing continues into one of four branches depends on
74449 + which atoms from (block atom (node->atom), current atom (txnh->atom))
74450 + exist. */
74451 + if (txnh_atom == NULL) {
74452 + if (block_atom == NULL) {
74453 + spin_unlock_txnh(txnh);
74454 + spin_unlock_jnode(node);
74455 + /* assign empty atom to the txnh and repeat */
74456 + return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
74457 + } else {
74458 + atomic_inc(&block_atom->refcount);
74459 + /* node spin-lock isn't needed anymore */
74460 + spin_unlock_jnode(node);
74461 + if (!spin_trylock_atom(block_atom)) {
74462 + spin_unlock_txnh(txnh);
74463 + spin_lock_atom(block_atom);
74464 + spin_lock_txnh(txnh);
74465 + }
74466 + /* re-check state after getting txnh and the node
74467 + * atom spin-locked */
74468 + if (node->atom != block_atom || txnh->atom != NULL) {
74469 + spin_unlock_txnh(txnh);
74470 + atom_dec_and_unlock(block_atom);
74471 + return RETERR(-E_REPEAT);
74472 + }
74473 + atomic_dec(&block_atom->refcount);
74474 + if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
74475 + (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
74476 + block_atom->txnh_count != 0))
74477 + return capture_fuse_wait(txnh, block_atom, NULL, mode);
74478 + capture_assign_txnh_nolock(block_atom, txnh);
74479 + spin_unlock_txnh(txnh);
74480 + spin_unlock_atom(block_atom);
74481 + return RETERR(-E_REPEAT);
74482 + }
74483 + } else {
74484 + /* It is time to perform deadlock prevention check over the
74485 + node we want to capture. It is possible this node was locked
74486 + for read without capturing it. The optimization which allows
74487 + to do it helps us in keeping atoms independent as long as
74488 + possible but it may cause lock/fuse deadlock problems.
74489 +
74490 + A number of similar deadlock situations with locked but not
74491 + captured nodes were found. In each situation there are two
74492 + or more threads: one of them does flushing while another one
74493 + does routine balancing or tree lookup. The flushing thread
74494 + (F) sleeps in long term locking request for node (N), another
74495 + thread (A) sleeps in trying to capture some node already
74496 + belonging the atom F, F has a state which prevents
74497 + immediately fusion .
74498 +
74499 + Deadlocks of this kind cannot happen if node N was properly
74500 + captured by thread A. The F thread fuse atoms before locking
74501 + therefore current atom of thread F and current atom of thread
74502 + A became the same atom and thread A may proceed. This does
74503 + not work if node N was not captured because the fusion of
74504 + atom does not happens.
74505 +
74506 + The following scheme solves the deadlock: If
74507 + longterm_lock_znode locks and does not capture a znode, that
74508 + znode is marked as MISSED_IN_CAPTURE. A node marked this way
74509 + is processed by the code below which restores the missed
74510 + capture and fuses current atoms of all the node lock owners
74511 + by calling the fuse_not_fused_lock_owners() function. */
74512 + if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
74513 + JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
74514 + if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
74515 + spin_unlock_txnh(txnh);
74516 + spin_unlock_jnode(node);
74517 + fuse_not_fused_lock_owners(txnh, JZNODE(node));
74518 + return RETERR(-E_REPEAT);
74519 + }
74520 + }
74521 + if (block_atom == NULL) {
74522 + atomic_inc(&txnh_atom->refcount);
74523 + spin_unlock_txnh(txnh);
74524 + if (!spin_trylock_atom(txnh_atom)) {
74525 + spin_unlock_jnode(node);
74526 + spin_lock_atom(txnh_atom);
74527 + spin_lock_jnode(node);
74528 + }
74529 + if (txnh->atom != txnh_atom || node->atom != NULL
74530 + || JF_ISSET(node, JNODE_IS_DYING)) {
74531 + spin_unlock_jnode(node);
74532 + atom_dec_and_unlock(txnh_atom);
74533 + return RETERR(-E_REPEAT);
74534 + }
74535 + atomic_dec(&txnh_atom->refcount);
74536 + capture_assign_block_nolock(txnh_atom, node);
74537 + spin_unlock_atom(txnh_atom);
74538 + } else {
74539 + if (txnh_atom != block_atom) {
74540 + if (mode & TXN_CAPTURE_DONT_FUSE) {
74541 + spin_unlock_txnh(txnh);
74542 + spin_unlock_jnode(node);
74543 + /* we are in a "no-fusion" mode and @node is
74544 + * already part of transaction. */
74545 + return RETERR(-E_NO_NEIGHBOR);
74546 + }
74547 + return capture_init_fusion(node, txnh, mode);
74548 + }
74549 + spin_unlock_txnh(txnh);
74550 + }
74551 + }
74552 + return 0;
74553 +}
74554 +
74555 +static txn_capture
74556 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
74557 +{
74558 + txn_capture cap_mode;
74559 +
74560 + assert_spin_locked(&(node->guard));
74561 +
74562 + /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
74563 +
74564 + if (lock_mode == ZNODE_WRITE_LOCK) {
74565 + cap_mode = TXN_CAPTURE_WRITE;
74566 + } else if (node->atom != NULL) {
74567 + cap_mode = TXN_CAPTURE_WRITE;
74568 + } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
74569 + jnode_get_level(node) == LEAF_LEVEL) {
74570 + /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
74571 + /* We only need a READ_FUSING capture at the leaf level. This
74572 + is because the internal levels of the tree (twigs included)
74573 + are redundant from the point of the user that asked for a
74574 + read-fusing transcrash. The user only wants to read-fuse
74575 + atoms due to reading uncommitted data that another user has
74576 + written. It is the file system that reads/writes the
74577 + internal tree levels, the user only reads/writes leaves. */
74578 + cap_mode = TXN_CAPTURE_READ_ATOMIC;
74579 + } else {
74580 + /* In this case (read lock at a non-leaf) there's no reason to
74581 + * capture. */
74582 + /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
74583 + return 0;
74584 + }
74585 +
74586 + cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
74587 + assert("nikita-3186", cap_mode != 0);
74588 + return cap_mode;
74589 +}
74590 +
74591 +/* This is an external interface to try_capture_block(), it calls
74592 + try_capture_block() repeatedly as long as -E_REPEAT is returned.
74593 +
74594 + @node: node to capture,
74595 + @lock_mode: read or write lock is used in capture mode calculation,
74596 + @flags: see txn_capture flags enumeration,
74597 + @can_coc : can copy-on-capture
74598 +
74599 + @return: 0 - node was successfully captured, -E_REPEAT - capture request
74600 + cannot be processed immediately as it was requested in flags,
74601 + < 0 - other errors.
74602 +*/
74603 +int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
74604 + txn_capture flags)
74605 +{
74606 + txn_atom *atom_alloc = NULL;
74607 + txn_capture cap_mode;
74608 + txn_handle *txnh = get_current_context()->trans;
74609 + int ret;
74610 +
74611 + assert_spin_locked(&(node->guard));
74612 +
74613 + repeat:
74614 + if (JF_ISSET(node, JNODE_IS_DYING))
74615 + return RETERR(-EINVAL);
74616 + if (node->atom != NULL && txnh->atom == node->atom)
74617 + return 0;
74618 + cap_mode = build_capture_mode(node, lock_mode, flags);
74619 + if (cap_mode == 0 ||
74620 + (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
74621 + /* Mark this node as "MISSED". It helps in further deadlock
74622 + * analysis */
74623 + if (jnode_is_znode(node))
74624 + JF_SET(node, JNODE_MISSED_IN_CAPTURE);
74625 + return 0;
74626 + }
74627 + /* Repeat try_capture as long as -E_REPEAT is returned. */
74628 + ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
74629 + /* Regardless of non_blocking:
74630 +
74631 + If ret == 0 then jnode is still locked.
74632 + If ret != 0 then jnode is unlocked.
74633 + */
74634 +#if REISER4_DEBUG
74635 + if (ret == 0)
74636 + assert_spin_locked(&(node->guard));
74637 + else
74638 + assert_spin_not_locked(&(node->guard));
74639 +#endif
74640 + assert_spin_not_locked(&(txnh->guard));
74641 +
74642 + if (ret == -E_REPEAT) {
74643 + /* E_REPEAT implies all locks were released, therefore we need
74644 + to take the jnode's lock again. */
74645 + spin_lock_jnode(node);
74646 +
74647 + /* Although this may appear to be a busy loop, it is not.
74648 + There are several conditions that cause E_REPEAT to be
74649 + returned by the call to try_capture_block, all cases
74650 + indicating some kind of state change that means you should
74651 + retry the request and will get a different result. In some
74652 + cases this could be avoided with some extra code, but
74653 + generally it is done because the necessary locks were
74654 + released as a result of the operation and repeating is the
74655 + simplest thing to do (less bug potential). The cases are:
74656 + atom fusion returns E_REPEAT after it completes (jnode and
74657 + txnh were unlocked); race conditions in assign_block,
74658 + assign_txnh, and init_fusion return E_REPEAT (trylock
74659 + failure); after going to sleep in capture_fuse_wait
74660 + (request was blocked but may now succeed). I'm not quite
74661 + sure how capture_copy works yet, but it may also return
74662 + E_REPEAT. When the request is legitimately blocked, the
74663 + requestor goes to sleep in fuse_wait, so this is not a busy
74664 + loop. */
74665 + /* NOTE-NIKITA: still don't understand:
74666 +
74667 + try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
74668 +
74669 + looks like busy loop?
74670 + */
74671 + goto repeat;
74672 + }
74673 +
74674 + /* free extra atom object that was possibly allocated by
74675 + try_capture_block().
74676 +
74677 + Do this before acquiring jnode spin lock to
74678 + minimize time spent under lock. --nikita */
74679 + if (atom_alloc != NULL) {
74680 + kmem_cache_free(_atom_slab, atom_alloc);
74681 + }
74682 +
74683 + if (ret != 0) {
74684 + if (ret == -E_BLOCK) {
74685 + assert("nikita-3360",
74686 + cap_mode & TXN_CAPTURE_NONBLOCKING);
74687 + ret = -E_REPEAT;
74688 + }
74689 +
74690 + /* Failure means jnode is not locked. FIXME_LATER_JMACD May
74691 + want to fix the above code to avoid releasing the lock and
74692 + re-acquiring it, but there are cases were failure occurs
74693 + when the lock is not held, and those cases would need to be
74694 + modified to re-take the lock. */
74695 + spin_lock_jnode(node);
74696 + }
74697 +
74698 + /* Jnode is still locked. */
74699 + assert_spin_locked(&(node->guard));
74700 + return ret;
74701 +}
74702 +
74703 +static void release_two_atoms(txn_atom *one, txn_atom *two)
74704 +{
74705 + spin_unlock_atom(one);
74706 + atom_dec_and_unlock(two);
74707 + spin_lock_atom(one);
74708 + atom_dec_and_unlock(one);
74709 +}
74710 +
74711 +/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
74712 + returned by that routine. The txn_capture request mode is computed here depending on
74713 + the transaction handle's type and the lock request. This is called from the depths of
74714 + the lock manager with the jnode lock held and it always returns with the jnode lock
74715 + held.
74716 +*/
74717 +
74718 +/* fuse all 'active' atoms of lock owners of given node. */
74719 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
74720 +{
74721 + lock_handle *lh;
74722 + int repeat;
74723 + txn_atom *atomh, *atomf;
74724 + reiser4_context *me = get_current_context();
74725 + reiser4_context *ctx = NULL;
74726 +
74727 + assert_spin_not_locked(&(ZJNODE(node)->guard));
74728 + assert_spin_not_locked(&(txnh->hlock));
74729 +
74730 + repeat:
74731 + repeat = 0;
74732 + atomh = txnh_get_atom(txnh);
74733 + spin_unlock_txnh(txnh);
74734 + assert("zam-692", atomh != NULL);
74735 +
74736 + spin_lock_zlock(&node->lock);
74737 + /* inspect list of lock owners */
74738 + list_for_each_entry(lh, &node->lock.owners, owners_link) {
74739 + ctx = get_context_by_lock_stack(lh->owner);
74740 + if (ctx == me)
74741 + continue;
74742 + /* below we use two assumptions to avoid addition spin-locks
74743 + for checking the condition :
74744 +
74745 + 1) if the lock stack has lock, the transaction should be
74746 + opened, i.e. ctx->trans != NULL;
74747 +
74748 + 2) reading of well-aligned ctx->trans->atom is atomic, if it
74749 + equals to the address of spin-locked atomh, we take that
74750 + the atoms are the same, nothing has to be captured. */
74751 + if (atomh != ctx->trans->atom) {
74752 + reiser4_wake_up(lh->owner);
74753 + repeat = 1;
74754 + break;
74755 + }
74756 + }
74757 + if (repeat) {
74758 + if (!spin_trylock_txnh(ctx->trans)) {
74759 + spin_unlock_zlock(&node->lock);
74760 + spin_unlock_atom(atomh);
74761 + goto repeat;
74762 + }
74763 + atomf = ctx->trans->atom;
74764 + if (atomf == NULL) {
74765 + capture_assign_txnh_nolock(atomh, ctx->trans);
74766 + /* release zlock lock _after_ assigning the atom to the
74767 + * transaction handle, otherwise the lock owner thread
74768 + * may unlock all znodes, exit kernel context and here
74769 + * we would access an invalid transaction handle. */
74770 + spin_unlock_zlock(&node->lock);
74771 + spin_unlock_atom(atomh);
74772 + spin_unlock_txnh(ctx->trans);
74773 + goto repeat;
74774 + }
74775 + assert("zam-1059", atomf != atomh);
74776 + spin_unlock_zlock(&node->lock);
74777 + atomic_inc(&atomh->refcount);
74778 + atomic_inc(&atomf->refcount);
74779 + spin_unlock_txnh(ctx->trans);
74780 + if (atomf > atomh) {
74781 + spin_lock_atom_nested(atomf);
74782 + } else {
74783 + spin_unlock_atom(atomh);
74784 + spin_lock_atom(atomf);
74785 + spin_lock_atom_nested(atomh);
74786 + }
74787 + if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
74788 + release_two_atoms(atomf, atomh);
74789 + goto repeat;
74790 + }
74791 + atomic_dec(&atomh->refcount);
74792 + atomic_dec(&atomf->refcount);
74793 + capture_fuse_into(atomf, atomh);
74794 + goto repeat;
74795 + }
74796 + spin_unlock_zlock(&node->lock);
74797 + spin_unlock_atom(atomh);
74798 +}
74799 +
74800 +/* This is the interface to capture unformatted nodes via their struct page
74801 + reference. Currently it is only used in reiser4_invalidatepage */
74802 +int try_capture_page_to_invalidate(struct page *pg)
74803 +{
74804 + int ret;
74805 + jnode *node;
74806 +
74807 + assert("umka-292", pg != NULL);
74808 + assert("nikita-2597", PageLocked(pg));
74809 +
74810 + if (IS_ERR(node = jnode_of_page(pg))) {
74811 + return PTR_ERR(node);
74812 + }
74813 +
74814 + spin_lock_jnode(node);
74815 + unlock_page(pg);
74816 +
74817 + ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
74818 + spin_unlock_jnode(node);
74819 + jput(node);
74820 + lock_page(pg);
74821 + return ret;
74822 +}
74823 +
74824 +/* This informs the transaction manager when a node is deleted. Add the block to the
74825 + atom's delete set and uncapture the block.
74826 +
74827 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
74828 +explanations. find all the functions that use it, and unless there is some very
74829 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
74830 +move the loop to inside the function.
74831 +
74832 +VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
74833 + */
74834 +void reiser4_uncapture_page(struct page *pg)
74835 +{
74836 + jnode *node;
74837 + txn_atom *atom;
74838 +
74839 + assert("umka-199", pg != NULL);
74840 + assert("nikita-3155", PageLocked(pg));
74841 +
74842 + clear_page_dirty_for_io(pg);
74843 +
74844 + reiser4_wait_page_writeback(pg);
74845 +
74846 + node = jprivate(pg);
74847 + BUG_ON(node == NULL);
74848 +
74849 + spin_lock_jnode(node);
74850 +
74851 + atom = jnode_get_atom(node);
74852 + if (atom == NULL) {
74853 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74854 + spin_unlock_jnode(node);
74855 + return;
74856 + }
74857 +
74858 + /* We can remove jnode from transaction even if it is on flush queue
74859 + * prepped list, we only need to be sure that flush queue is not being
74860 + * written by reiser4_write_fq(). reiser4_write_fq() does not use atom
74861 + * spin lock for protection of the prepped nodes list, instead
74862 + * write_fq() increments atom's nr_running_queues counters for the time
74863 + * when prepped list is not protected by spin lock. Here we check this
74864 + * counter if we want to remove jnode from flush queue and, if the
74865 + * counter is not zero, wait all reiser4_write_fq() for this atom to
74866 + * complete. This is not significant overhead. */
74867 + while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
74868 + spin_unlock_jnode(node);
74869 + /*
74870 + * at this moment we want to wait for "atom event", viz. wait
74871 + * until @node can be removed from flush queue. But
74872 + * reiser4_atom_wait_event() cannot be called with page locked,
74873 + * because it deadlocks with jnode_extent_write(). Unlock page,
74874 + * after making sure (through page_cache_get()) that it cannot
74875 + * be released from memory.
74876 + */
74877 + page_cache_get(pg);
74878 + unlock_page(pg);
74879 + reiser4_atom_wait_event(atom);
74880 + lock_page(pg);
74881 + /*
74882 + * page may has been detached by ->writepage()->releasepage().
74883 + */
74884 + reiser4_wait_page_writeback(pg);
74885 + spin_lock_jnode(node);
74886 + page_cache_release(pg);
74887 + atom = jnode_get_atom(node);
74888 +/* VS-FIXME-HANS: improve the commenting in this function */
74889 + if (atom == NULL) {
74890 + spin_unlock_jnode(node);
74891 + return;
74892 + }
74893 + }
74894 + reiser4_uncapture_block(node);
74895 + spin_unlock_atom(atom);
74896 + jput(node);
74897 +}
74898 +
74899 +/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
74900 + * inode's tree of jnodes */
74901 +void reiser4_uncapture_jnode(jnode * node)
74902 +{
74903 + txn_atom *atom;
74904 +
74905 + assert_spin_locked(&(node->guard));
74906 + assert("", node->pg == 0);
74907 +
74908 + atom = jnode_get_atom(node);
74909 + if (atom == NULL) {
74910 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74911 + spin_unlock_jnode(node);
74912 + return;
74913 + }
74914 +
74915 + reiser4_uncapture_block(node);
74916 + spin_unlock_atom(atom);
74917 + jput(node);
74918 +}
74919 +
74920 +/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
74921 + increases atom refcount and txnh_count, adds to txnh_list. */
74922 +static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
74923 +{
74924 + assert("umka-200", atom != NULL);
74925 + assert("umka-201", txnh != NULL);
74926 +
74927 + assert_spin_locked(&(txnh->hlock));
74928 + assert_spin_locked(&(atom->alock));
74929 + assert("jmacd-824", txnh->atom == NULL);
74930 + assert("nikita-3540", atom_isopen(atom));
74931 + BUG_ON(txnh->atom != NULL);
74932 +
74933 + atomic_inc(&atom->refcount);
74934 + txnh->atom = atom;
74935 + reiser4_ctx_gfp_mask_set();
74936 + list_add_tail(&txnh->txnh_link, &atom->txnh_list);
74937 + atom->txnh_count += 1;
74938 +}
74939 +
74940 +/* No-locking version of assign_block. Sets the block's atom pointer, references the
74941 + block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
74942 +static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
74943 +{
74944 + assert("umka-202", atom != NULL);
74945 + assert("umka-203", node != NULL);
74946 + assert_spin_locked(&(node->guard));
74947 + assert_spin_locked(&(atom->alock));
74948 + assert("jmacd-323", node->atom == NULL);
74949 + BUG_ON(!list_empty_careful(&node->capture_link));
74950 + assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
74951 +
74952 + /* Pointer from jnode to atom is not counted in atom->refcount. */
74953 + node->atom = atom;
74954 +
74955 + list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
74956 + atom->capture_count += 1;
74957 + /* reference to jnode is acquired by atom. */
74958 + jref(node);
74959 +
74960 + ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
74961 +
74962 + LOCK_CNT_INC(t_refs);
74963 +}
74964 +
74965 +/* common code for dirtying both unformatted jnodes and formatted znodes. */
74966 +static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
74967 +{
74968 + assert_spin_locked(&(node->guard));
74969 + assert_spin_locked(&(atom->alock));
74970 + assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
74971 +
74972 + JF_SET(node, JNODE_DIRTY);
74973 +
74974 + get_current_context()->nr_marked_dirty++;
74975 +
74976 + /* We grab2flush_reserve one additional block only if node was
74977 + not CREATED and jnode_flush did not sort it into neither
74978 + relocate set nor overwrite one. If node is in overwrite or
74979 + relocate set we assume that atom's flush reserved counter was
74980 + already adjusted. */
74981 + if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
74982 + && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
74983 + && !jnode_is_cluster_page(node)) {
74984 + assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
74985 + assert("vs-1506", *jnode_get_block(node) != 0);
74986 + grabbed2flush_reserved_nolock(atom, (__u64) 1);
74987 + JF_SET(node, JNODE_FLUSH_RESERVED);
74988 + }
74989 +
74990 + if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
74991 + /* If the atom is not set yet, it will be added to the appropriate list in
74992 + capture_assign_block_nolock. */
74993 + /* Sometimes a node is set dirty before being captured -- the case for new
74994 + jnodes. In that case the jnode will be added to the appropriate list
74995 + in capture_assign_block_nolock. Another reason not to re-link jnode is
74996 + that jnode is on a flush queue (see flush.c for details) */
74997 +
74998 + int level = jnode_get_level(node);
74999 +
75000 + assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
75001 + assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
75002 + assert("nikita-2607", 0 <= level);
75003 + assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
75004 +
75005 + /* move node to atom's dirty list */
75006 + list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
75007 + ON_DEBUG(count_jnode
75008 + (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
75009 + }
75010 +}
75011 +
75012 +/* Set the dirty status for this (spin locked) jnode. */
75013 +void jnode_make_dirty_locked(jnode * node)
75014 +{
75015 + assert("umka-204", node != NULL);
75016 + assert_spin_locked(&(node->guard));
75017 +
75018 + if (REISER4_DEBUG && rofs_jnode(node)) {
75019 + warning("nikita-3365", "Dirtying jnode on rofs");
75020 + dump_stack();
75021 + }
75022 +
75023 + /* Fast check for already dirty node */
75024 + if (!JF_ISSET(node, JNODE_DIRTY)) {
75025 + txn_atom *atom;
75026 +
75027 + atom = jnode_get_atom(node);
75028 + assert("vs-1094", atom);
75029 + /* Check jnode dirty status again because node spin lock might
75030 + * be released inside jnode_get_atom(). */
75031 + if (likely(!JF_ISSET(node, JNODE_DIRTY)))
75032 + do_jnode_make_dirty(node, atom);
75033 + spin_unlock_atom(atom);
75034 + }
75035 +}
75036 +
75037 +/* Set the dirty status for this znode. */
75038 +void znode_make_dirty(znode * z)
75039 +{
75040 + jnode *node;
75041 + struct page *page;
75042 +
75043 + assert("umka-204", z != NULL);
75044 + assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
75045 + assert("nikita-3560", znode_is_write_locked(z));
75046 +
75047 + node = ZJNODE(z);
75048 + /* znode is longterm locked, we can check dirty bit without spinlock */
75049 + if (JF_ISSET(node, JNODE_DIRTY)) {
75050 + /* znode is dirty already. All we have to do is to change znode version */
75051 + z->version = znode_build_version(jnode_get_tree(node));
75052 + return;
75053 + }
75054 +
75055 + spin_lock_jnode(node);
75056 + jnode_make_dirty_locked(node);
75057 + page = jnode_page(node);
75058 + if (page != NULL) {
75059 + /* this is useful assertion (allows one to check that no
75060 + * modifications are lost due to update of in-flight page),
75061 + * but it requires locking on page to check PG_writeback
75062 + * bit. */
75063 + /* assert("nikita-3292",
75064 + !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
75065 + page_cache_get(page);
75066 +
75067 + /* jnode lock is not needed for the rest of
75068 + * znode_set_dirty(). */
75069 + spin_unlock_jnode(node);
75070 + /* reiser4 file write code calls set_page_dirty for
75071 + * unformatted nodes, for formatted nodes we do it here. */
75072 + reiser4_set_page_dirty_internal(page);
75073 + page_cache_release(page);
75074 + /* bump version counter in znode */
75075 + z->version = znode_build_version(jnode_get_tree(node));
75076 + } else {
75077 + assert("zam-596", znode_above_root(JZNODE(node)));
75078 + spin_unlock_jnode(node);
75079 + }
75080 +
75081 + assert("nikita-1900", znode_is_write_locked(z));
75082 + assert("jmacd-9777", node->atom != NULL);
75083 +}
75084 +
75085 +int reiser4_sync_atom(txn_atom * atom)
75086 +{
75087 + int result;
75088 + txn_handle *txnh;
75089 +
75090 + txnh = get_current_context()->trans;
75091 +
75092 + result = 0;
75093 + if (atom != NULL) {
75094 + if (atom->stage < ASTAGE_PRE_COMMIT) {
75095 + spin_lock_txnh(txnh);
75096 + capture_assign_txnh_nolock(atom, txnh);
75097 + result = force_commit_atom(txnh);
75098 + } else if (atom->stage < ASTAGE_POST_COMMIT) {
75099 + /* wait atom commit */
75100 + reiser4_atom_wait_event(atom);
75101 + /* try once more */
75102 + result = RETERR(-E_REPEAT);
75103 + } else
75104 + spin_unlock_atom(atom);
75105 + }
75106 + return result;
75107 +}
75108 +
75109 +#if REISER4_DEBUG
75110 +
75111 +/* move jnode form one list to another
75112 + call this after atom->capture_count is updated */
75113 +void
75114 +count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
75115 + atom_list new_list, int check_lists)
75116 +{
75117 + struct list_head *pos;
75118 +
75119 + assert("zam-1018", atom_is_protected(atom));
75120 + assert_spin_locked(&(node->guard));
75121 + assert("", NODE_LIST(node) == old_list);
75122 +
75123 + switch (NODE_LIST(node)) {
75124 + case NOT_CAPTURED:
75125 + break;
75126 + case DIRTY_LIST:
75127 + assert("", atom->dirty > 0);
75128 + atom->dirty--;
75129 + break;
75130 + case CLEAN_LIST:
75131 + assert("", atom->clean > 0);
75132 + atom->clean--;
75133 + break;
75134 + case FQ_LIST:
75135 + assert("", atom->fq > 0);
75136 + atom->fq--;
75137 + break;
75138 + case WB_LIST:
75139 + assert("", atom->wb > 0);
75140 + atom->wb--;
75141 + break;
75142 + case OVRWR_LIST:
75143 + assert("", atom->ovrwr > 0);
75144 + atom->ovrwr--;
75145 + break;
75146 + default:
75147 + impossible("", "");
75148 + }
75149 +
75150 + switch (new_list) {
75151 + case NOT_CAPTURED:
75152 + break;
75153 + case DIRTY_LIST:
75154 + atom->dirty++;
75155 + break;
75156 + case CLEAN_LIST:
75157 + atom->clean++;
75158 + break;
75159 + case FQ_LIST:
75160 + atom->fq++;
75161 + break;
75162 + case WB_LIST:
75163 + atom->wb++;
75164 + break;
75165 + case OVRWR_LIST:
75166 + atom->ovrwr++;
75167 + break;
75168 + default:
75169 + impossible("", "");
75170 + }
75171 + ASSIGN_NODE_LIST(node, new_list);
75172 + if (0 && check_lists) {
75173 + int count;
75174 + tree_level level;
75175 +
75176 + count = 0;
75177 +
75178 + /* flush queue list */
75179 + /* reiser4_check_fq(atom); */
75180 +
75181 + /* dirty list */
75182 + count = 0;
75183 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75184 + list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
75185 + count++;
75186 + }
75187 + if (count != atom->dirty)
75188 + warning("", "dirty counter %d, real %d\n", atom->dirty,
75189 + count);
75190 +
75191 + /* clean list */
75192 + count = 0;
75193 + list_for_each(pos, ATOM_CLEAN_LIST(atom))
75194 + count++;
75195 + if (count != atom->clean)
75196 + warning("", "clean counter %d, real %d\n", atom->clean,
75197 + count);
75198 +
75199 + /* wb list */
75200 + count = 0;
75201 + list_for_each(pos, ATOM_WB_LIST(atom))
75202 + count++;
75203 + if (count != atom->wb)
75204 + warning("", "wb counter %d, real %d\n", atom->wb,
75205 + count);
75206 +
75207 + /* overwrite list */
75208 + count = 0;
75209 + list_for_each(pos, ATOM_OVRWR_LIST(atom))
75210 + count++;
75211 +
75212 + if (count != atom->ovrwr)
75213 + warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
75214 + count);
75215 + }
75216 + assert("vs-1624", atom->num_queued == atom->fq);
75217 + if (atom->capture_count !=
75218 + atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
75219 + printk
75220 + ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
75221 + atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
75222 + atom->wb, atom->fq);
75223 + assert("vs-1622",
75224 + atom->capture_count ==
75225 + atom->dirty + atom->clean + atom->ovrwr + atom->wb +
75226 + atom->fq);
75227 + }
75228 +}
75229 +
75230 +#endif
75231 +
75232 +/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode
75233 + * lock should be taken before calling this function. */
75234 +void jnode_make_wander_nolock(jnode * node)
75235 +{
75236 + txn_atom *atom;
75237 +
75238 + assert("nikita-2431", node != NULL);
75239 + assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
75240 + assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
75241 + assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75242 + assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
75243 +
75244 + atom = node->atom;
75245 +
75246 + assert("zam-895", atom != NULL);
75247 + assert("zam-894", atom_is_protected(atom));
75248 +
75249 + JF_SET(node, JNODE_OVRWR);
75250 + /* move node to atom's overwrite list */
75251 + list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
75252 + ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
75253 +}
75254 +
75255 +/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
75256 + * this function. */
75257 +void jnode_make_wander(jnode * node)
75258 +{
75259 + txn_atom *atom;
75260 +
75261 + spin_lock_jnode(node);
75262 + atom = jnode_get_atom(node);
75263 + assert("zam-913", atom != NULL);
75264 + assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
75265 +
75266 + jnode_make_wander_nolock(node);
75267 + spin_unlock_atom(atom);
75268 + spin_unlock_jnode(node);
75269 +}
75270 +
75271 +/* this just sets RELOC bit */
75272 +static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
75273 +{
75274 + assert_spin_locked(&(node->guard));
75275 + assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
75276 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
75277 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
75278 + assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75279 + assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
75280 + jnode_set_reloc(node);
75281 +}
75282 +
75283 +/* Make znode RELOC and put it on flush queue */
75284 +void znode_make_reloc(znode * z, flush_queue_t * fq)
75285 +{
75286 + jnode *node;
75287 + txn_atom *atom;
75288 +
75289 + node = ZJNODE(z);
75290 + spin_lock_jnode(node);
75291 +
75292 + atom = jnode_get_atom(node);
75293 + assert("zam-919", atom != NULL);
75294 +
75295 + jnode_make_reloc_nolock(fq, node);
75296 + queue_jnode(fq, node);
75297 +
75298 + spin_unlock_atom(atom);
75299 + spin_unlock_jnode(node);
75300 +
75301 +}
75302 +
75303 +/* Make unformatted node RELOC and put it on flush queue */
75304 +void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
75305 +{
75306 + assert("vs-1479", jnode_is_unformatted(node));
75307 +
75308 + jnode_make_reloc_nolock(fq, node);
75309 + queue_jnode(fq, node);
75310 +}
75311 +
75312 +int reiser4_capture_super_block(struct super_block *s)
75313 +{
75314 + int result;
75315 + znode *uber;
75316 + lock_handle lh;
75317 +
75318 + init_lh(&lh);
75319 + result = get_uber_znode(reiser4_get_tree(s),
75320 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
75321 + if (result)
75322 + return result;
75323 +
75324 + uber = lh.node;
75325 + /* Grabbing one block for superblock */
75326 + result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
75327 + if (result != 0)
75328 + return result;
75329 +
75330 + znode_make_dirty(uber);
75331 +
75332 + done_lh(&lh);
75333 + return 0;
75334 +}
75335 +
75336 +/* Wakeup every handle on the atom's WAITFOR list */
75337 +static void wakeup_atom_waitfor_list(txn_atom * atom)
75338 +{
75339 + txn_wait_links *wlinks;
75340 +
75341 + assert("umka-210", atom != NULL);
75342 +
75343 + /* atom is locked */
75344 + list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
75345 + if (wlinks->waitfor_cb == NULL ||
75346 + wlinks->waitfor_cb(atom, wlinks))
75347 + /* Wake up. */
75348 + reiser4_wake_up(wlinks->_lock_stack);
75349 + }
75350 +}
75351 +
75352 +/* Wakeup every handle on the atom's WAITING list */
75353 +static void wakeup_atom_waiting_list(txn_atom * atom)
75354 +{
75355 + txn_wait_links *wlinks;
75356 +
75357 + assert("umka-211", atom != NULL);
75358 +
75359 + /* atom is locked */
75360 + list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
75361 + if (wlinks->waiting_cb == NULL ||
75362 + wlinks->waiting_cb(atom, wlinks))
75363 + /* Wake up. */
75364 + reiser4_wake_up(wlinks->_lock_stack);
75365 + }
75366 +}
75367 +
75368 +/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
75369 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
75370 +{
75371 + assert("nikita-3330", atom != NULL);
75372 + assert_spin_locked(&(atom->alock));
75373 +
75374 + /* atom->txnh_count == 1 is for waking waiters up if we are releasing
75375 + * last transaction handle. */
75376 + return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
75377 +}
75378 +
75379 +/* The general purpose of this function is to wait on the first of two possible events.
75380 + The situation is that a handle (and its atom atomh) is blocked trying to capture a
75381 + block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
75382 + handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
75383 + another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
75384 + needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
75385 + proceed and fuse the two atoms in the CAPTURE_WAIT state.
75386 +
75387 + In other words, if either atomh or atomf change state, the handle will be awakened,
75388 + thus there are two lists per atom: WAITING and WAITFOR.
75389 +
75390 + This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
75391 + close but it is not assigned to an atom of its own.
75392 +
75393 + Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
75394 + BOTH_ATOM_LOCKS. Result: all four locks are released.
75395 +*/
75396 +static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
75397 + txn_atom * atomh, txn_capture mode)
75398 +{
75399 + int ret;
75400 + txn_wait_links wlinks;
75401 +
75402 + assert("umka-213", txnh != NULL);
75403 + assert("umka-214", atomf != NULL);
75404 +
75405 + if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
75406 + spin_unlock_txnh(txnh);
75407 + spin_unlock_atom(atomf);
75408 +
75409 + if (atomh) {
75410 + spin_unlock_atom(atomh);
75411 + }
75412 +
75413 + return RETERR(-E_BLOCK);
75414 + }
75415 +
75416 + /* Initialize the waiting list links. */
75417 + init_wlinks(&wlinks);
75418 +
75419 + /* Add txnh to atomf's waitfor list, unlock atomf. */
75420 + list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
75421 + wlinks.waitfor_cb = wait_for_fusion;
75422 + atomic_inc(&atomf->refcount);
75423 + spin_unlock_atom(atomf);
75424 +
75425 + if (atomh) {
75426 + /* Add txnh to atomh's waiting list, unlock atomh. */
75427 + list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
75428 + atomic_inc(&atomh->refcount);
75429 + spin_unlock_atom(atomh);
75430 + }
75431 +
75432 + /* Go to sleep. */
75433 + spin_unlock_txnh(txnh);
75434 +
75435 + ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
75436 + if (ret == 0) {
75437 + reiser4_go_to_sleep(wlinks._lock_stack);
75438 + ret = RETERR(-E_REPEAT);
75439 + }
75440 +
75441 + /* Remove from the waitfor list. */
75442 + spin_lock_atom(atomf);
75443 +
75444 + list_del(&wlinks._fwaitfor_link);
75445 + atom_dec_and_unlock(atomf);
75446 +
75447 + if (atomh) {
75448 + /* Remove from the waiting list. */
75449 + spin_lock_atom(atomh);
75450 + list_del(&wlinks._fwaiting_link);
75451 + atom_dec_and_unlock(atomh);
75452 + }
75453 + return ret;
75454 +}
75455 +
75456 +static void lock_two_atoms(txn_atom * one, txn_atom * two)
75457 +{
75458 + assert("zam-1067", one != two);
75459 +
75460 + /* lock the atom with lesser address first */
75461 + if (one < two) {
75462 + spin_lock_atom(one);
75463 + spin_lock_atom_nested(two);
75464 + } else {
75465 + spin_lock_atom(two);
75466 + spin_lock_atom_nested(one);
75467 + }
75468 +}
75469 +
75470 +/* Perform the necessary work to prepare for fusing two atoms, which involves
75471 + * acquiring two atom locks in the proper order. If one of the node's atom is
75472 + * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
75473 + * atom is not then the handle's request is put to sleep. If the node's atom
75474 + * is committing, then the node can be copy-on-captured. Otherwise, pick the
75475 + * atom with fewer pointers to be fused into the atom with more pointer and
75476 + * call capture_fuse_into.
75477 + */
75478 +static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
75479 +{
75480 + txn_atom * txnh_atom = txnh->atom;
75481 + txn_atom * block_atom = node->atom;
75482 +
75483 + atomic_inc(&txnh_atom->refcount);
75484 + atomic_inc(&block_atom->refcount);
75485 +
75486 + spin_unlock_txnh(txnh);
75487 + spin_unlock_jnode(node);
75488 +
75489 + lock_two_atoms(txnh_atom, block_atom);
75490 +
75491 + if (txnh->atom != txnh_atom || node->atom != block_atom ) {
75492 + release_two_atoms(txnh_atom, block_atom);
75493 + return RETERR(-E_REPEAT);
75494 + }
75495 +
75496 + atomic_dec(&txnh_atom->refcount);
75497 + atomic_dec(&block_atom->refcount);
75498 +
75499 + assert ("zam-1066", atom_isopen(txnh_atom));
75500 +
75501 + if (txnh_atom->stage >= block_atom->stage ||
75502 + (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
75503 + capture_fuse_into(txnh_atom, block_atom);
75504 + return RETERR(-E_REPEAT);
75505 + }
75506 + spin_lock_txnh(txnh);
75507 + return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
75508 +}
75509 +
75510 +/* This function splices together two jnode lists (small and large) and sets all jnodes in
75511 + the small list to point to the large atom. Returns the length of the list. */
75512 +static int
75513 +capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
75514 + struct list_head *small_head)
75515 +{
75516 + int count = 0;
75517 + jnode *node;
75518 +
75519 + assert("umka-218", large != NULL);
75520 + assert("umka-219", large_head != NULL);
75521 + assert("umka-220", small_head != NULL);
75522 + /* small atom should be locked also. */
75523 + assert_spin_locked(&(large->alock));
75524 +
75525 + /* For every jnode on small's capture list... */
75526 + list_for_each_entry(node, small_head, capture_link) {
75527 + count += 1;
75528 +
75529 + /* With the jnode lock held, update atom pointer. */
75530 + spin_lock_jnode(node);
75531 + node->atom = large;
75532 + spin_unlock_jnode(node);
75533 + }
75534 +
75535 + /* Splice the lists. */
75536 + list_splice_init(small_head, large_head->prev);
75537 +
75538 + return count;
75539 +}
75540 +
75541 +/* This function splices together two txnh lists (small and large) and sets all txn handles in
75542 + the small list to point to the large atom. Returns the length of the list. */
75543 +static int
75544 +capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
75545 + struct list_head *small_head)
75546 +{
75547 + int count = 0;
75548 + txn_handle *txnh;
75549 +
75550 + assert("umka-221", large != NULL);
75551 + assert("umka-222", large_head != NULL);
75552 + assert("umka-223", small_head != NULL);
75553 +
75554 + /* Adjust every txnh to the new atom. */
75555 + list_for_each_entry(txnh, small_head, txnh_link) {
75556 + count += 1;
75557 +
75558 + /* With the txnh lock held, update atom pointer. */
75559 + spin_lock_txnh(txnh);
75560 + txnh->atom = large;
75561 + spin_unlock_txnh(txnh);
75562 + }
75563 +
75564 + /* Splice the txn_handle list. */
75565 + list_splice_init(small_head, large_head->prev);
75566 +
75567 + return count;
75568 +}
75569 +
75570 +/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
75571 + added to LARGE and their ->atom pointers are all updated. The associated counts are
75572 + updated as well, and any waiting handles belonging to either are awakened. Finally the
75573 + smaller atom's refcount is decremented.
75574 +*/
75575 +static void capture_fuse_into(txn_atom * small, txn_atom * large)
75576 +{
75577 + int level;
75578 + unsigned zcount = 0;
75579 + unsigned tcount = 0;
75580 +
75581 + assert("umka-224", small != NULL);
75582 + assert("umka-225", small != NULL);
75583 +
75584 + assert_spin_locked(&(large->alock));
75585 + assert_spin_locked(&(small->alock));
75586 +
75587 + assert("jmacd-201", atom_isopen(small));
75588 + assert("jmacd-202", atom_isopen(large));
75589 +
75590 + /* Splice and update the per-level dirty jnode lists */
75591 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75592 + zcount +=
75593 + capture_fuse_jnode_lists(large,
75594 + ATOM_DIRTY_LIST(large, level),
75595 + ATOM_DIRTY_LIST(small, level));
75596 + }
75597 +
75598 + /* Splice and update the [clean,dirty] jnode and txnh lists */
75599 + zcount +=
75600 + capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
75601 + ATOM_CLEAN_LIST(small));
75602 + zcount +=
75603 + capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
75604 + ATOM_OVRWR_LIST(small));
75605 + zcount +=
75606 + capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
75607 + ATOM_WB_LIST(small));
75608 + zcount +=
75609 + capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
75610 + tcount +=
75611 + capture_fuse_txnh_lists(large, &large->txnh_list,
75612 + &small->txnh_list);
75613 +
75614 + /* Check our accounting. */
75615 + assert("jmacd-1063",
75616 + zcount + small->num_queued == small->capture_count);
75617 + assert("jmacd-1065", tcount == small->txnh_count);
75618 +
75619 + /* sum numbers of waiters threads */
75620 + large->nr_waiters += small->nr_waiters;
75621 + small->nr_waiters = 0;
75622 +
75623 + /* splice flush queues */
75624 + reiser4_fuse_fq(large, small);
75625 +
75626 + /* update counter of jnode on every atom' list */
75627 + ON_DEBUG(large->dirty += small->dirty;
75628 + small->dirty = 0;
75629 + large->clean += small->clean;
75630 + small->clean = 0;
75631 + large->ovrwr += small->ovrwr;
75632 + small->ovrwr = 0;
75633 + large->wb += small->wb;
75634 + small->wb = 0;
75635 + large->fq += small->fq;
75636 + small->fq = 0;);
75637 +
75638 + /* count flushers in result atom */
75639 + large->nr_flushers += small->nr_flushers;
75640 + small->nr_flushers = 0;
75641 +
75642 + /* update counts of flushed nodes */
75643 + large->flushed += small->flushed;
75644 + small->flushed = 0;
75645 +
75646 + /* Transfer list counts to large. */
75647 + large->txnh_count += small->txnh_count;
75648 + large->capture_count += small->capture_count;
75649 +
75650 + /* Add all txnh references to large. */
75651 + atomic_add(small->txnh_count, &large->refcount);
75652 + atomic_sub(small->txnh_count, &small->refcount);
75653 +
75654 + /* Reset small counts */
75655 + small->txnh_count = 0;
75656 + small->capture_count = 0;
75657 +
75658 + /* Assign the oldest start_time, merge flags. */
75659 + large->start_time = min(large->start_time, small->start_time);
75660 + large->flags |= small->flags;
75661 +
75662 + /* Merge blocknr sets. */
75663 + blocknr_set_merge(&small->delete_set, &large->delete_set);
75664 + blocknr_set_merge(&small->wandered_map, &large->wandered_map);
75665 +
75666 + /* Merge allocated/deleted file counts */
75667 + large->nr_objects_deleted += small->nr_objects_deleted;
75668 + large->nr_objects_created += small->nr_objects_created;
75669 +
75670 + small->nr_objects_deleted = 0;
75671 + small->nr_objects_created = 0;
75672 +
75673 + /* Merge allocated blocks counts */
75674 + large->nr_blocks_allocated += small->nr_blocks_allocated;
75675 +
75676 + large->nr_running_queues += small->nr_running_queues;
75677 + small->nr_running_queues = 0;
75678 +
75679 + /* Merge blocks reserved for overwrite set. */
75680 + large->flush_reserved += small->flush_reserved;
75681 + small->flush_reserved = 0;
75682 +
75683 + if (large->stage < small->stage) {
75684 + /* Large only needs to notify if it has changed state. */
75685 + reiser4_atom_set_stage(large, small->stage);
75686 + wakeup_atom_waiting_list(large);
75687 + }
75688 +
75689 + reiser4_atom_set_stage(small, ASTAGE_INVALID);
75690 +
75691 + /* Notify any waiters--small needs to unload its wait lists. Waiters
75692 + actually remove themselves from the list before returning from the
75693 + fuse_wait function. */
75694 + wakeup_atom_waiting_list(small);
75695 +
75696 + /* Unlock atoms */
75697 + spin_unlock_atom(large);
75698 + atom_dec_and_unlock(small);
75699 +}
75700 +
75701 +/* TXNMGR STUFF */
75702 +
75703 +/* Release a block from the atom, reversing the effects of being captured,
75704 + do not release atom's reference to jnode due to holding spin-locks.
75705 + Currently this is only called when the atom commits.
75706 +
75707 + NOTE: this function does not release a (journal) reference to jnode
75708 + due to locking optimizations, you should call jput() somewhere after
75709 + calling reiser4_uncapture_block(). */
75710 +void reiser4_uncapture_block(jnode * node)
75711 +{
75712 + txn_atom *atom;
75713 +
75714 + assert("umka-226", node != NULL);
75715 + atom = node->atom;
75716 + assert("umka-228", atom != NULL);
75717 +
75718 + assert("jmacd-1021", node->atom == atom);
75719 + assert_spin_locked(&(node->guard));
75720 + assert("jmacd-1023", atom_is_protected(atom));
75721 +
75722 + JF_CLR(node, JNODE_DIRTY);
75723 + JF_CLR(node, JNODE_RELOC);
75724 + JF_CLR(node, JNODE_OVRWR);
75725 + JF_CLR(node, JNODE_CREATED);
75726 + JF_CLR(node, JNODE_WRITEBACK);
75727 + JF_CLR(node, JNODE_REPACK);
75728 +
75729 + list_del_init(&node->capture_link);
75730 + if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
75731 + assert("zam-925", atom_isopen(atom));
75732 + assert("vs-1623", NODE_LIST(node) == FQ_LIST);
75733 + ON_DEBUG(atom->num_queued--);
75734 + JF_CLR(node, JNODE_FLUSH_QUEUED);
75735 + }
75736 + atom->capture_count -= 1;
75737 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
75738 + node->atom = NULL;
75739 +
75740 + spin_unlock_jnode(node);
75741 + LOCK_CNT_DEC(t_refs);
75742 +}
75743 +
75744 +/* Unconditional insert of jnode into atom's overwrite list. Currently used in
75745 + bitmap-based allocator code for adding modified bitmap blocks the
75746 + transaction. @atom and @node are spin locked */
75747 +void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
75748 +{
75749 + assert("zam-538", atom_is_protected(atom));
75750 + assert_spin_locked(&(node->guard));
75751 + assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
75752 + assert("zam-543", node->atom == NULL);
75753 + assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
75754 +
75755 + list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
75756 + jref(node);
75757 + node->atom = atom;
75758 + atom->capture_count++;
75759 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
75760 +}
75761 +
75762 +static int count_deleted_blocks_actor(txn_atom * atom,
75763 + const reiser4_block_nr * a,
75764 + const reiser4_block_nr * b, void *data)
75765 +{
75766 + reiser4_block_nr *counter = data;
75767 +
75768 + assert("zam-995", data != NULL);
75769 + assert("zam-996", a != NULL);
75770 + if (b == NULL)
75771 + *counter += 1;
75772 + else
75773 + *counter += *b;
75774 + return 0;
75775 +}
75776 +
75777 +reiser4_block_nr txnmgr_count_deleted_blocks(void)
75778 +{
75779 + reiser4_block_nr result;
75780 + txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
75781 + txn_atom *atom;
75782 +
75783 + result = 0;
75784 +
75785 + spin_lock_txnmgr(tmgr);
75786 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
75787 + spin_lock_atom(atom);
75788 + if (atom_isopen(atom))
75789 + blocknr_set_iterator(
75790 + atom, &atom->delete_set,
75791 + count_deleted_blocks_actor, &result, 0);
75792 + spin_unlock_atom(atom);
75793 + }
75794 + spin_unlock_txnmgr(tmgr);
75795 +
75796 + return result;
75797 +}
75798 +
75799 +/*
75800 + * Local variables:
75801 + * c-indentation-style: "K&R"
75802 + * mode-name: "LC"
75803 + * c-basic-offset: 8
75804 + * tab-width: 8
75805 + * fill-column: 79
75806 + * End:
75807 + */
75808 diff -urN linux-2.6.20.orig/fs/reiser4/txnmgr.h linux-2.6.20/fs/reiser4/txnmgr.h
75809 --- linux-2.6.20.orig/fs/reiser4/txnmgr.h 1970-01-01 03:00:00.000000000 +0300
75810 +++ linux-2.6.20/fs/reiser4/txnmgr.h 2007-05-06 14:50:43.899038216 +0400
75811 @@ -0,0 +1,708 @@
75812 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75813 + * reiser4/README */
75814 +
75815 +/* data-types and function declarations for transaction manager. See txnmgr.c
75816 + * for details. */
75817 +
75818 +#ifndef __REISER4_TXNMGR_H__
75819 +#define __REISER4_TXNMGR_H__
75820 +
75821 +#include "forward.h"
75822 +#include "dformat.h"
75823 +
75824 +#include <linux/fs.h>
75825 +#include <linux/mm.h>
75826 +#include <linux/types.h>
75827 +#include <linux/spinlock.h>
75828 +#include <asm/atomic.h>
75829 +#include <linux/wait.h>
75830 +
75831 +/* TYPE DECLARATIONS */
75832 +
75833 +/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
75834 + A capture request dynamically assigns a block to the calling thread's transaction
75835 + handle. */
75836 +typedef enum {
75837 + /* A READ_ATOMIC request indicates that a block will be read and that the caller's
75838 + atom should fuse in order to ensure that the block commits atomically with the
75839 + caller. */
75840 + TXN_CAPTURE_READ_ATOMIC = (1 << 0),
75841 +
75842 + /* A READ_NONCOM request indicates that a block will be read and that the caller is
75843 + willing to read a non-committed block without causing atoms to fuse. */
75844 + TXN_CAPTURE_READ_NONCOM = (1 << 1),
75845 +
75846 + /* A READ_MODIFY request indicates that a block will be read but that the caller
75847 + wishes for the block to be captured as it will be written. This capture request
75848 + mode is not currently used, but eventually it will be useful for preventing
75849 + deadlock in read-modify-write cycles. */
75850 + TXN_CAPTURE_READ_MODIFY = (1 << 2),
75851 +
75852 + /* A WRITE capture request indicates that a block will be modified and that atoms
75853 + should fuse to make the commit atomic. */
75854 + TXN_CAPTURE_WRITE = (1 << 3),
75855 +
75856 + /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
75857 + exclusive type designation from extra bits that may be supplied -- see
75858 + below. */
75859 + TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
75860 + TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
75861 + TXN_CAPTURE_WRITE),
75862 +
75863 + /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
75864 + indicate modification will occur. */
75865 + TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
75866 +
75867 + /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
75868 + prefer not to sleep waiting for an aging atom to commit. */
75869 + TXN_CAPTURE_NONBLOCKING = (1 << 4),
75870 +
75871 + /* An option to reiser4_try_capture to prevent atom fusion, just simple
75872 + capturing is allowed */
75873 + TXN_CAPTURE_DONT_FUSE = (1 << 5)
75874 +
75875 + /* This macro selects only the exclusive capture request types, stripping out any
75876 + options that were supplied (i.e., NONBLOCKING). */
75877 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
75878 +} txn_capture;
75879 +
75880 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
75881 + difference is in the handling of read requests. A WRITE_FUSING transaction handle
75882 + defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSIONG
75883 + transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
75884 +typedef enum {
75885 + TXN_WRITE_FUSING = (1 << 0),
75886 + TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
75887 +} txn_mode;
75888 +
75889 +/* Every atom has a stage, which is one of these exclusive values: */
75890 +typedef enum {
75891 + /* Initially an atom is free. */
75892 + ASTAGE_FREE = 0,
75893 +
75894 + /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
75895 + blocks and fuse with other atoms. */
75896 + ASTAGE_CAPTURE_FUSE = 1,
75897 +
75898 + /* We need to have a ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */
75899 +
75900 + /* When an atom reaches a certain age it must do all it can to commit. An atom in
75901 + the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
75902 + atoms in the CAPTURE_FUSE stage. */
75903 + ASTAGE_CAPTURE_WAIT = 2,
75904 +
75905 + /* Waiting for I/O before commit. Copy-on-capture (see
75906 + http://namesys.com/v4/v4.html). */
75907 + ASTAGE_PRE_COMMIT = 3,
75908 +
75909 + /* Post-commit overwrite I/O. Steal-on-capture. */
75910 + ASTAGE_POST_COMMIT = 4,
75911 +
75912 + /* Atom which waits for the removal of the last reference to (it? ) to
75913 + * be deleted from memory */
75914 + ASTAGE_DONE = 5,
75915 +
75916 + /* invalid atom. */
75917 + ASTAGE_INVALID = 6,
75918 +
75919 +} txn_stage;
75920 +
75921 +/* Certain flags may be set in the txn_atom->flags field. */
75922 +typedef enum {
75923 + /* Indicates that the atom should commit as soon as possible. */
75924 + ATOM_FORCE_COMMIT = (1 << 0),
75925 + /* to avoid endless loop, mark the atom (which was considered as too
75926 + * small) after failed attempt to fuse it. */
75927 + ATOM_CANCEL_FUSION = (1 << 1)
75928 +} txn_flags;
75929 +
75930 +/* Flags for controlling commit_txnh */
75931 +typedef enum {
75932 + /* Wait commit atom completion in commit_txnh */
75933 + TXNH_WAIT_COMMIT = 0x2,
75934 + /* Don't commit atom when this handle is closed */
75935 + TXNH_DONT_COMMIT = 0x4
75936 +} txn_handle_flags_t;
75937 +
75938 +/* TYPE DEFINITIONS */
75939 +
75940 +/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
75941 + fields, so typically an operation on the atom through either of these objects must (1)
75942 + lock the object, (2) read the atom pointer, (3) lock the atom.
75943 +
75944 + During atom fusion, the process holds locks on both atoms at once. Then, it iterates
75945 + through the list of handles and pages held by the smaller of the two atoms. For each
75946 + handle and page referencing the smaller atom, the fusing process must: (1) lock the
75947 + object, and (2) update the atom pointer.
75948 +
75949 + You can see that there is a conflict of lock ordering here, so the more-complex
75950 + procedure should have priority, i.e., the fusing process has priority so that it is
75951 + guaranteed to make progress and to avoid restarts.
75952 +
75953 + This decision, however, means additional complexity for aquiring the atom lock in the
75954 + first place.
75955 +
75956 + The general original procedure followed in the code was:
75957 +
75958 + TXN_OBJECT *obj = ...;
75959 + TXN_ATOM *atom;
75960 +
75961 + spin_lock (& obj->_lock);
75962 +
75963 + atom = obj->_atom;
75964 +
75965 + if (! spin_trylock_atom (atom))
75966 + {
75967 + spin_unlock (& obj->_lock);
75968 + RESTART OPERATION, THERE WAS A RACE;
75969 + }
75970 +
75971 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75972 +
75973 + It has however been found that this wastes CPU a lot in a manner that is
75974 + hard to profile. So, proper refcounting was added to atoms, and new
75975 + standard locking sequence is like following:
75976 +
75977 + TXN_OBJECT *obj = ...;
75978 + TXN_ATOM *atom;
75979 +
75980 + spin_lock (& obj->_lock);
75981 +
75982 + atom = obj->_atom;
75983 +
75984 + if (! spin_trylock_atom (atom))
75985 + {
75986 + atomic_inc (& atom->refcount);
75987 + spin_unlock (& obj->_lock);
75988 + spin_lock (&atom->_lock);
75989 + atomic_dec (& atom->refcount);
75990 + // HERE atom is locked
75991 + spin_unlock (&atom->_lock);
75992 + RESTART OPERATION, THERE WAS A RACE;
75993 + }
75994 +
75995 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75996 +
75997 + (core of this is implemented in trylock_throttle() function)
75998 +
75999 + See the jnode_get_atom() function for a common case.
76000 +
76001 + As an additional (and important) optimization allowing to avoid restarts,
76002 + it is possible to re-check required pre-conditions at the HERE point in
76003 + code above and proceed without restarting if they are still satisfied.
76004 +*/
76005 +
76006 +/* An atomic transaction: this is the underlying system representation
76007 + of a transaction, not the one seen by clients.
76008 +
76009 + Invariants involving this data-type:
76010 +
76011 + [sb-fake-allocated]
76012 +*/
76013 +struct txn_atom {
76014 + /* The spinlock protecting the atom, held during fusion and various other state
76015 + changes. */
76016 + spinlock_t alock;
76017 +
76018 + /* The atom's reference counter, increasing (in case of a duplication
76019 + of an existing reference or when we are sure that some other
76020 + reference exists) may be done without taking spinlock, decrementing
76021 + of the ref. counter requires a spinlock to be held.
76022 +
76023 + Each transaction handle counts in ->refcount. All jnodes count as
76024 + one reference acquired in atom_begin_andlock(), released in
76025 + commit_current_atom().
76026 + */
76027 + atomic_t refcount;
76028 +
76029 + /* The atom_id identifies the atom in persistent records such as the log. */
76030 + __u32 atom_id;
76031 +
76032 + /* Flags holding any of the txn_flags enumerated values (e.g.,
76033 + ATOM_FORCE_COMMIT). */
76034 + __u32 flags;
76035 +
76036 + /* Number of open handles. */
76037 + __u32 txnh_count;
76038 +
76039 + /* The number of znodes captured by this atom. Equal to the sum of lengths of the
76040 + dirty_nodes[level] and clean_nodes lists. */
76041 + __u32 capture_count;
76042 +
76043 +#if REISER4_DEBUG
76044 + int clean;
76045 + int dirty;
76046 + int ovrwr;
76047 + int wb;
76048 + int fq;
76049 +#endif
76050 +
76051 + __u32 flushed;
76052 +
76053 + /* Current transaction stage. */
76054 + txn_stage stage;
76055 +
76056 + /* Start time. */
76057 + unsigned long start_time;
76058 +
76059 + /* The atom's delete set. It collects block numbers of the nodes
76060 + which were deleted during the transaction. */
76061 + struct list_head delete_set;
76062 +
76063 + /* The atom's wandered_block mapping. */
76064 + struct list_head wandered_map;
76065 +
76066 + /* The transaction's list of dirty captured nodes--per level. Index
76067 + by (level). dirty_nodes[0] is for znode-above-root */
76068 + struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
76069 +
76070 + /* The transaction's list of clean captured nodes. */
76071 + struct list_head clean_nodes;
76072 +
76073 + /* The atom's overwrite set */
76074 + struct list_head ovrwr_nodes;
76075 +
76076 + /* nodes which are being written to disk */
76077 + struct list_head writeback_nodes;
76078 +
76079 + /* list of inodes */
76080 + struct list_head inodes;
76081 +
76082 + /* List of handles associated with this atom. */
76083 + struct list_head txnh_list;
76084 +
76085 + /* Transaction list link: list of atoms in the transaction manager. */
76086 + struct list_head atom_link;
76087 +
76088 + /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
76089 + struct list_head fwaitfor_list;
76090 +
76091 + /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
76092 + struct list_head fwaiting_list;
76093 +
76094 + /* Numbers of objects which were deleted/created in this transaction
76095 + thereby numbers of objects IDs which were released/deallocated. */
76096 + int nr_objects_deleted;
76097 + int nr_objects_created;
76098 + /* number of blocks allocated during the transaction */
76099 + __u64 nr_blocks_allocated;
76100 + /* All atom's flush queue objects are on this list */
76101 + struct list_head flush_queues;
76102 +#if REISER4_DEBUG
76103 + /* number of flush queues for this atom. */
76104 + int nr_flush_queues;
76105 + /* Number of jnodes which were removed from atom's lists and put
76106 + on flush_queue */
76107 + int num_queued;
76108 +#endif
76109 + /* number of threads who wait for this atom to complete commit */
76110 + int nr_waiters;
76111 + /* number of threads which do jnode_flush() over this atom */
76112 + int nr_flushers;
76113 + /* number of flush queues which are IN_USE and jnodes from fq->prepped
76114 + are submitted to disk by the reiser4_write_fq() routine. */
76115 + int nr_running_queues;
76116 + /* A counter of grabbed unformatted nodes, see a description of the
76117 + * reiser4 space reservation scheme at block_alloc.c */
76118 + reiser4_block_nr flush_reserved;
76119 +#if REISER4_DEBUG
76120 + void *committer;
76121 +#endif
76122 + struct super_block *super;
76123 +};
76124 +
76125 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
76126 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
76127 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
76128 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
76129 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
76130 +
76131 +#define NODE_LIST(node) (node)->list
76132 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
76133 +ON_DEBUG(void
76134 + count_jnode(txn_atom *, jnode *, atom_list old_list,
76135 + atom_list new_list, int check_lists));
76136 +
76137 +typedef struct protected_jnodes {
76138 + struct list_head inatom; /* link to atom's list these structures */
76139 + struct list_head nodes; /* head of list of protected nodes */
76140 +} protected_jnodes;
76141 +
76142 +/* A transaction handle: the client obtains and commits this handle which is assigned by
76143 + the system to a txn_atom. */
76144 +struct txn_handle {
76145 + /* Spinlock protecting ->atom pointer */
76146 + spinlock_t hlock;
76147 +
76148 + /* Flags for controlling commit_txnh() behavior */
76149 + /* from txn_handle_flags_t */
76150 + txn_handle_flags_t flags;
76151 +
76152 + /* Whether it is READ_FUSING or WRITE_FUSING. */
76153 + txn_mode mode;
76154 +
76155 + /* If assigned, the atom it is part of. */
76156 + txn_atom *atom;
76157 +
76158 + /* Transaction list link. Head is in txn_atom. */
76159 + struct list_head txnh_link;
76160 +};
76161 +
76162 +/* The transaction manager: one is contained in the reiser4_super_info_data */
76163 +struct txn_mgr {
76164 + /* A spinlock protecting the atom list, id_count, flush_control */
76165 + spinlock_t tmgr_lock;
76166 +
76167 + /* List of atoms. */
76168 + struct list_head atoms_list;
76169 +
76170 + /* Number of atoms. */
76171 + int atom_count;
76172 +
76173 + /* A counter used to assign atom->atom_id values. */
76174 + __u32 id_count;
76175 +
76176 + /* a mutex object for commit serialization */
76177 + struct mutex commit_mutex;
76178 +
76179 + /* a list of all txnmrgs served by particular daemon. */
76180 + struct list_head linkage;
76181 +
76182 + /* description of daemon for this txnmgr */
76183 + ktxnmgrd_context *daemon;
76184 +
76185 + /* parameters. Adjustable through mount options. */
76186 + unsigned int atom_max_size;
76187 + unsigned int atom_max_age;
76188 + unsigned int atom_min_size;
76189 + /* max number of concurrent flushers for one atom, 0 - unlimited. */
76190 + unsigned int atom_max_flushers;
76191 + struct dentry *debugfs_atom_count;
76192 + struct dentry *debugfs_id_count;
76193 +};
76194 +
76195 +/* FUNCTION DECLARATIONS */
76196 +
76197 +/* These are the externally (within Reiser4) visible transaction functions, therefore they
76198 + are prefixed with "txn_". For comments, see txnmgr.c. */
76199 +
76200 +extern int init_txnmgr_static(void);
76201 +extern void done_txnmgr_static(void);
76202 +
76203 +extern void reiser4_init_txnmgr(txn_mgr *);
76204 +extern void reiser4_done_txnmgr(txn_mgr *);
76205 +
76206 +extern int reiser4_txn_reserve(int reserved);
76207 +
76208 +extern void reiser4_txn_begin(reiser4_context * context);
76209 +extern int reiser4_txn_end(reiser4_context * context);
76210 +
76211 +extern void reiser4_txn_restart(reiser4_context * context);
76212 +extern void reiser4_txn_restart_current(void);
76213 +
76214 +extern int txnmgr_force_commit_all(struct super_block *, int);
76215 +extern int current_atom_should_commit(void);
76216 +
76217 +extern jnode *find_first_dirty_jnode(txn_atom *, int);
76218 +
76219 +extern int commit_some_atoms(txn_mgr *);
76220 +extern int force_commit_atom(txn_handle *);
76221 +extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
76222 +
76223 +extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
76224 +
76225 +extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
76226 +
76227 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
76228 + int alloc_value);
76229 +extern void atom_dec_and_unlock(txn_atom * atom);
76230 +
76231 +extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
76232 +extern int try_capture_page_to_invalidate(struct page *pg);
76233 +
76234 +extern void reiser4_uncapture_page(struct page *pg);
76235 +extern void reiser4_uncapture_block(jnode *);
76236 +extern void reiser4_uncapture_jnode(jnode *);
76237 +
76238 +extern int reiser4_capture_inode(struct inode *);
76239 +extern int reiser4_uncapture_inode(struct inode *);
76240 +
76241 +extern txn_atom *get_current_atom_locked_nocheck(void);
76242 +
76243 +#if REISER4_DEBUG
76244 +
76245 +/**
76246 + * atom_is_protected - make sure that nobody but us can do anything with atom
76247 + * @atom: atom to be checked
76248 + *
76249 + * This is used to assert that atom either entered commit stages or is spin
76250 + * locked.
76251 + */
76252 +static inline int atom_is_protected(txn_atom *atom)
76253 +{
76254 + if (atom->stage >= ASTAGE_PRE_COMMIT)
76255 + return 1;
76256 + assert_spin_locked(&(atom->alock));
76257 + return 1;
76258 +}
76259 +
76260 +#endif
76261 +
76262 +/* Get the current atom and spinlock it if current atom present. May not return NULL */
76263 +static inline txn_atom *get_current_atom_locked(void)
76264 +{
76265 + txn_atom *atom;
76266 +
76267 + atom = get_current_atom_locked_nocheck();
76268 + assert("zam-761", atom != NULL);
76269 +
76270 + return atom;
76271 +}
76272 +
76273 +extern txn_atom *jnode_get_atom(jnode *);
76274 +
76275 +extern void reiser4_atom_wait_event(txn_atom *);
76276 +extern void reiser4_atom_send_event(txn_atom *);
76277 +
76278 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
76279 +extern int reiser4_capture_super_block(struct super_block *s);
76280 +int capture_bulk(jnode **, int count);
76281 +
76282 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
76283 + calling convention of these three routines. */
76284 +extern void blocknr_set_init(struct list_head * bset);
76285 +extern void blocknr_set_destroy(struct list_head * bset);
76286 +extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
76287 +extern int blocknr_set_add_extent(txn_atom * atom,
76288 + struct list_head * bset,
76289 + blocknr_set_entry ** new_bsep,
76290 + const reiser4_block_nr * start,
76291 + const reiser4_block_nr * len);
76292 +extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
76293 + blocknr_set_entry ** new_bsep,
76294 + const reiser4_block_nr * a,
76295 + const reiser4_block_nr * b);
76296 +
76297 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
76298 + const reiser4_block_nr *, void *);
76299 +
76300 +extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
76301 + blocknr_set_actor_f actor, void *data,
76302 + int delete);
76303 +
76304 +/* flush code takes care about how to fuse flush queues */
76305 +extern void flush_init_atom(txn_atom * atom);
76306 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
76307 +
76308 +static inline void spin_lock_atom(txn_atom *atom)
76309 +{
76310 + /* check that spinlocks of lower priorities are not held */
76311 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76312 + LOCK_CNT_NIL(spin_locked_atom) &&
76313 + LOCK_CNT_NIL(spin_locked_jnode) &&
76314 + LOCK_CNT_NIL(spin_locked_zlock) &&
76315 + LOCK_CNT_NIL(rw_locked_dk) &&
76316 + LOCK_CNT_NIL(rw_locked_tree)));
76317 +
76318 + spin_lock(&(atom->alock));
76319 +
76320 + LOCK_CNT_INC(spin_locked_atom);
76321 + LOCK_CNT_INC(spin_locked);
76322 +}
76323 +
76324 +static inline void spin_lock_atom_nested(txn_atom *atom)
76325 +{
76326 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76327 + LOCK_CNT_NIL(spin_locked_jnode) &&
76328 + LOCK_CNT_NIL(spin_locked_zlock) &&
76329 + LOCK_CNT_NIL(rw_locked_dk) &&
76330 + LOCK_CNT_NIL(rw_locked_tree)));
76331 +
76332 + spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
76333 +
76334 + LOCK_CNT_INC(spin_locked_atom);
76335 + LOCK_CNT_INC(spin_locked);
76336 +}
76337 +
76338 +static inline int spin_trylock_atom(txn_atom *atom)
76339 +{
76340 + if (spin_trylock(&(atom->alock))) {
76341 + LOCK_CNT_INC(spin_locked_atom);
76342 + LOCK_CNT_INC(spin_locked);
76343 + return 1;
76344 + }
76345 + return 0;
76346 +}
76347 +
76348 +static inline void spin_unlock_atom(txn_atom *atom)
76349 +{
76350 + assert_spin_locked(&(atom->alock));
76351 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
76352 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76353 +
76354 + LOCK_CNT_DEC(spin_locked_atom);
76355 + LOCK_CNT_DEC(spin_locked);
76356 +
76357 + spin_unlock(&(atom->alock));
76358 +}
76359 +
76360 +static inline void spin_lock_txnh(txn_handle *txnh)
76361 +{
76362 + /* check that spinlocks of lower priorities are not held */
76363 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
76364 + LOCK_CNT_NIL(spin_locked_zlock) &&
76365 + LOCK_CNT_NIL(rw_locked_tree)));
76366 +
76367 + spin_lock(&(txnh->hlock));
76368 +
76369 + LOCK_CNT_INC(spin_locked_txnh);
76370 + LOCK_CNT_INC(spin_locked);
76371 +}
76372 +
76373 +static inline int spin_trylock_txnh(txn_handle *txnh)
76374 +{
76375 + if (spin_trylock(&(txnh->hlock))) {
76376 + LOCK_CNT_INC(spin_locked_txnh);
76377 + LOCK_CNT_INC(spin_locked);
76378 + return 1;
76379 + }
76380 + return 0;
76381 +}
76382 +
76383 +static inline void spin_unlock_txnh(txn_handle *txnh)
76384 +{
76385 + assert_spin_locked(&(txnh->hlock));
76386 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
76387 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76388 +
76389 + LOCK_CNT_DEC(spin_locked_txnh);
76390 + LOCK_CNT_DEC(spin_locked);
76391 +
76392 + spin_unlock(&(txnh->hlock));
76393 +}
76394 +
76395 +#define spin_ordering_pred_txnmgr(tmgr) \
76396 + ( LOCK_CNT_NIL(spin_locked_atom) && \
76397 + LOCK_CNT_NIL(spin_locked_txnh) && \
76398 + LOCK_CNT_NIL(spin_locked_jnode) && \
76399 + LOCK_CNT_NIL(rw_locked_zlock) && \
76400 + LOCK_CNT_NIL(rw_locked_dk) && \
76401 + LOCK_CNT_NIL(rw_locked_tree) )
76402 +
76403 +static inline void spin_lock_txnmgr(txn_mgr *mgr)
76404 +{
76405 + /* check that spinlocks of lower priorities are not held */
76406 + assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
76407 + LOCK_CNT_NIL(spin_locked_txnh) &&
76408 + LOCK_CNT_NIL(spin_locked_jnode) &&
76409 + LOCK_CNT_NIL(spin_locked_zlock) &&
76410 + LOCK_CNT_NIL(rw_locked_dk) &&
76411 + LOCK_CNT_NIL(rw_locked_tree)));
76412 +
76413 + spin_lock(&(mgr->tmgr_lock));
76414 +
76415 + LOCK_CNT_INC(spin_locked_txnmgr);
76416 + LOCK_CNT_INC(spin_locked);
76417 +}
76418 +
76419 +static inline int spin_trylock_txnmgr(txn_mgr *mgr)
76420 +{
76421 + if (spin_trylock(&(mgr->tmgr_lock))) {
76422 + LOCK_CNT_INC(spin_locked_txnmgr);
76423 + LOCK_CNT_INC(spin_locked);
76424 + return 1;
76425 + }
76426 + return 0;
76427 +}
76428 +
76429 +static inline void spin_unlock_txnmgr(txn_mgr *mgr)
76430 +{
76431 + assert_spin_locked(&(mgr->tmgr_lock));
76432 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
76433 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76434 +
76435 + LOCK_CNT_DEC(spin_locked_txnmgr);
76436 + LOCK_CNT_DEC(spin_locked);
76437 +
76438 + spin_unlock(&(mgr->tmgr_lock));
76439 +}
76440 +
76441 +typedef enum {
76442 + FQ_IN_USE = 0x1
76443 +} flush_queue_state_t;
76444 +
76445 +typedef struct flush_queue flush_queue_t;
76446 +
76447 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
76448 + is filled by the jnode_flush() routine, and written to disk under memory
76449 + pressure or at atom commit time. */
76450 +/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
76451 + field and fq->prepped list can be modified if atom is spin-locked and fq
76452 + object is "in-use" state. For read-only traversal of the fq->prepped list
76453 + and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
76454 + only have atom spin-locked. */
76455 +struct flush_queue {
76456 + /* linkage element is the first in this structure to make debugging
76457 + easier. See field in atom struct for description of list. */
76458 + struct list_head alink;
76459 + /* A spinlock to protect changes of fq state and fq->atom pointer */
76460 + spinlock_t guard;
76461 + /* flush_queue state: [in_use | ready] */
76462 + flush_queue_state_t state;
76463 + /* A list which contains queued nodes, queued nodes are removed from any
76464 + * atom's list and put on this ->prepped one. */
76465 + struct list_head prepped;
76466 + /* number of submitted i/o requests */
76467 + atomic_t nr_submitted;
76468 + /* number of i/o errors */
76469 + atomic_t nr_errors;
76470 + /* An atom this flush queue is attached to */
76471 + txn_atom *atom;
76472 + /* A wait queue head to wait on i/o completion */
76473 + wait_queue_head_t wait;
76474 +#if REISER4_DEBUG
76475 + /* A thread which took this fq in exclusive use, NULL if fq is free,
76476 + * used for debugging. */
76477 + struct task_struct *owner;
76478 +#endif
76479 +};
76480 +
76481 +extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
76482 +extern void reiser4_fq_put_nolock(flush_queue_t *);
76483 +extern void reiser4_fq_put(flush_queue_t *);
76484 +extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
76485 +extern void queue_jnode(flush_queue_t *, jnode *);
76486 +
76487 +extern int reiser4_write_fq(flush_queue_t *, long *, int);
76488 +extern int current_atom_finish_all_fq(void);
76489 +extern void init_atom_fq_parts(txn_atom *);
76490 +
76491 +extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
76492 +
76493 +extern void znode_make_dirty(znode * node);
76494 +extern void jnode_make_dirty_locked(jnode * node);
76495 +
76496 +extern int reiser4_sync_atom(txn_atom * atom);
76497 +
76498 +#if REISER4_DEBUG
76499 +extern int atom_fq_parts_are_clean(txn_atom *);
76500 +#endif
76501 +
76502 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
76503 +extern flush_queue_t *get_fq_for_current_atom(void);
76504 +
76505 +void protected_jnodes_init(protected_jnodes * list);
76506 +void protected_jnodes_done(protected_jnodes * list);
76507 +void reiser4_invalidate_list(struct list_head * head);
76508 +
76509 +# endif /* __REISER4_TXNMGR_H__ */
76510 +
76511 +/* Make Linus happy.
76512 + Local variables:
76513 + c-indentation-style: "K&R"
76514 + mode-name: "LC"
76515 + c-basic-offset: 8
76516 + tab-width: 8
76517 + fill-column: 120
76518 + End:
76519 +*/
76520 diff -urN linux-2.6.20.orig/fs/reiser4/type_safe_hash.h linux-2.6.20/fs/reiser4/type_safe_hash.h
76521 --- linux-2.6.20.orig/fs/reiser4/type_safe_hash.h 1970-01-01 03:00:00.000000000 +0300
76522 +++ linux-2.6.20/fs/reiser4/type_safe_hash.h 2007-05-06 14:50:43.899038216 +0400
76523 @@ -0,0 +1,320 @@
76524 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76525 + * reiser4/README */
76526 +
76527 +/* A hash table class that uses hash chains (singly-linked) and is
76528 + parametrized to provide type safety. */
76529 +
76530 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
76531 +#define __REISER4_TYPE_SAFE_HASH_H__
76532 +
76533 +#include "debug.h"
76534 +
76535 +#include <asm/errno.h>
76536 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
76537 + based on the object type. You need to declare the item type before
76538 + this definition, define it after this definition. */
76539 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
76540 + \
76541 +typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
76542 +typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
76543 + \
76544 +struct PREFIX##_hash_table_ \
76545 +{ \
76546 + ITEM_TYPE **_table; \
76547 + __u32 _buckets; \
76548 +}; \
76549 + \
76550 +struct PREFIX##_hash_link_ \
76551 +{ \
76552 + ITEM_TYPE *_next; \
76553 +}
76554 +
76555 +/* Step 2: Define the object type of the hash: give it field of type
76556 + PREFIX_hash_link. */
76557 +
76558 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
76559 + the type and field name used in step 3. The arguments are:
76560 +
76561 + ITEM_TYPE The item type being hashed
76562 + KEY_TYPE The type of key being hashed
76563 + KEY_NAME The name of the key field within the item
76564 + LINK_NAME The name of the link field within the item, which you must make type PREFIX_hash_link)
76565 + HASH_FUNC The name of the hash function (or macro, takes const pointer to key)
76566 + EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
76567 +
76568 + It implements these functions:
76569 +
76570 + prefix_hash_init Initialize the table given its size.
76571 + prefix_hash_insert Insert an item
76572 + prefix_hash_insert_index Insert an item w/ precomputed hash_index
76573 + prefix_hash_find Find an item by key
76574 + prefix_hash_find_index Find an item w/ precomputed hash_index
76575 + prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
76576 + prefix_hash_remove_index Remove an item w/ precomputed hash_index
76577 +
76578 + If you'd like something to be done differently, feel free to ask me
76579 + for modifications. Additional features that could be added but
76580 + have not been:
76581 +
76582 + prefix_hash_remove_key Find and remove an item by key
76583 + prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
76584 +
76585 + The hash_function currently receives only the key as an argument,
76586 + meaning it must somehow know the number of buckets. If this is a
76587 + problem let me know.
76588 +
76589 + This hash table uses a single-linked hash chain. This means
76590 + insertion is fast but deletion requires searching the chain.
76591 +
76592 + There is also the doubly-linked hash chain approach, under which
76593 + deletion requires no search but the code is longer and it takes two
76594 + pointers per item.
76595 +
76596 + The circularly-linked approach has the shortest code but requires
76597 + two pointers per bucket, doubling the size of the bucket array (in
76598 + addition to two pointers per item).
76599 +*/
76600 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
76601 + \
76602 +static __inline__ void \
76603 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
76604 + __u32 hash UNUSED_ARG) \
76605 +{ \
76606 + assert("nikita-2780", hash < table->_buckets); \
76607 +} \
76608 + \
76609 +static __inline__ int \
76610 +PREFIX##_hash_init (PREFIX##_hash_table *hash, \
76611 + __u32 buckets) \
76612 +{ \
76613 + hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
76614 + hash->_buckets = buckets; \
76615 + if (hash->_table == NULL) \
76616 + { \
76617 + return RETERR(-ENOMEM); \
76618 + } \
76619 + memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
76620 + ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
76621 + return 0; \
76622 +} \
76623 + \
76624 +static __inline__ void \
76625 +PREFIX##_hash_done (PREFIX##_hash_table *hash) \
76626 +{ \
76627 + if (REISER4_DEBUG && hash->_table != NULL) { \
76628 + __u32 i; \
76629 + for (i = 0 ; i < hash->_buckets ; ++ i) \
76630 + assert("nikita-2905", hash->_table[i] == NULL); \
76631 + } \
76632 + if (hash->_table != NULL) \
76633 + KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
76634 + hash->_table = NULL; \
76635 +} \
76636 + \
76637 +static __inline__ void \
76638 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
76639 +{ \
76640 + prefetch(item->LINK_NAME._next); \
76641 +} \
76642 + \
76643 +static __inline__ void \
76644 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
76645 + __u32 index) \
76646 +{ \
76647 + prefetch(hash->_table[index]); \
76648 +} \
76649 + \
76650 +static __inline__ ITEM_TYPE* \
76651 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
76652 + __u32 hash_index, \
76653 + KEY_TYPE const *find_key) \
76654 +{ \
76655 + ITEM_TYPE *item; \
76656 + \
76657 + PREFIX##_check_hash(hash, hash_index); \
76658 + \
76659 + for (item = hash->_table[hash_index]; \
76660 + item != NULL; \
76661 + item = item->LINK_NAME._next) \
76662 + { \
76663 + prefetch(item->LINK_NAME._next); \
76664 + prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
76665 + if (EQ_FUNC (& item->KEY_NAME, find_key)) \
76666 + { \
76667 + return item; \
76668 + } \
76669 + } \
76670 + \
76671 + return NULL; \
76672 +} \
76673 + \
76674 +static __inline__ ITEM_TYPE* \
76675 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
76676 + __u32 hash_index, \
76677 + KEY_TYPE const *find_key) \
76678 +{ \
76679 + ITEM_TYPE ** item = &hash->_table[hash_index]; \
76680 + \
76681 + PREFIX##_check_hash(hash, hash_index); \
76682 + \
76683 + while (*item != NULL) { \
76684 + prefetch(&(*item)->LINK_NAME._next); \
76685 + if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
76686 + ITEM_TYPE *found; \
76687 + \
76688 + found = *item; \
76689 + *item = found->LINK_NAME._next; \
76690 + found->LINK_NAME._next = hash->_table[hash_index]; \
76691 + hash->_table[hash_index] = found; \
76692 + return found; \
76693 + } \
76694 + item = &(*item)->LINK_NAME._next; \
76695 + } \
76696 + return NULL; \
76697 +} \
76698 + \
76699 +static __inline__ int \
76700 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
76701 + __u32 hash_index, \
76702 + ITEM_TYPE *del_item) \
76703 +{ \
76704 + ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
76705 + \
76706 + PREFIX##_check_hash(hash, hash_index); \
76707 + \
76708 + while (*hash_item_p != NULL) { \
76709 + prefetch(&(*hash_item_p)->LINK_NAME._next); \
76710 + if (*hash_item_p == del_item) { \
76711 + *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
76712 + return 1; \
76713 + } \
76714 + hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
76715 + } \
76716 + return 0; \
76717 +} \
76718 + \
76719 +static __inline__ void \
76720 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
76721 + __u32 hash_index, \
76722 + ITEM_TYPE *ins_item) \
76723 +{ \
76724 + PREFIX##_check_hash(hash, hash_index); \
76725 + \
76726 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76727 + hash->_table[hash_index] = ins_item; \
76728 +} \
76729 + \
76730 +static __inline__ void \
76731 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
76732 + __u32 hash_index, \
76733 + ITEM_TYPE *ins_item) \
76734 +{ \
76735 + PREFIX##_check_hash(hash, hash_index); \
76736 + \
76737 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76738 + smp_wmb(); \
76739 + hash->_table[hash_index] = ins_item; \
76740 +} \
76741 + \
76742 +static __inline__ ITEM_TYPE* \
76743 +PREFIX##_hash_find (PREFIX##_hash_table *hash, \
76744 + KEY_TYPE const *find_key) \
76745 +{ \
76746 + return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
76747 +} \
76748 + \
76749 +static __inline__ ITEM_TYPE* \
76750 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
76751 + KEY_TYPE const *find_key) \
76752 +{ \
76753 + return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
76754 +} \
76755 + \
76756 +static __inline__ int \
76757 +PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
76758 + ITEM_TYPE *del_item) \
76759 +{ \
76760 + return PREFIX##_hash_remove_index (hash, \
76761 + HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
76762 +} \
76763 + \
76764 +static __inline__ int \
76765 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
76766 + ITEM_TYPE *del_item) \
76767 +{ \
76768 + return PREFIX##_hash_remove (hash, del_item); \
76769 +} \
76770 + \
76771 +static __inline__ void \
76772 +PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
76773 + ITEM_TYPE *ins_item) \
76774 +{ \
76775 + return PREFIX##_hash_insert_index (hash, \
76776 + HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
76777 +} \
76778 + \
76779 +static __inline__ void \
76780 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
76781 + ITEM_TYPE *ins_item) \
76782 +{ \
76783 + return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
76784 + ins_item); \
76785 +} \
76786 + \
76787 +static __inline__ ITEM_TYPE * \
76788 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
76789 +{ \
76790 + ITEM_TYPE *first; \
76791 + \
76792 + for (first = NULL; ind < hash->_buckets; ++ ind) { \
76793 + first = hash->_table[ind]; \
76794 + if (first != NULL) \
76795 + break; \
76796 + } \
76797 + return first; \
76798 +} \
76799 + \
76800 +static __inline__ ITEM_TYPE * \
76801 +PREFIX##_hash_next (PREFIX##_hash_table *hash, \
76802 + ITEM_TYPE *item) \
76803 +{ \
76804 + ITEM_TYPE *next; \
76805 + \
76806 + if (item == NULL) \
76807 + return NULL; \
76808 + next = item->LINK_NAME._next; \
76809 + if (next == NULL) \
76810 + next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
76811 + return next; \
76812 +} \
76813 + \
76814 +typedef struct {} PREFIX##_hash_dummy
76815 +
76816 +#define for_all_ht_buckets(table, head) \
76817 +for ((head) = &(table) -> _table[ 0 ] ; \
76818 + (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
76819 +
76820 +#define for_all_in_bucket(bucket, item, next, field) \
76821 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
76822 + (item) != NULL ; \
76823 + (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
76824 +
76825 +#define for_all_in_htable(table, prefix, item, next) \
76826 +for ((item) = prefix ## _hash_first ((table), 0), \
76827 + (next) = prefix ## _hash_next ((table), (item)) ; \
76828 + (item) != NULL ; \
76829 + (item) = (next), \
76830 + (next) = prefix ## _hash_next ((table), (item)))
76831 +
76832 +/* __REISER4_TYPE_SAFE_HASH_H__ */
76833 +#endif
76834 +
76835 +/* Make Linus happy.
76836 + Local variables:
76837 + c-indentation-style: "K&R"
76838 + mode-name: "LC"
76839 + c-basic-offset: 8
76840 + tab-width: 8
76841 + fill-column: 120
76842 + End:
76843 +*/
76844 diff -urN linux-2.6.20.orig/fs/reiser4/vfs_ops.c linux-2.6.20/fs/reiser4/vfs_ops.c
76845 --- linux-2.6.20.orig/fs/reiser4/vfs_ops.c 1970-01-01 03:00:00.000000000 +0300
76846 +++ linux-2.6.20/fs/reiser4/vfs_ops.c 2007-05-06 14:50:43.899038216 +0400
76847 @@ -0,0 +1,259 @@
76848 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76849 + * reiser4/README */
76850 +
76851 +/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
76852 + here. */
76853 +
76854 +#include "forward.h"
76855 +#include "debug.h"
76856 +#include "dformat.h"
76857 +#include "coord.h"
76858 +#include "plugin/item/item.h"
76859 +#include "plugin/file/file.h"
76860 +#include "plugin/security/perm.h"
76861 +#include "plugin/disk_format/disk_format.h"
76862 +#include "plugin/plugin.h"
76863 +#include "plugin/plugin_set.h"
76864 +#include "plugin/object.h"
76865 +#include "txnmgr.h"
76866 +#include "jnode.h"
76867 +#include "znode.h"
76868 +#include "block_alloc.h"
76869 +#include "tree.h"
76870 +#include "vfs_ops.h"
76871 +#include "inode.h"
76872 +#include "page_cache.h"
76873 +#include "ktxnmgrd.h"
76874 +#include "super.h"
76875 +#include "reiser4.h"
76876 +#include "entd.h"
76877 +#include "status_flags.h"
76878 +#include "flush.h"
76879 +#include "dscale.h"
76880 +
76881 +#include <linux/profile.h>
76882 +#include <linux/types.h>
76883 +#include <linux/mount.h>
76884 +#include <linux/vfs.h>
76885 +#include <linux/mm.h>
76886 +#include <linux/buffer_head.h>
76887 +#include <linux/dcache.h>
76888 +#include <linux/list.h>
76889 +#include <linux/pagemap.h>
76890 +#include <linux/slab.h>
76891 +#include <linux/seq_file.h>
76892 +#include <linux/init.h>
76893 +#include <linux/module.h>
76894 +#include <linux/writeback.h>
76895 +#include <linux/blkdev.h>
76896 +#include <linux/quotaops.h>
76897 +#include <linux/security.h>
76898 +#include <linux/reboot.h>
76899 +#include <linux/rcupdate.h>
76900 +
76901 +/* update inode stat-data by calling plugin */
76902 +int reiser4_update_sd(struct inode *object)
76903 +{
76904 + file_plugin *fplug;
76905 +
76906 + assert("nikita-2338", object != NULL);
76907 + /* check for read-only file system. */
76908 + if (IS_RDONLY(object))
76909 + return 0;
76910 +
76911 + fplug = inode_file_plugin(object);
76912 + assert("nikita-2339", fplug != NULL);
76913 + return fplug->write_sd_by_inode(object);
76914 +}
76915 +
76916 +/* helper function: increase inode nlink count and call plugin method to save
76917 + updated stat-data.
76918 +
76919 + Used by link/create and during creation of dot and dotdot in mkdir
76920 +*/
76921 +int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
76922 + struct inode *parent /* parent where new entry will be */
76923 + ,
76924 + int write_sd_p /* true if stat-data has to be
76925 + * updated */ )
76926 +{
76927 + file_plugin *fplug;
76928 + int result;
76929 +
76930 + assert("nikita-1351", object != NULL);
76931 +
76932 + fplug = inode_file_plugin(object);
76933 + assert("nikita-1445", fplug != NULL);
76934 +
76935 + /* ask plugin whether it can add yet another link to this
76936 + object */
76937 + if (!fplug->can_add_link(object))
76938 + return RETERR(-EMLINK);
76939 +
76940 + assert("nikita-2211", fplug->add_link != NULL);
76941 + /* call plugin to do actual addition of link */
76942 + result = fplug->add_link(object, parent);
76943 +
76944 + /* optionally update stat data */
76945 + if (result == 0 && write_sd_p)
76946 + result = fplug->write_sd_by_inode(object);
76947 + return result;
76948 +}
76949 +
76950 +/* helper function: decrease inode nlink count and call plugin method to save
76951 + updated stat-data.
76952 +
76953 + Used by unlink/create
76954 +*/
76955 +int reiser4_del_nlink(struct inode *object /* object from which link is
76956 + * removed */ ,
76957 + struct inode *parent /* parent where entry was */ ,
76958 + int write_sd_p /* true is stat-data has to be
76959 + * updated */ )
76960 +{
76961 + file_plugin *fplug;
76962 + int result;
76963 +
76964 + assert("nikita-1349", object != NULL);
76965 +
76966 + fplug = inode_file_plugin(object);
76967 + assert("nikita-1350", fplug != NULL);
76968 + assert("nikita-1446", object->i_nlink > 0);
76969 + assert("nikita-2210", fplug->rem_link != NULL);
76970 +
76971 + /* call plugin to do actual deletion of link */
76972 + result = fplug->rem_link(object, parent);
76973 +
76974 + /* optionally update stat data */
76975 + if (result == 0 && write_sd_p)
76976 + result = fplug->write_sd_by_inode(object);
76977 + return result;
76978 +}
76979 +
76980 +/* Release reiser4 dentry. This is d_op->d_release() method. */
76981 +static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
76982 +{
76983 + reiser4_free_dentry_fsdata(dentry);
76984 +}
76985 +
76986 +/*
76987 + * Called by reiser4_sync_inodes(), during speculative write-back (through
76988 + * pdflush, or balance_dirty_pages()).
76989 + */
76990 +void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
76991 +{
76992 + long written = 0;
76993 + int repeats = 0;
76994 + int result;
76995 + struct address_space *mapping;
76996 +
76997 + /*
76998 + * Performs early flushing, trying to free some memory. If there is
76999 + * nothing to flush, commits some atoms.
77000 + */
77001 +
77002 + /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
77003 + sys_fsync(). */
77004 + if (wbc->sync_mode != WB_SYNC_NONE) {
77005 + txnmgr_force_commit_all(sb, 0);
77006 + return;
77007 + }
77008 +
77009 + BUG_ON(reiser4_get_super_fake(sb) == NULL);
77010 + mapping = reiser4_get_super_fake(sb)->i_mapping;
77011 + do {
77012 + long nr_submitted = 0;
77013 + jnode *node = NULL;
77014 +
77015 + /* do not put more requests to overload write queue */
77016 + if (wbc->nonblocking &&
77017 + bdi_write_congested(mapping->backing_dev_info)) {
77018 + blk_run_address_space(mapping);
77019 + wbc->encountered_congestion = 1;
77020 + break;
77021 + }
77022 + repeats++;
77023 + BUG_ON(wbc->nr_to_write <= 0);
77024 +
77025 + if (get_current_context()->entd) {
77026 + entd_context *ent = get_entd_context(sb);
77027 +
77028 + if (ent->cur_request->node)
77029 + /*
77030 + * this is ent thread and it managed to capture
77031 + * requested page itself - start flush from
77032 + * that page
77033 + */
77034 + node = jref(ent->cur_request->node);
77035 + }
77036 +
77037 + result = flush_some_atom(node, &nr_submitted, wbc,
77038 + JNODE_FLUSH_WRITE_BLOCKS);
77039 + if (result != 0)
77040 + warning("nikita-31001", "Flush failed: %i", result);
77041 + if (node)
77042 + jput(node);
77043 + if (!nr_submitted)
77044 + break;
77045 +
77046 + wbc->nr_to_write -= nr_submitted;
77047 + written += nr_submitted;
77048 + } while (wbc->nr_to_write > 0);
77049 +}
77050 +
77051 +void reiser4_throttle_write(struct inode *inode)
77052 +{
77053 + reiser4_txn_restart_current();
77054 + balance_dirty_pages_ratelimited(inode->i_mapping);
77055 +}
77056 +
77057 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
77058 +const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
77059 + * beginning of device */
77060 +
77061 +/*
77062 + * Reiser4 initialization/shutdown.
77063 + *
77064 + * Code below performs global reiser4 initialization that is done either as
77065 + * part of kernel initialization (when reiser4 is statically built-in), or
77066 + * during reiser4 module load (when compiled as module).
77067 + */
77068 +
77069 +void reiser4_handle_error(void)
77070 +{
77071 + struct super_block *sb = reiser4_get_current_sb();
77072 +
77073 + if (!sb)
77074 + return;
77075 + reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
77076 + "Filesystem error occured");
77077 + switch (get_super_private(sb)->onerror) {
77078 + case 0:
77079 + reiser4_panic("foobar-42", "Filesystem error occured\n");
77080 + case 1:
77081 + default:
77082 + if (sb->s_flags & MS_RDONLY)
77083 + return;
77084 + sb->s_flags |= MS_RDONLY;
77085 + break;
77086 + }
77087 +}
77088 +
77089 +struct dentry_operations reiser4_dentry_operations = {
77090 + .d_revalidate = NULL,
77091 + .d_hash = NULL,
77092 + .d_compare = NULL,
77093 + .d_delete = NULL,
77094 + .d_release = reiser4_d_release,
77095 + .d_iput = NULL,
77096 +};
77097 +
77098 +/* Make Linus happy.
77099 + Local variables:
77100 + c-indentation-style: "K&R"
77101 + mode-name: "LC"
77102 + c-basic-offset: 8
77103 + tab-width: 8
77104 + fill-column: 120
77105 + End:
77106 +*/
77107 diff -urN linux-2.6.20.orig/fs/reiser4/vfs_ops.h linux-2.6.20/fs/reiser4/vfs_ops.h
77108 --- linux-2.6.20.orig/fs/reiser4/vfs_ops.h 1970-01-01 03:00:00.000000000 +0300
77109 +++ linux-2.6.20/fs/reiser4/vfs_ops.h 2007-05-06 14:50:43.899038216 +0400
77110 @@ -0,0 +1,53 @@
77111 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77112 + * reiser4/README */
77113 +
77114 +/* vfs_ops.c's exported symbols */
77115 +
77116 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
77117 +#define __FS_REISER4_VFS_OPS_H__
77118 +
77119 +#include "forward.h"
77120 +#include "coord.h"
77121 +#include "seal.h"
77122 +#include "plugin/file/file.h"
77123 +#include "super.h"
77124 +#include "readahead.h"
77125 +
77126 +#include <linux/types.h> /* for loff_t */
77127 +#include <linux/fs.h> /* for struct address_space */
77128 +#include <linux/dcache.h> /* for struct dentry */
77129 +#include <linux/mm.h>
77130 +#include <linux/backing-dev.h>
77131 +
77132 +/* address space operations */
77133 +int reiser4_writepage(struct page *, struct writeback_control *);
77134 +int reiser4_set_page_dirty(struct page *);
77135 +void reiser4_invalidatepage(struct page *, unsigned long offset);
77136 +int reiser4_releasepage(struct page *, gfp_t);
77137 +
77138 +extern int reiser4_update_sd(struct inode *);
77139 +extern int reiser4_add_nlink(struct inode *, struct inode *, int);
77140 +extern int reiser4_del_nlink(struct inode *, struct inode *, int);
77141 +
77142 +extern int reiser4_start_up_io(struct page *page);
77143 +extern void reiser4_throttle_write(struct inode *);
77144 +extern int jnode_is_releasable(jnode *);
77145 +
77146 +#define CAPTURE_APAGE_BURST (1024l)
77147 +void reiser4_writeout(struct super_block *, struct writeback_control *);
77148 +
77149 +extern void reiser4_handle_error(void);
77150 +
77151 +/* __FS_REISER4_VFS_OPS_H__ */
77152 +#endif
77153 +
77154 +/* Make Linus happy.
77155 + Local variables:
77156 + c-indentation-style: "K&R"
77157 + mode-name: "LC"
77158 + c-basic-offset: 8
77159 + tab-width: 8
77160 + fill-column: 120
77161 + scroll-step: 1
77162 + End:
77163 +*/
77164 diff -urN linux-2.6.20.orig/fs/reiser4/wander.c linux-2.6.20/fs/reiser4/wander.c
77165 --- linux-2.6.20.orig/fs/reiser4/wander.c 1970-01-01 03:00:00.000000000 +0300
77166 +++ linux-2.6.20/fs/reiser4/wander.c 2007-05-06 14:50:43.903039466 +0400
77167 @@ -0,0 +1,1797 @@
77168 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77169 + * reiser4/README */
77170 +
77171 +/* Reiser4 Wandering Log */
77172 +
77173 +/* You should read http://www.namesys.com/txn-doc.html
77174 +
77175 + That describes how filesystem operations are performed as atomic
77176 + transactions, and how we try to arrange it so that we can write most of the
77177 + data only once while performing the operation atomically.
77178 +
77179 + For the purposes of this code, it is enough for it to understand that it
77180 + has been told a given block should be written either once, or twice (if
77181 + twice then once to the wandered location and once to the real location).
77182 +
77183 + This code guarantees that those blocks that are defined to be part of an
77184 + atom either all take effect or none of them take effect.
77185 +
77186 + Relocate set nodes are submitted to write by the jnode_flush() routine, and
77187 + the overwrite set is submitted by reiser4_write_log(). This is because with
77188 + the overwrite set we seek to optimize writes, and with the relocate set we
77189 + seek to cause disk order to correlate with the parent first pre-order.
77190 +
77191 + reiser4_write_log() allocates and writes wandered blocks and maintains
77192 + additional on-disk structures of the atom as wander records (each wander
77193 + record occupies one block) for storing of the "wandered map" (a table which
77194 + contains a relation between wandered and real block numbers) and other
77195 + information which might be needed at transaction recovery time.
77196 +
77197 + The wander records are unidirectionally linked into a circle: each wander
77198 + record contains a block number of the next wander record, the last wander
77199 + record points to the first one.
77200 +
77201 + One wander record (named "tx head" in this file) has a format which is
77202 + different from the other wander records. The "tx head" has a reference to the
77203 + "tx head" block of the previously committed atom. Also, "tx head" contains
77204 + fs information (the free blocks counter, and the oid allocator state) which
77205 + is logged in a special way .
77206 +
77207 + There are two journal control blocks, named journal header and journal
77208 + footer which have fixed on-disk locations. The journal header has a
77209 + reference to the "tx head" block of the last committed atom. The journal
77210 + footer points to the "tx head" of the last flushed atom. The atom is
77211 + "played" when all blocks from its overwrite set are written to disk the
77212 + second time (i.e. written to their real locations).
77213 +
77214 + NOTE: People who know reiserfs internals and its journal structure might be
77215 + confused with these terms journal footer and journal header. There is a table
77216 + with terms of similar semantics in reiserfs (reiser3) and reiser4:
77217 +
77218 + REISER3 TERM | REISER4 TERM | DESCRIPTION
77219 + --------------------+-----------------------+----------------------------
77220 + commit record | journal header | atomic write of this record
77221 + | | ends transaction commit
77222 + --------------------+-----------------------+----------------------------
77223 + journal header | journal footer | atomic write of this record
77224 + | | ends post-commit writes.
77225 + | | After successful
77226 + | | writing of this journal
77227 + | | blocks (in reiser3) or
77228 + | | wandered blocks/records are
77229 + | | free for re-use.
77230 + --------------------+-----------------------+----------------------------
77231 +
77232 + The atom commit process is the following:
77233 +
77234 + 1. The overwrite set is taken from atom's clean list, and its size is
77235 + counted.
77236 +
77237 + 2. The number of necessary wander records (including tx head) is calculated,
77238 + and the wander record blocks are allocated.
77239 +
77240 + 3. Allocate wandered blocks and populate wander records by wandered map.
77241 +
77242 + 4. submit write requests for wander records and wandered blocks.
77243 +
77244 + 5. wait until submitted write requests complete.
77245 +
77246 + 6. update journal header: change the pointer to the block number of just
77247 + written tx head, submit an i/o for modified journal header block and wait
77248 + for i/o completion.
77249 +
77250 + NOTE: The special logging for bitmap blocks and some reiser4 super block
77251 + fields makes processes of atom commit, flush and recovering a bit more
77252 + complex (see comments in the source code for details).
77253 +
77254 + The atom playing process is the following:
77255 +
77256 + 1. Write atom's overwrite set in-place.
77257 +
77258 + 2. Wait on i/o.
77259 +
77260 + 3. Update journal footer: change the pointer to block number of tx head
77261 + block of the atom we currently flushing, submit an i/o, wait on i/o
77262 + completion.
77263 +
77264 + 4. Free disk space which was used for wandered blocks and wander records.
77265 +
77266 + After the freeing of wandered blocks and wander records we have that journal
77267 + footer points to the on-disk structure which might be overwritten soon.
77268 + Neither the log writer nor the journal recovery procedure use that pointer
77269 + for accessing the data. When the journal recovery procedure finds the oldest
77270 + transaction it compares the journal footer pointer value with the "prev_tx"
77271 + pointer value in tx head, if values are equal the oldest not flushed
77272 + transaction is found.
77273 +
77274 + NOTE on disk space leakage: the information about of what blocks and how many
77275 + blocks are allocated for wandered blocks, wandered records is not written to
77276 + the disk because of special logging for bitmaps and some super blocks
77277 + counters. After a system crash we the reiser4 does not remember those
77278 + objects allocation, thus we have no such a kind of disk space leakage.
77279 +*/
77280 +
77281 +/* Special logging of reiser4 super block fields. */
77282 +
77283 +/* There are some reiser4 super block fields (free block count and OID allocator
77284 + state (number of files and next free OID) which are logged separately from
77285 + super block to avoid unnecessary atom fusion.
77286 +
77287 + So, the reiser4 super block can be not captured by a transaction with
77288 + allocates/deallocates disk blocks or create/delete file objects. Moreover,
77289 + the reiser4 on-disk super block is not touched when such a transaction is
77290 + committed and flushed. Those "counters logged specially" are logged in "tx
77291 + head" blocks and in the journal footer block.
77292 +
77293 + A step-by-step description of special logging:
77294 +
77295 + 0. The per-atom information about deleted or created files and allocated or
77296 + freed blocks is collected during the transaction. The atom's
77297 + ->nr_objects_created and ->nr_objects_deleted are for object
77298 + deletion/creation tracking, the numbers of allocated and freed blocks are
77299 + calculated using atom's delete set and atom's capture list -- all new and
77300 + relocated nodes should be on atom's clean list and should have JNODE_RELOC
77301 + bit set.
77302 +
77303 + 1. The "logged specially" reiser4 super block fields have their "committed"
77304 + versions in the reiser4 in-memory super block. They get modified only at
77305 + atom commit time. The atom's commit thread has an exclusive access to those
77306 + "committed" fields because the log writer implementation supports only one
77307 + atom commit a time (there is a per-fs "commit" mutex). At
77308 + that time "committed" counters are modified using per-atom information
77309 + collected during the transaction. These counters are stored on disk as a
77310 + part of tx head block when atom is committed.
77311 +
77312 + 2. When the atom is flushed the value of the free block counter and the OID
77313 + allocator state get written to the journal footer block. A special journal
77314 + procedure (journal_recover_sb_data()) takes those values from the journal
77315 + footer and updates the reiser4 in-memory super block.
77316 +
77317 + NOTE: That means free block count and OID allocator state are logged
77318 + separately from the reiser4 super block regardless of the fact that the
77319 + reiser4 super block has fields to store both the free block counter and the
77320 + OID allocator.
77321 +
77322 + Writing the whole super block at commit time requires knowing true values of
77323 + all its fields without changes made by not yet committed transactions. It is
77324 + possible by having their "committed" version of the super block like the
77325 + reiser4 bitmap blocks have "committed" and "working" versions. However,
77326 + another scheme was implemented which stores special logged values in the
77327 + unused free space inside transaction head block. In my opinion it has an
77328 + advantage of not writing whole super block when only part of it was
77329 + modified. */
77330 +
77331 +#include "debug.h"
77332 +#include "dformat.h"
77333 +#include "txnmgr.h"
77334 +#include "jnode.h"
77335 +#include "znode.h"
77336 +#include "block_alloc.h"
77337 +#include "page_cache.h"
77338 +#include "wander.h"
77339 +#include "reiser4.h"
77340 +#include "super.h"
77341 +#include "vfs_ops.h"
77342 +#include "writeout.h"
77343 +#include "inode.h"
77344 +#include "entd.h"
77345 +
77346 +#include <linux/types.h>
77347 +#include <linux/fs.h> /* for struct super_block */
77348 +#include <linux/mm.h> /* for struct page */
77349 +#include <linux/pagemap.h>
77350 +#include <linux/bio.h> /* for struct bio */
77351 +#include <linux/blkdev.h>
77352 +
77353 +static int write_jnodes_to_disk_extent(
77354 + jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
77355 +
77356 +/* The commit_handle is a container for objects needed at atom commit time */
77357 +struct commit_handle {
77358 + /* A pointer to atom's list of OVRWR nodes */
77359 + struct list_head *overwrite_set;
77360 + /* atom's overwrite set size */
77361 + int overwrite_set_size;
77362 + /* jnodes for wander record blocks */
77363 + struct list_head tx_list;
77364 + /* number of wander records */
77365 + __u32 tx_size;
77366 + /* 'committed' sb counters are saved here until atom is completely
77367 + flushed */
77368 + __u64 free_blocks;
77369 + __u64 nr_files;
77370 + __u64 next_oid;
77371 + /* A pointer to the atom which is being committed */
77372 + txn_atom *atom;
77373 + /* A pointer to current super block */
77374 + struct super_block *super;
77375 + /* The counter of modified bitmaps */
77376 + reiser4_block_nr nr_bitmap;
77377 +};
77378 +
77379 +static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
77380 +{
77381 + memset(ch, 0, sizeof(struct commit_handle));
77382 + INIT_LIST_HEAD(&ch->tx_list);
77383 +
77384 + ch->atom = atom;
77385 + ch->super = reiser4_get_current_sb();
77386 +}
77387 +
77388 +static void done_commit_handle(struct commit_handle *ch)
77389 +{
77390 + assert("zam-690", list_empty(&ch->tx_list));
77391 +}
77392 +
77393 +static inline int reiser4_use_write_barrier(struct super_block * s)
77394 +{
77395 + return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
77396 +}
77397 +
77398 +static void disable_write_barrier(struct super_block * s)
77399 +{
77400 + notice("zam-1055", "%s does not support write barriers,"
77401 + " using synchronous write instead.", s->s_id);
77402 + set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
77403 +}
77404 +
77405 +/* fill journal header block data */
77406 +static void format_journal_header(struct commit_handle *ch)
77407 +{
77408 + struct reiser4_super_info_data *sbinfo;
77409 + struct journal_header *header;
77410 + jnode *txhead;
77411 +
77412 + sbinfo = get_super_private(ch->super);
77413 + assert("zam-479", sbinfo != NULL);
77414 + assert("zam-480", sbinfo->journal_header != NULL);
77415 +
77416 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77417 +
77418 + jload(sbinfo->journal_header);
77419 +
77420 + header = (struct journal_header *)jdata(sbinfo->journal_header);
77421 + assert("zam-484", header != NULL);
77422 +
77423 + put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
77424 + &header->last_committed_tx);
77425 +
77426 + jrelse(sbinfo->journal_header);
77427 +}
77428 +
77429 +/* fill journal footer block data */
77430 +static void format_journal_footer(struct commit_handle *ch)
77431 +{
77432 + struct reiser4_super_info_data *sbinfo;
77433 + struct journal_footer *footer;
77434 + jnode *tx_head;
77435 +
77436 + sbinfo = get_super_private(ch->super);
77437 +
77438 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77439 +
77440 + assert("zam-493", sbinfo != NULL);
77441 + assert("zam-494", sbinfo->journal_header != NULL);
77442 +
77443 + check_me("zam-691", jload(sbinfo->journal_footer) == 0);
77444 +
77445 + footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
77446 + assert("zam-495", footer != NULL);
77447 +
77448 + put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
77449 + &footer->last_flushed_tx);
77450 + put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
77451 +
77452 + put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
77453 + put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
77454 +
77455 + jrelse(sbinfo->journal_footer);
77456 +}
77457 +
77458 +/* wander record capacity depends on current block size */
77459 +static int wander_record_capacity(const struct super_block *super)
77460 +{
77461 + return (super->s_blocksize -
77462 + sizeof(struct wander_record_header)) /
77463 + sizeof(struct wander_entry);
77464 +}
77465 +
77466 +/* Fill first wander record (tx head) in accordance with supplied given data */
77467 +static void format_tx_head(struct commit_handle *ch)
77468 +{
77469 + jnode *tx_head;
77470 + jnode *next;
77471 + struct tx_header *header;
77472 +
77473 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77474 + assert("zam-692", &ch->tx_list != &tx_head->capture_link);
77475 +
77476 + next = list_entry(tx_head->capture_link.next, jnode, capture_link);
77477 + if (&ch->tx_list == &next->capture_link)
77478 + next = tx_head;
77479 +
77480 + header = (struct tx_header *)jdata(tx_head);
77481 +
77482 + assert("zam-460", header != NULL);
77483 + assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
77484 +
77485 + memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
77486 + memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
77487 +
77488 + put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
77489 + put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
77490 + &header->prev_tx);
77491 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
77492 + put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
77493 + put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
77494 + put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
77495 +}
77496 +
77497 +/* prepare ordinary wander record block (fill all service fields) */
77498 +static void
77499 +format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
77500 +{
77501 + struct wander_record_header *LRH;
77502 + jnode *next;
77503 +
77504 + assert("zam-464", node != NULL);
77505 +
77506 + LRH = (struct wander_record_header *)jdata(node);
77507 + next = list_entry(node->capture_link.next, jnode, capture_link);
77508 +
77509 + if (&ch->tx_list == &next->capture_link)
77510 + next = list_entry(ch->tx_list.next, jnode, capture_link);
77511 +
77512 + assert("zam-465", LRH != NULL);
77513 + assert("zam-463",
77514 + ch->super->s_blocksize > sizeof(struct wander_record_header));
77515 +
77516 + memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
77517 + memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
77518 +
77519 + put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
77520 + put_unaligned(cpu_to_le32(serial), &LRH->serial);
77521 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
77522 +}
77523 +
77524 +/* add one wandered map entry to formatted wander record */
77525 +static void
77526 +store_entry(jnode * node, int index, const reiser4_block_nr * a,
77527 + const reiser4_block_nr * b)
77528 +{
77529 + char *data;
77530 + struct wander_entry *pairs;
77531 +
77532 + data = jdata(node);
77533 + assert("zam-451", data != NULL);
77534 +
77535 + pairs =
77536 + (struct wander_entry *)(data + sizeof(struct wander_record_header));
77537 +
77538 + put_unaligned(cpu_to_le64(*a), &pairs[index].original);
77539 + put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
77540 +}
77541 +
77542 +/* currently, wander records contains contain only wandered map, which depend on
77543 + overwrite set size */
77544 +static void get_tx_size(struct commit_handle *ch)
77545 +{
77546 + assert("zam-440", ch->overwrite_set_size != 0);
77547 + assert("zam-695", ch->tx_size == 0);
77548 +
77549 + /* count all ordinary wander records
77550 + (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
77551 + for tx head block */
77552 + ch->tx_size =
77553 + (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
77554 + 2;
77555 +}
77556 +
77557 +/* A special structure for using in store_wmap_actor() for saving its state
77558 + between calls */
77559 +struct store_wmap_params {
77560 + jnode *cur; /* jnode of current wander record to fill */
77561 + int idx; /* free element index in wander record */
77562 + int capacity; /* capacity */
77563 +
77564 +#if REISER4_DEBUG
77565 + struct list_head *tx_list;
77566 +#endif
77567 +};
77568 +
77569 +/* an actor for use in blocknr_set_iterator routine which populates the list
77570 + of pre-formatted wander records by wandered map info */
77571 +static int
77572 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
77573 + const reiser4_block_nr * b, void *data)
77574 +{
77575 + struct store_wmap_params *params = data;
77576 +
77577 + if (params->idx >= params->capacity) {
77578 + /* a new wander record should be taken from the tx_list */
77579 + params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
77580 + assert("zam-454",
77581 + params->tx_list != &params->cur->capture_link);
77582 +
77583 + params->idx = 0;
77584 + }
77585 +
77586 + store_entry(params->cur, params->idx, a, b);
77587 + params->idx++;
77588 +
77589 + return 0;
77590 +}
77591 +
77592 +/* This function is called after Relocate set gets written to disk, Overwrite
77593 + set is written to wandered locations and all wander records are written
77594 + also. Updated journal header blocks contains a pointer (block number) to
77595 + first wander record of the just written transaction */
77596 +static int update_journal_header(struct commit_handle *ch, int use_barrier)
77597 +{
77598 + struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77599 + jnode *jh = sbinfo->journal_header;
77600 + jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
77601 + int ret;
77602 +
77603 + format_journal_header(ch);
77604 +
77605 + ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
77606 + use_barrier ? WRITEOUT_BARRIER : 0);
77607 + if (ret)
77608 + return ret;
77609 +
77610 + // blk_run_address_space(sbinfo->fake->i_mapping);
77611 + /*blk_run_queues(); */
77612 +
77613 + ret = jwait_io(jh, WRITE);
77614 +
77615 + if (ret)
77616 + return ret;
77617 +
77618 + sbinfo->last_committed_tx = *jnode_get_block(head);
77619 +
77620 + return 0;
77621 +}
77622 +
77623 +/* This function is called after write-back is finished. We update journal
77624 + footer block and free blocks which were occupied by wandered blocks and
77625 + transaction wander records */
77626 +static int update_journal_footer(struct commit_handle *ch, int use_barrier)
77627 +{
77628 + reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77629 +
77630 + jnode *jf = sbinfo->journal_footer;
77631 +
77632 + int ret;
77633 +
77634 + format_journal_footer(ch);
77635 +
77636 + ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
77637 + use_barrier ? WRITEOUT_BARRIER : 0);
77638 + if (ret)
77639 + return ret;
77640 +
77641 + // blk_run_address_space(sbinfo->fake->i_mapping);
77642 + /*blk_run_queue(); */
77643 +
77644 + ret = jwait_io(jf, WRITE);
77645 + if (ret)
77646 + return ret;
77647 +
77648 + return 0;
77649 +}
77650 +
77651 +/* free block numbers of wander records of already written in place transaction */
77652 +static void dealloc_tx_list(struct commit_handle *ch)
77653 +{
77654 + while (!list_empty(&ch->tx_list)) {
77655 + jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
77656 + list_del(&cur->capture_link);
77657 + ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
77658 + reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
77659 + BA_FORMATTED);
77660 +
77661 + unpin_jnode_data(cur);
77662 + reiser4_drop_io_head(cur);
77663 + }
77664 +}
77665 +
77666 +/* An actor for use in block_nr_iterator() routine which frees wandered blocks
77667 + from atom's overwrite set. */
77668 +static int
77669 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
77670 + const reiser4_block_nr * a UNUSED_ARG,
77671 + const reiser4_block_nr * b, void *data UNUSED_ARG)
77672 +{
77673 +
77674 + assert("zam-499", b != NULL);
77675 + assert("zam-500", *b != 0);
77676 + assert("zam-501", !reiser4_blocknr_is_fake(b));
77677 +
77678 + reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
77679 + return 0;
77680 +}
77681 +
77682 +/* free wandered block locations of already written in place transaction */
77683 +static void dealloc_wmap(struct commit_handle *ch)
77684 +{
77685 + assert("zam-696", ch->atom != NULL);
77686 +
77687 + blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
77688 + dealloc_wmap_actor, NULL, 1);
77689 +}
77690 +
77691 +/* helper function for alloc wandered blocks, which refill set of block
77692 + numbers needed for wandered blocks */
77693 +static int
77694 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
77695 +{
77696 + reiser4_blocknr_hint hint;
77697 + int ret;
77698 +
77699 + reiser4_block_nr wide_len = count;
77700 +
77701 + /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
77702 + ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
77703 + reserved allocation area so as to get the best qualities of fixed
77704 + journals? */
77705 + reiser4_blocknr_hint_init(&hint);
77706 + hint.block_stage = BLOCK_GRABBED;
77707 +
77708 + ret = reiser4_alloc_blocks(&hint, start, &wide_len,
77709 + BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
77710 + *len = (int)wide_len;
77711 +
77712 + return ret;
77713 +}
77714 +
77715 +/*
77716 + * roll back changes made before issuing BIO in the case of IO error.
77717 + */
77718 +static void undo_bio(struct bio *bio)
77719 +{
77720 + int i;
77721 +
77722 + for (i = 0; i < bio->bi_vcnt; ++i) {
77723 + struct page *pg;
77724 + jnode *node;
77725 +
77726 + pg = bio->bi_io_vec[i].bv_page;
77727 + ClearPageWriteback(pg);
77728 + node = jprivate(pg);
77729 + spin_lock_jnode(node);
77730 + JF_CLR(node, JNODE_WRITEBACK);
77731 + JF_SET(node, JNODE_DIRTY);
77732 + spin_unlock_jnode(node);
77733 + }
77734 + bio_put(bio);
77735 +}
77736 +
77737 +/* put overwrite set back to atom's clean list */
77738 +static void put_overwrite_set(struct commit_handle *ch)
77739 +{
77740 + jnode *cur;
77741 +
77742 + list_for_each_entry(cur, ch->overwrite_set, capture_link)
77743 + jrelse_tail(cur);
77744 +}
77745 +
77746 +/* Count overwrite set size, grab disk space for wandered blocks allocation.
77747 + Since we have a separate list for atom's overwrite set we just scan the list,
77748 + count bitmap and other not leaf nodes which wandered blocks allocation we
77749 + have to grab space for. */
77750 +static int get_overwrite_set(struct commit_handle *ch)
77751 +{
77752 + int ret;
77753 + jnode *cur;
77754 + __u64 nr_not_leaves = 0;
77755 +#if REISER4_DEBUG
77756 + __u64 nr_formatted_leaves = 0;
77757 + __u64 nr_unformatted_leaves = 0;
77758 +#endif
77759 +
77760 + assert("zam-697", ch->overwrite_set_size == 0);
77761 +
77762 + ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
77763 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
77764 +
77765 + while (ch->overwrite_set != &cur->capture_link) {
77766 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
77767 +
77768 + /* Count bitmap locks for getting correct statistics what number
77769 + * of blocks were cleared by the transaction commit. */
77770 + if (jnode_get_type(cur) == JNODE_BITMAP)
77771 + ch->nr_bitmap++;
77772 +
77773 + assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
77774 + || jnode_get_type(cur) == JNODE_BITMAP);
77775 +
77776 + if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
77777 + /* we replace fake znode by another (real)
77778 + znode which is suggested by disk_layout
77779 + plugin */
77780 +
77781 + /* FIXME: it looks like fake znode should be
77782 + replaced by jnode supplied by
77783 + disk_layout. */
77784 +
77785 + struct super_block *s = reiser4_get_current_sb();
77786 + reiser4_super_info_data *sbinfo =
77787 + get_current_super_private();
77788 +
77789 + if (sbinfo->df_plug->log_super) {
77790 + jnode *sj = sbinfo->df_plug->log_super(s);
77791 +
77792 + assert("zam-593", sj != NULL);
77793 +
77794 + if (IS_ERR(sj))
77795 + return PTR_ERR(sj);
77796 +
77797 + spin_lock_jnode(sj);
77798 + JF_SET(sj, JNODE_OVRWR);
77799 + insert_into_atom_ovrwr_list(ch->atom, sj);
77800 + spin_unlock_jnode(sj);
77801 +
77802 + /* jload it as the rest of overwrite set */
77803 + jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
77804 +
77805 + ch->overwrite_set_size++;
77806 + }
77807 + spin_lock_jnode(cur);
77808 + reiser4_uncapture_block(cur);
77809 + jput(cur);
77810 +
77811 + } else {
77812 + int ret;
77813 + ch->overwrite_set_size++;
77814 + ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
77815 + if (ret)
77816 + reiser4_panic("zam-783",
77817 + "cannot load e-flushed jnode back (ret = %d)\n",
77818 + ret);
77819 + }
77820 +
77821 + /* Count not leaves here because we have to grab disk space
77822 + * for wandered blocks. They were not counted as "flush
77823 + * reserved". Counting should be done _after_ nodes are pinned
77824 + * into memory by jload(). */
77825 + if (!jnode_is_leaf(cur))
77826 + nr_not_leaves++;
77827 + else {
77828 +#if REISER4_DEBUG
77829 + /* at this point @cur either has JNODE_FLUSH_RESERVED
77830 + * or is eflushed. Locking is not strong enough to
77831 + * write an assertion checking for this. */
77832 + if (jnode_is_znode(cur))
77833 + nr_formatted_leaves++;
77834 + else
77835 + nr_unformatted_leaves++;
77836 +#endif
77837 + JF_CLR(cur, JNODE_FLUSH_RESERVED);
77838 + }
77839 +
77840 + cur = next;
77841 + }
77842 +
77843 + /* Grab space for writing (wandered blocks) of not leaves found in
77844 + * overwrite set. */
77845 + ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
77846 + if (ret)
77847 + return ret;
77848 +
77849 + /* Disk space for allocation of wandered blocks of leaf nodes already
77850 + * reserved as "flush reserved", move it to grabbed space counter. */
77851 + spin_lock_atom(ch->atom);
77852 + assert("zam-940",
77853 + nr_formatted_leaves + nr_unformatted_leaves <=
77854 + ch->atom->flush_reserved);
77855 + flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
77856 + spin_unlock_atom(ch->atom);
77857 +
77858 + return ch->overwrite_set_size;
77859 +}
77860 +
77861 +/**
77862 + * write_jnodes_to_disk_extent - submit write request
77863 + * @head:
77864 + * @first: first jnode of the list
77865 + * @nr: number of jnodes on the list
77866 + * @block_p:
77867 + * @fq:
77868 + * @flags: used to decide whether page is to get PG_reclaim flag
77869 + *
77870 + * Submits a write request for @nr jnodes beginning from the @first, other
77871 + * jnodes are after the @first on the double-linked "capture" list. All jnodes
77872 + * will be written to the disk region of @nr blocks starting with @block_p block
77873 + * number. If @fq is not NULL it means that waiting for i/o completion will be
77874 + * done more efficiently by using flush_queue_t objects.
77875 + * This function is the one which writes list of jnodes in batch mode. It does
77876 + * all low-level things as bio construction and page states manipulation.
77877 + *
77878 + * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
77879 + * aggregated in this function instead of being left to the layers below
77880 + *
77881 + * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
77882 + * Why that layer needed? Why BIOs cannot be constructed here?
77883 + */
77884 +static int write_jnodes_to_disk_extent(
77885 + jnode *first, int nr, const reiser4_block_nr *block_p,
77886 + flush_queue_t *fq, int flags)
77887 +{
77888 + struct super_block *super = reiser4_get_current_sb();
77889 + int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
77890 + int max_blocks;
77891 + jnode *cur = first;
77892 + reiser4_block_nr block;
77893 +
77894 + assert("zam-571", first != NULL);
77895 + assert("zam-572", block_p != NULL);
77896 + assert("zam-570", nr > 0);
77897 +
77898 + block = *block_p;
77899 + max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
77900 +
77901 + while (nr > 0) {
77902 + struct bio *bio;
77903 + int nr_blocks = min(nr, max_blocks);
77904 + int i;
77905 + int nr_used;
77906 +
77907 + bio = bio_alloc(GFP_NOIO, nr_blocks);
77908 + if (!bio)
77909 + return RETERR(-ENOMEM);
77910 +
77911 + bio->bi_bdev = super->s_bdev;
77912 + bio->bi_sector = block * (super->s_blocksize >> 9);
77913 + for (nr_used = 0, i = 0; i < nr_blocks; i++) {
77914 + struct page *pg;
77915 +
77916 + pg = jnode_page(cur);
77917 + assert("zam-573", pg != NULL);
77918 +
77919 + page_cache_get(pg);
77920 +
77921 + lock_and_wait_page_writeback(pg);
77922 +
77923 + if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
77924 + /*
77925 + * underlying device is satiated. Stop adding
77926 + * pages to the bio.
77927 + */
77928 + unlock_page(pg);
77929 + page_cache_release(pg);
77930 + break;
77931 + }
77932 +
77933 + spin_lock_jnode(cur);
77934 + assert("nikita-3166",
77935 + pg->mapping == jnode_get_mapping(cur));
77936 + assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
77937 +#if REISER4_DEBUG
77938 + spin_lock(&cur->load);
77939 + assert("nikita-3165", !jnode_is_releasable(cur));
77940 + spin_unlock(&cur->load);
77941 +#endif
77942 + JF_SET(cur, JNODE_WRITEBACK);
77943 + JF_CLR(cur, JNODE_DIRTY);
77944 + ON_DEBUG(cur->written++);
77945 + spin_unlock_jnode(cur);
77946 +
77947 + ClearPageError(pg);
77948 + set_page_writeback(pg);
77949 +
77950 + if (get_current_context()->entd) {
77951 + /* this is ent thread */
77952 + entd_context *ent = get_entd_context(super);
77953 + struct wbq *rq, *next;
77954 +
77955 + spin_lock(&ent->guard);
77956 +
77957 + if (pg == ent->cur_request->page) {
77958 + /*
77959 + * entd is called for this page. This
77960 + * request is not in th etodo list
77961 + */
77962 + ent->cur_request->written = 1;
77963 + } else {
77964 + /*
77965 + * if we have written a page for which writepage
77966 + * is called for - move request to another list.
77967 + */
77968 + list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
77969 + assert("", rq->magic == WBQ_MAGIC);
77970 + if (pg == rq->page) {
77971 + /*
77972 + * remove request from
77973 + * entd's queue, but do
77974 + * not wake up a thread
77975 + * which put this
77976 + * request
77977 + */
77978 + list_del_init(&rq->link);
77979 + ent->nr_todo_reqs --;
77980 + list_add_tail(&rq->link, &ent->done_list);
77981 + ent->nr_done_reqs ++;
77982 + rq->written = 1;
77983 + break;
77984 + }
77985 + }
77986 + }
77987 + spin_unlock(&ent->guard);
77988 + }
77989 +
77990 + clear_page_dirty_for_io(pg);
77991 +
77992 + unlock_page(pg);
77993 +
77994 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
77995 + nr_used++;
77996 + }
77997 + if (nr_used > 0) {
77998 + assert("nikita-3453",
77999 + bio->bi_size == super->s_blocksize * nr_used);
78000 + assert("nikita-3454", bio->bi_vcnt == nr_used);
78001 +
78002 + /* Check if we are allowed to write at all */
78003 + if (super->s_flags & MS_RDONLY)
78004 + undo_bio(bio);
78005 + else {
78006 + int not_supported;
78007 +
78008 + add_fq_to_bio(fq, bio);
78009 + bio_get(bio);
78010 + reiser4_submit_bio(write_op, bio);
78011 + not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
78012 + bio_put(bio);
78013 + if (not_supported)
78014 + return -EOPNOTSUPP;
78015 + }
78016 +
78017 + block += nr_used - 1;
78018 + update_blocknr_hint_default(super, &block);
78019 + block += 1;
78020 + } else {
78021 + bio_put(bio);
78022 + }
78023 + nr -= nr_used;
78024 + }
78025 +
78026 + return 0;
78027 +}
78028 +
78029 +/* This is a procedure which recovers a contiguous sequences of disk block
78030 + numbers in the given list of j-nodes and submits write requests on this
78031 + per-sequence basis */
78032 +int
78033 +write_jnode_list(struct list_head *head, flush_queue_t *fq,
78034 + long *nr_submitted, int flags)
78035 +{
78036 + int ret;
78037 + jnode *beg = list_entry(head->next, jnode, capture_link);
78038 +
78039 + while (head != &beg->capture_link) {
78040 + int nr = 1;
78041 + jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
78042 +
78043 + while (head != &cur->capture_link) {
78044 + if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
78045 + break;
78046 + ++nr;
78047 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78048 + }
78049 +
78050 + ret = write_jnodes_to_disk_extent(
78051 + beg, nr, jnode_get_block(beg), fq, flags);
78052 + if (ret)
78053 + return ret;
78054 +
78055 + if (nr_submitted)
78056 + *nr_submitted += nr;
78057 +
78058 + beg = cur;
78059 + }
78060 +
78061 + return 0;
78062 +}
78063 +
78064 +/* add given wandered mapping to atom's wandered map */
78065 +static int
78066 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
78067 +{
78068 + int ret;
78069 + blocknr_set_entry *new_bsep = NULL;
78070 + reiser4_block_nr block;
78071 +
78072 + txn_atom *atom;
78073 +
78074 + assert("zam-568", block_p != NULL);
78075 + block = *block_p;
78076 + assert("zam-569", len > 0);
78077 +
78078 + while ((len--) > 0) {
78079 + do {
78080 + atom = get_current_atom_locked();
78081 + assert("zam-536",
78082 + !reiser4_blocknr_is_fake(jnode_get_block(cur)));
78083 + ret =
78084 + blocknr_set_add_pair(atom, &atom->wandered_map,
78085 + &new_bsep,
78086 + jnode_get_block(cur), &block);
78087 + } while (ret == -E_REPEAT);
78088 +
78089 + if (ret) {
78090 + /* deallocate blocks which were not added to wandered
78091 + map */
78092 + reiser4_block_nr wide_len = len;
78093 +
78094 + reiser4_dealloc_blocks(&block, &wide_len,
78095 + BLOCK_NOT_COUNTED,
78096 + BA_FORMATTED
78097 + /* formatted, without defer */ );
78098 +
78099 + return ret;
78100 + }
78101 +
78102 + spin_unlock_atom(atom);
78103 +
78104 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78105 + ++block;
78106 + }
78107 +
78108 + return 0;
78109 +}
78110 +
78111 +/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately
78112 + submit IO for allocated blocks. We assume that current atom is in a stage
78113 + when any atom fusion is impossible and atom is unlocked and it is safe. */
78114 +static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
78115 +{
78116 + reiser4_block_nr block;
78117 +
78118 + int rest;
78119 + int len;
78120 + int ret;
78121 +
78122 + jnode *cur;
78123 +
78124 + assert("zam-534", ch->overwrite_set_size > 0);
78125 +
78126 + rest = ch->overwrite_set_size;
78127 +
78128 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
78129 + while (ch->overwrite_set != &cur->capture_link) {
78130 + assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
78131 +
78132 + ret = get_more_wandered_blocks(rest, &block, &len);
78133 + if (ret)
78134 + return ret;
78135 +
78136 + rest -= len;
78137 +
78138 + ret = add_region_to_wmap(cur, len, &block);
78139 + if (ret)
78140 + return ret;
78141 +
78142 + ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
78143 + if (ret)
78144 + return ret;
78145 +
78146 + while ((len--) > 0) {
78147 + assert("zam-604",
78148 + ch->overwrite_set != &cur->capture_link);
78149 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78150 + }
78151 + }
78152 +
78153 + return 0;
78154 +}
78155 +
78156 +/* allocate given number of nodes over the journal area and link them into a
78157 + list, return pointer to the first jnode in the list */
78158 +static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
78159 +{
78160 + reiser4_blocknr_hint hint;
78161 + reiser4_block_nr allocated = 0;
78162 + reiser4_block_nr first, len;
78163 + jnode *cur;
78164 + jnode *txhead;
78165 + int ret;
78166 + reiser4_context *ctx;
78167 + reiser4_super_info_data *sbinfo;
78168 +
78169 + assert("zam-698", ch->tx_size > 0);
78170 + assert("zam-699", list_empty_careful(&ch->tx_list));
78171 +
78172 + ctx = get_current_context();
78173 + sbinfo = get_super_private(ctx->super);
78174 +
78175 + while (allocated < (unsigned)ch->tx_size) {
78176 + len = (ch->tx_size - allocated);
78177 +
78178 + reiser4_blocknr_hint_init(&hint);
78179 +
78180 + hint.block_stage = BLOCK_GRABBED;
78181 +
78182 + /* FIXME: there should be some block allocation policy for
78183 + nodes which contain wander records */
78184 +
78185 + /* We assume that disk space for wandered record blocks can be
78186 + * taken from reserved area. */
78187 + ret = reiser4_alloc_blocks(&hint, &first, &len,
78188 + BA_FORMATTED | BA_RESERVED |
78189 + BA_USE_DEFAULT_SEARCH_START);
78190 + reiser4_blocknr_hint_done(&hint);
78191 +
78192 + if (ret)
78193 + return ret;
78194 +
78195 + allocated += len;
78196 +
78197 + /* create jnodes for all wander records */
78198 + while (len--) {
78199 + cur = reiser4_alloc_io_head(&first);
78200 +
78201 + if (cur == NULL) {
78202 + ret = RETERR(-ENOMEM);
78203 + goto free_not_assigned;
78204 + }
78205 +
78206 + ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
78207 +
78208 + if (ret != 0) {
78209 + jfree(cur);
78210 + goto free_not_assigned;
78211 + }
78212 +
78213 + pin_jnode_data(cur);
78214 +
78215 + list_add_tail(&cur->capture_link, &ch->tx_list);
78216 +
78217 + first++;
78218 + }
78219 + }
78220 +
78221 + { /* format a on-disk linked list of wander records */
78222 + int serial = 1;
78223 +
78224 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
78225 + format_tx_head(ch);
78226 +
78227 + cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78228 + while (&ch->tx_list != &cur->capture_link) {
78229 + format_wander_record(ch, cur, serial++);
78230 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78231 + }
78232 + }
78233 +
78234 + { /* Fill wander records with Wandered Set */
78235 + struct store_wmap_params params;
78236 + txn_atom *atom;
78237 +
78238 + params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78239 +
78240 + params.idx = 0;
78241 + params.capacity =
78242 + wander_record_capacity(reiser4_get_current_sb());
78243 +
78244 + atom = get_current_atom_locked();
78245 + blocknr_set_iterator(atom, &atom->wandered_map,
78246 + &store_wmap_actor, &params, 0);
78247 + spin_unlock_atom(atom);
78248 + }
78249 +
78250 + { /* relse all jnodes from tx_list */
78251 + cur = list_entry(ch->tx_list.next, jnode, capture_link);
78252 + while (&ch->tx_list != &cur->capture_link) {
78253 + jrelse(cur);
78254 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78255 + }
78256 + }
78257 +
78258 + ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
78259 +
78260 + return ret;
78261 +
78262 + free_not_assigned:
78263 + /* We deallocate blocks not yet assigned to jnodes on tx_list. The
78264 + caller takes care about invalidating of tx list */
78265 + reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
78266 +
78267 + return ret;
78268 +}
78269 +
78270 +static int commit_tx(struct commit_handle *ch)
78271 +{
78272 + flush_queue_t *fq;
78273 + int barrier;
78274 + int ret;
78275 +
78276 + /* Grab more space for wandered records. */
78277 + ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
78278 + if (ret)
78279 + return ret;
78280 +
78281 + fq = get_fq_for_current_atom();
78282 + if (IS_ERR(fq))
78283 + return PTR_ERR(fq);
78284 +
78285 + spin_unlock_atom(fq->atom);
78286 + do {
78287 + ret = alloc_wandered_blocks(ch, fq);
78288 + if (ret)
78289 + break;
78290 + ret = alloc_tx(ch, fq);
78291 + if (ret)
78292 + break;
78293 + } while (0);
78294 +
78295 + reiser4_fq_put(fq);
78296 + if (ret)
78297 + return ret;
78298 + repeat_wo_barrier:
78299 + barrier = reiser4_use_write_barrier(ch->super);
78300 + if (!barrier) {
78301 + ret = current_atom_finish_all_fq();
78302 + if (ret)
78303 + return ret;
78304 + }
78305 + ret = update_journal_header(ch, barrier);
78306 + if (barrier) {
78307 + if (ret) {
78308 + if (ret == -EOPNOTSUPP) {
78309 + disable_write_barrier(ch->super);
78310 + goto repeat_wo_barrier;
78311 + }
78312 + return ret;
78313 + }
78314 + ret = current_atom_finish_all_fq();
78315 + }
78316 + return ret;
78317 +}
78318 +
78319 +static int write_tx_back(struct commit_handle * ch)
78320 +{
78321 + flush_queue_t *fq;
78322 + int ret;
78323 + int barrier;
78324 +
78325 + reiser4_post_commit_hook();
78326 + fq = get_fq_for_current_atom();
78327 + if (IS_ERR(fq))
78328 + return PTR_ERR(fq);
78329 + spin_unlock_atom(fq->atom);
78330 + ret = write_jnode_list(
78331 + ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
78332 + reiser4_fq_put(fq);
78333 + if (ret)
78334 + return ret;
78335 + repeat_wo_barrier:
78336 + barrier = reiser4_use_write_barrier(ch->super);
78337 + if (!barrier) {
78338 + ret = current_atom_finish_all_fq();
78339 + if (ret)
78340 + return ret;
78341 + }
78342 + ret = update_journal_footer(ch, barrier);
78343 + if (barrier) {
78344 + if (ret) {
78345 + if (ret == -EOPNOTSUPP) {
78346 + disable_write_barrier(ch->super);
78347 + goto repeat_wo_barrier;
78348 + }
78349 + return ret;
78350 + }
78351 + ret = current_atom_finish_all_fq();
78352 + }
78353 + if (ret)
78354 + return ret;
78355 + reiser4_post_write_back_hook();
78356 + return 0;
78357 +}
78358 +
78359 +/* We assume that at this moment all captured blocks are marked as RELOC or
78360 + WANDER (belong to Relocate o Overwrite set), all nodes from Relocate set
78361 + are submitted to write.
78362 +*/
78363 +
78364 +int reiser4_write_logs(long *nr_submitted)
78365 +{
78366 + txn_atom *atom;
78367 + struct super_block *super = reiser4_get_current_sb();
78368 + reiser4_super_info_data *sbinfo = get_super_private(super);
78369 + struct commit_handle ch;
78370 + int ret;
78371 +
78372 + writeout_mode_enable();
78373 +
78374 + /* block allocator may add j-nodes to the clean_list */
78375 + ret = reiser4_pre_commit_hook();
78376 + if (ret)
78377 + return ret;
78378 +
78379 + /* No locks are required if we take atom which stage >=
78380 + * ASTAGE_PRE_COMMIT */
78381 + atom = get_current_context()->trans->atom;
78382 + assert("zam-965", atom != NULL);
78383 +
78384 + /* relocate set is on the atom->clean_nodes list after
78385 + * current_atom_complete_writes() finishes. It can be safely
78386 + * uncaptured after commit_mutex is locked, because any atom that
78387 + * captures these nodes is guaranteed to commit after current one.
78388 + *
78389 + * This can only be done after reiser4_pre_commit_hook(), because it is where
78390 + * early flushed jnodes with CREATED bit are transferred to the
78391 + * overwrite list. */
78392 + reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
78393 + spin_lock_atom(atom);
78394 + /* There might be waiters for the relocate nodes which we have
78395 + * released, wake them up. */
78396 + reiser4_atom_send_event(atom);
78397 + spin_unlock_atom(atom);
78398 +
78399 + if (REISER4_DEBUG) {
78400 + int level;
78401 +
78402 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
78403 + assert("nikita-3352",
78404 + list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
78405 + }
78406 +
78407 + sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
78408 + sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
78409 +
78410 + init_commit_handle(&ch, atom);
78411 +
78412 + ch.free_blocks = sbinfo->blocks_free_committed;
78413 + ch.nr_files = sbinfo->nr_files_committed;
78414 + /* ZAM-FIXME-HANS: email me what the contention level is for the super
78415 + * lock. */
78416 + ch.next_oid = oid_next(super);
78417 +
78418 + /* count overwrite set and place it in a separate list */
78419 + ret = get_overwrite_set(&ch);
78420 +
78421 + if (ret <= 0) {
78422 + /* It is possible that overwrite set is empty here, it means
78423 + all captured nodes are clean */
78424 + goto up_and_ret;
78425 + }
78426 +
78427 + /* Inform the caller about what number of dirty pages will be
78428 + * submitted to disk. */
78429 + *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
78430 +
78431 + /* count all records needed for storing of the wandered set */
78432 + get_tx_size(&ch);
78433 +
78434 + ret = commit_tx(&ch);
78435 + if (ret)
78436 + goto up_and_ret;
78437 +
78438 + spin_lock_atom(atom);
78439 + reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
78440 + spin_unlock_atom(atom);
78441 +
78442 + ret = write_tx_back(&ch);
78443 + reiser4_post_write_back_hook();
78444 +
78445 + up_and_ret:
78446 + if (ret) {
78447 + /* there could be fq attached to current atom; the only way to
78448 + remove them is: */
78449 + current_atom_finish_all_fq();
78450 + }
78451 +
78452 + /* free blocks of flushed transaction */
78453 + dealloc_tx_list(&ch);
78454 + dealloc_wmap(&ch);
78455 +
78456 + put_overwrite_set(&ch);
78457 +
78458 + done_commit_handle(&ch);
78459 +
78460 + writeout_mode_disable();
78461 +
78462 + return ret;
78463 +}
78464 +
78465 +/* consistency checks for journal data/control blocks: header, footer, log
78466 + records, transactions head blocks. All functions return zero on success. */
78467 +
78468 +static int check_journal_header(const jnode * node UNUSED_ARG)
78469 +{
78470 + /* FIXME: journal header has no magic field yet. */
78471 + return 0;
78472 +}
78473 +
78474 +/* wait for write completion for all jnodes from given list */
78475 +static int wait_on_jnode_list(struct list_head *head)
78476 +{
78477 + jnode *scan;
78478 + int ret = 0;
78479 +
78480 + list_for_each_entry(scan, head, capture_link) {
78481 + struct page *pg = jnode_page(scan);
78482 +
78483 + if (pg) {
78484 + if (PageWriteback(pg))
78485 + wait_on_page_writeback(pg);
78486 +
78487 + if (PageError(pg))
78488 + ret++;
78489 + }
78490 + }
78491 +
78492 + return ret;
78493 +}
78494 +
78495 +static int check_journal_footer(const jnode * node UNUSED_ARG)
78496 +{
78497 + /* FIXME: journal footer has no magic field yet. */
78498 + return 0;
78499 +}
78500 +
78501 +static int check_tx_head(const jnode * node)
78502 +{
78503 + struct tx_header *header = (struct tx_header *)jdata(node);
78504 +
78505 + if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
78506 + warning("zam-627", "tx head at block %s corrupted\n",
78507 + sprint_address(jnode_get_block(node)));
78508 + return RETERR(-EIO);
78509 + }
78510 +
78511 + return 0;
78512 +}
78513 +
78514 +static int check_wander_record(const jnode * node)
78515 +{
78516 + struct wander_record_header *RH =
78517 + (struct wander_record_header *)jdata(node);
78518 +
78519 + if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
78520 + 0) {
78521 + warning("zam-628", "wander record at block %s corrupted\n",
78522 + sprint_address(jnode_get_block(node)));
78523 + return RETERR(-EIO);
78524 + }
78525 +
78526 + return 0;
78527 +}
78528 +
78529 +/* fill commit_handler structure by everything what is needed for update_journal_footer */
78530 +static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
78531 +{
78532 + struct tx_header *TXH;
78533 + int ret;
78534 +
78535 + ret = jload(tx_head);
78536 + if (ret)
78537 + return ret;
78538 +
78539 + TXH = (struct tx_header *)jdata(tx_head);
78540 +
78541 + ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
78542 + ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
78543 + ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
78544 +
78545 + jrelse(tx_head);
78546 +
78547 + list_add(&tx_head->capture_link, &ch->tx_list);
78548 +
78549 + return 0;
78550 +}
78551 +
78552 +/* replay one transaction: restore and write overwrite set in place */
78553 +static int replay_transaction(const struct super_block *s,
78554 + jnode * tx_head,
78555 + const reiser4_block_nr * log_rec_block_p,
78556 + const reiser4_block_nr * end_block,
78557 + unsigned int nr_wander_records)
78558 +{
78559 + reiser4_block_nr log_rec_block = *log_rec_block_p;
78560 + struct commit_handle ch;
78561 + LIST_HEAD(overwrite_set);
78562 + jnode *log;
78563 + int ret;
78564 +
78565 + init_commit_handle(&ch, NULL);
78566 + ch.overwrite_set = &overwrite_set;
78567 +
78568 + restore_commit_handle(&ch, tx_head);
78569 +
78570 + while (log_rec_block != *end_block) {
78571 + struct wander_record_header *header;
78572 + struct wander_entry *entry;
78573 +
78574 + int i;
78575 +
78576 + if (nr_wander_records == 0) {
78577 + warning("zam-631",
78578 + "number of wander records in the linked list"
78579 + " greater than number stored in tx head.\n");
78580 + ret = RETERR(-EIO);
78581 + goto free_ow_set;
78582 + }
78583 +
78584 + log = reiser4_alloc_io_head(&log_rec_block);
78585 + if (log == NULL)
78586 + return RETERR(-ENOMEM);
78587 +
78588 + ret = jload(log);
78589 + if (ret < 0) {
78590 + reiser4_drop_io_head(log);
78591 + return ret;
78592 + }
78593 +
78594 + ret = check_wander_record(log);
78595 + if (ret) {
78596 + jrelse(log);
78597 + reiser4_drop_io_head(log);
78598 + return ret;
78599 + }
78600 +
78601 + header = (struct wander_record_header *)jdata(log);
78602 + log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
78603 +
78604 + entry = (struct wander_entry *)(header + 1);
78605 +
78606 + /* restore overwrite set from wander record content */
78607 + for (i = 0; i < wander_record_capacity(s); i++) {
78608 + reiser4_block_nr block;
78609 + jnode *node;
78610 +
78611 + block = le64_to_cpu(get_unaligned(&entry->wandered));
78612 + if (block == 0)
78613 + break;
78614 +
78615 + node = reiser4_alloc_io_head(&block);
78616 + if (node == NULL) {
78617 + ret = RETERR(-ENOMEM);
78618 + /*
78619 + * FIXME-VS:???
78620 + */
78621 + jrelse(log);
78622 + reiser4_drop_io_head(log);
78623 + goto free_ow_set;
78624 + }
78625 +
78626 + ret = jload(node);
78627 +
78628 + if (ret < 0) {
78629 + reiser4_drop_io_head(node);
78630 + /*
78631 + * FIXME-VS:???
78632 + */
78633 + jrelse(log);
78634 + reiser4_drop_io_head(log);
78635 + goto free_ow_set;
78636 + }
78637 +
78638 + block = le64_to_cpu(get_unaligned(&entry->original));
78639 +
78640 + assert("zam-603", block != 0);
78641 +
78642 + jnode_set_block(node, &block);
78643 +
78644 + list_add_tail(&node->capture_link, ch.overwrite_set);
78645 +
78646 + ++entry;
78647 + }
78648 +
78649 + jrelse(log);
78650 + reiser4_drop_io_head(log);
78651 +
78652 + --nr_wander_records;
78653 + }
78654 +
78655 + if (nr_wander_records != 0) {
78656 + warning("zam-632", "number of wander records in the linked list"
78657 + " less than number stored in tx head.\n");
78658 + ret = RETERR(-EIO);
78659 + goto free_ow_set;
78660 + }
78661 +
78662 + { /* write wandered set in place */
78663 + write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
78664 + ret = wait_on_jnode_list(ch.overwrite_set);
78665 +
78666 + if (ret) {
78667 + ret = RETERR(-EIO);
78668 + goto free_ow_set;
78669 + }
78670 + }
78671 +
78672 + ret = update_journal_footer(&ch, 0);
78673 +
78674 + free_ow_set:
78675 +
78676 + while (!list_empty(ch.overwrite_set)) {
78677 + jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
78678 + list_del_init(&cur->capture_link);
78679 + jrelse(cur);
78680 + reiser4_drop_io_head(cur);
78681 + }
78682 +
78683 + list_del_init(&tx_head->capture_link);
78684 +
78685 + done_commit_handle(&ch);
78686 +
78687 + return ret;
78688 +}
78689 +
78690 +/* find oldest committed and not played transaction and play it. The transaction
78691 + * was committed and journal header block was updated but the blocks from the
78692 + * process of writing the atom's overwrite set in-place and updating of journal
78693 + * footer block were not completed. This function completes the process by
78694 + * recovering the atom's overwrite set from their wandered locations and writes
78695 + * them in-place and updating the journal footer. */
78696 +static int replay_oldest_transaction(struct super_block *s)
78697 +{
78698 + reiser4_super_info_data *sbinfo = get_super_private(s);
78699 + jnode *jf = sbinfo->journal_footer;
78700 + unsigned int total;
78701 + struct journal_footer *F;
78702 + struct tx_header *T;
78703 +
78704 + reiser4_block_nr prev_tx;
78705 + reiser4_block_nr last_flushed_tx;
78706 + reiser4_block_nr log_rec_block = 0;
78707 +
78708 + jnode *tx_head;
78709 +
78710 + int ret;
78711 +
78712 + if ((ret = jload(jf)) < 0)
78713 + return ret;
78714 +
78715 + F = (struct journal_footer *)jdata(jf);
78716 +
78717 + last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
78718 +
78719 + jrelse(jf);
78720 +
78721 + if (sbinfo->last_committed_tx == last_flushed_tx) {
78722 + /* all transactions are replayed */
78723 + return 0;
78724 + }
78725 +
78726 + prev_tx = sbinfo->last_committed_tx;
78727 +
78728 + /* searching for oldest not flushed transaction */
78729 + while (1) {
78730 + tx_head = reiser4_alloc_io_head(&prev_tx);
78731 + if (!tx_head)
78732 + return RETERR(-ENOMEM);
78733 +
78734 + ret = jload(tx_head);
78735 + if (ret < 0) {
78736 + reiser4_drop_io_head(tx_head);
78737 + return ret;
78738 + }
78739 +
78740 + ret = check_tx_head(tx_head);
78741 + if (ret) {
78742 + jrelse(tx_head);
78743 + reiser4_drop_io_head(tx_head);
78744 + return ret;
78745 + }
78746 +
78747 + T = (struct tx_header *)jdata(tx_head);
78748 +
78749 + prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
78750 +
78751 + if (prev_tx == last_flushed_tx)
78752 + break;
78753 +
78754 + jrelse(tx_head);
78755 + reiser4_drop_io_head(tx_head);
78756 + }
78757 +
78758 + total = le32_to_cpu(get_unaligned(&T->total));
78759 + log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
78760 +
78761 + pin_jnode_data(tx_head);
78762 + jrelse(tx_head);
78763 +
78764 + ret =
78765 + replay_transaction(s, tx_head, &log_rec_block,
78766 + jnode_get_block(tx_head), total - 1);
78767 +
78768 + unpin_jnode_data(tx_head);
78769 + reiser4_drop_io_head(tx_head);
78770 +
78771 + if (ret)
78772 + return ret;
78773 + return -E_REPEAT;
78774 +}
78775 +
78776 +/* The reiser4 journal current implementation was optimized to not to capture
78777 + super block if certain super blocks fields are modified. Currently, the set
78778 + is (<free block count>, <OID allocator>). These fields are logged by
78779 + special way which includes storing them in each transaction head block at
78780 + atom commit time and writing that information to journal footer block at
78781 + atom flush time. For getting info from journal footer block to the
78782 + in-memory super block there is a special function
78783 + reiser4_journal_recover_sb_data() which should be called after disk format
78784 + plugin re-reads super block after journal replaying.
78785 +*/
78786 +
78787 +/* get the information from journal footer in-memory super block */
78788 +int reiser4_journal_recover_sb_data(struct super_block *s)
78789 +{
78790 + reiser4_super_info_data *sbinfo = get_super_private(s);
78791 + struct journal_footer *jf;
78792 + int ret;
78793 +
78794 + assert("zam-673", sbinfo->journal_footer != NULL);
78795 +
78796 + ret = jload(sbinfo->journal_footer);
78797 + if (ret != 0)
78798 + return ret;
78799 +
78800 + ret = check_journal_footer(sbinfo->journal_footer);
78801 + if (ret != 0)
78802 + goto out;
78803 +
78804 + jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
78805 +
78806 + /* was there at least one flushed transaction? */
78807 + if (jf->last_flushed_tx) {
78808 +
78809 + /* restore free block counter logged in this transaction */
78810 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
78811 +
78812 + /* restore oid allocator state */
78813 + oid_init_allocator(s,
78814 + le64_to_cpu(get_unaligned(&jf->nr_files)),
78815 + le64_to_cpu(get_unaligned(&jf->next_oid)));
78816 + }
78817 + out:
78818 + jrelse(sbinfo->journal_footer);
78819 + return ret;
78820 +}
78821 +
78822 +/* reiser4 replay journal procedure */
78823 +int reiser4_journal_replay(struct super_block *s)
78824 +{
78825 + reiser4_super_info_data *sbinfo = get_super_private(s);
78826 + jnode *jh, *jf;
78827 + struct journal_header *header;
78828 + int nr_tx_replayed = 0;
78829 + int ret;
78830 +
78831 + assert("zam-582", sbinfo != NULL);
78832 +
78833 + jh = sbinfo->journal_header;
78834 + jf = sbinfo->journal_footer;
78835 +
78836 + if (!jh || !jf) {
78837 + /* it is possible that disk layout does not support journal
78838 + structures, we just warn about this */
78839 + warning("zam-583",
78840 + "journal control blocks were not loaded by disk layout plugin. "
78841 + "journal replaying is not possible.\n");
78842 + return 0;
78843 + }
78844 +
78845 + /* Take free block count from journal footer block. The free block
78846 + counter value corresponds the last flushed transaction state */
78847 + ret = jload(jf);
78848 + if (ret < 0)
78849 + return ret;
78850 +
78851 + ret = check_journal_footer(jf);
78852 + if (ret) {
78853 + jrelse(jf);
78854 + return ret;
78855 + }
78856 +
78857 + jrelse(jf);
78858 +
78859 + /* store last committed transaction info in reiser4 in-memory super
78860 + block */
78861 + ret = jload(jh);
78862 + if (ret < 0)
78863 + return ret;
78864 +
78865 + ret = check_journal_header(jh);
78866 + if (ret) {
78867 + jrelse(jh);
78868 + return ret;
78869 + }
78870 +
78871 + header = (struct journal_header *)jdata(jh);
78872 + sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
78873 +
78874 + jrelse(jh);
78875 +
78876 + /* replay committed transactions */
78877 + while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
78878 + nr_tx_replayed++;
78879 +
78880 + return ret;
78881 +}
78882 +
78883 +/* load journal control block (either journal header or journal footer block) */
78884 +static int
78885 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
78886 +{
78887 + int ret;
78888 +
78889 + *node = reiser4_alloc_io_head(block);
78890 + if (!(*node))
78891 + return RETERR(-ENOMEM);
78892 +
78893 + ret = jload(*node);
78894 +
78895 + if (ret) {
78896 + reiser4_drop_io_head(*node);
78897 + *node = NULL;
78898 + return ret;
78899 + }
78900 +
78901 + pin_jnode_data(*node);
78902 + jrelse(*node);
78903 +
78904 + return 0;
78905 +}
78906 +
78907 +/* unload journal header or footer and free jnode */
78908 +static void unload_journal_control_block(jnode ** node)
78909 +{
78910 + if (*node) {
78911 + unpin_jnode_data(*node);
78912 + reiser4_drop_io_head(*node);
78913 + *node = NULL;
78914 + }
78915 +}
78916 +
78917 +/* release journal control blocks */
78918 +void reiser4_done_journal_info(struct super_block *s)
78919 +{
78920 + reiser4_super_info_data *sbinfo = get_super_private(s);
78921 +
78922 + assert("zam-476", sbinfo != NULL);
78923 +
78924 + unload_journal_control_block(&sbinfo->journal_header);
78925 + unload_journal_control_block(&sbinfo->journal_footer);
78926 + rcu_barrier();
78927 +}
78928 +
78929 +/* load journal control blocks */
78930 +int reiser4_init_journal_info(struct super_block *s)
78931 +{
78932 + reiser4_super_info_data *sbinfo = get_super_private(s);
78933 + journal_location *loc;
78934 + int ret;
78935 +
78936 + loc = &sbinfo->jloc;
78937 +
78938 + assert("zam-651", loc != NULL);
78939 + assert("zam-652", loc->header != 0);
78940 + assert("zam-653", loc->footer != 0);
78941 +
78942 + ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
78943 +
78944 + if (ret)
78945 + return ret;
78946 +
78947 + ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
78948 +
78949 + if (ret) {
78950 + unload_journal_control_block(&sbinfo->journal_header);
78951 + }
78952 +
78953 + return ret;
78954 +}
78955 +
78956 +/* Make Linus happy.
78957 + Local variables:
78958 + c-indentation-style: "K&R"
78959 + mode-name: "LC"
78960 + c-basic-offset: 8
78961 + tab-width: 8
78962 + fill-column: 80
78963 + End:
78964 +*/
78965 diff -urN linux-2.6.20.orig/fs/reiser4/wander.h linux-2.6.20/fs/reiser4/wander.h
78966 --- linux-2.6.20.orig/fs/reiser4/wander.h 1970-01-01 03:00:00.000000000 +0300
78967 +++ linux-2.6.20/fs/reiser4/wander.h 2007-05-06 14:50:43.903039466 +0400
78968 @@ -0,0 +1,135 @@
78969 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
78970 +
78971 +#if !defined (__FS_REISER4_WANDER_H__)
78972 +#define __FS_REISER4_WANDER_H__
78973 +
78974 +#include "dformat.h"
78975 +
78976 +#include <linux/fs.h> /* for struct super_block */
78977 +
78978 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
78979 +
78980 +#define TX_HEADER_MAGIC "TxMagic4"
78981 +#define WANDER_RECORD_MAGIC "LogMagc4"
78982 +
78983 +#define TX_HEADER_MAGIC_SIZE (8)
78984 +#define WANDER_RECORD_MAGIC_SIZE (8)
78985 +
78986 +/* journal header block format */
78987 +struct journal_header {
78988 + /* last written transaction head location */
78989 + d64 last_committed_tx;
78990 +};
78991 +
78992 +typedef struct journal_location {
78993 + reiser4_block_nr footer;
78994 + reiser4_block_nr header;
78995 +} journal_location;
78996 +
78997 +/* The wander.c head comment describes usage and semantic of all these structures */
78998 +/* journal footer block format */
78999 +struct journal_footer {
79000 + /* last flushed transaction location. */
79001 + /* This block number is no more valid after the transaction it points
79002 + to gets flushed, this number is used only at journal replaying time
79003 + for detection of the end of on-disk list of committed transactions
79004 + which were not flushed completely */
79005 + d64 last_flushed_tx;
79006 +
79007 + /* free block counter is written in journal footer at transaction
79008 + flushing , not in super block because free blocks counter is logged
79009 + by another way than super block fields (root pointer, for
79010 + example). */
79011 + d64 free_blocks;
79012 +
79013 + /* number of used OIDs and maximal used OID are logged separately from
79014 + super block */
79015 + d64 nr_files;
79016 + d64 next_oid;
79017 +};
79018 +
79019 +/* Each wander record (except the first one) has unified format with wander
79020 + record header followed by an array of log entries */
79021 +struct wander_record_header {
79022 + /* when there is no predefined location for wander records, this magic
79023 + string should help reiser4fsck. */
79024 + char magic[WANDER_RECORD_MAGIC_SIZE];
79025 +
79026 + /* transaction id */
79027 + d64 id;
79028 +
79029 + /* total number of wander records in current transaction */
79030 + d32 total;
79031 +
79032 + /* this block number in transaction */
79033 + d32 serial;
79034 +
79035 + /* number of previous block in commit */
79036 + d64 next_block;
79037 +};
79038 +
79039 +/* The first wander record (transaction head) of written transaction has the
79040 + special format */
79041 +struct tx_header {
79042 + /* magic string makes first block in transaction different from other
79043 + logged blocks, it should help fsck. */
79044 + char magic[TX_HEADER_MAGIC_SIZE];
79045 +
79046 + /* transaction id */
79047 + d64 id;
79048 +
79049 + /* total number of records (including this first tx head) in the
79050 + transaction */
79051 + d32 total;
79052 +
79053 + /* align next field to 8-byte boundary; this field always is zero */
79054 + d32 padding;
79055 +
79056 + /* block number of previous transaction head */
79057 + d64 prev_tx;
79058 +
79059 + /* next wander record location */
79060 + d64 next_block;
79061 +
79062 + /* committed versions of free blocks counter */
79063 + d64 free_blocks;
79064 +
79065 + /* number of used OIDs (nr_files) and maximal used OID are logged
79066 + separately from super block */
79067 + d64 nr_files;
79068 + d64 next_oid;
79069 +};
79070 +
79071 +/* A transaction gets written to disk as a set of wander records (each wander
79072 + record size is fs block) */
79073 +
79074 +/* As it was told above a wander The rest of wander record is filled by these log entries, unused space filled
79075 + by zeroes */
79076 +struct wander_entry {
79077 + d64 original; /* block original location */
79078 + d64 wandered; /* block wandered location */
79079 +};
79080 +
79081 +/* REISER4 JOURNAL WRITER FUNCTIONS */
79082 +
79083 +extern int reiser4_write_logs(long *);
79084 +extern int reiser4_journal_replay(struct super_block *);
79085 +extern int reiser4_journal_recover_sb_data(struct super_block *);
79086 +
79087 +extern int reiser4_init_journal_info(struct super_block *);
79088 +extern void reiser4_done_journal_info(struct super_block *);
79089 +
79090 +extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
79091 +
79092 +#endif /* __FS_REISER4_WANDER_H__ */
79093 +
79094 +/* Make Linus happy.
79095 + Local variables:
79096 + c-indentation-style: "K&R"
79097 + mode-name: "LC"
79098 + c-basic-offset: 8
79099 + tab-width: 8
79100 + fill-column: 80
79101 + scroll-step: 1
79102 + End:
79103 +*/
79104 diff -urN linux-2.6.20.orig/fs/reiser4/writeout.h linux-2.6.20/fs/reiser4/writeout.h
79105 --- linux-2.6.20.orig/fs/reiser4/writeout.h 1970-01-01 03:00:00.000000000 +0300
79106 +++ linux-2.6.20/fs/reiser4/writeout.h 2007-05-06 14:50:43.907040716 +0400
79107 @@ -0,0 +1,21 @@
79108 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
79109 +
79110 +#if !defined (__FS_REISER4_WRITEOUT_H__)
79111 +
79112 +#define WRITEOUT_SINGLE_STREAM (0x1)
79113 +#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
79114 +#define WRITEOUT_BARRIER (0x4)
79115 +
79116 +extern int reiser4_get_writeout_flags(void);
79117 +
79118 +#endif /* __FS_REISER4_WRITEOUT_H__ */
79119 +
79120 +/* Make Linus happy.
79121 + Local variables:
79122 + c-indentation-style: "K&R"
79123 + mode-name: "LC"
79124 + c-basic-offset: 8
79125 + tab-width: 8
79126 + fill-column: 80
79127 + End:
79128 +*/
79129 diff -urN linux-2.6.20.orig/fs/reiser4/znode.c linux-2.6.20/fs/reiser4/znode.c
79130 --- linux-2.6.20.orig/fs/reiser4/znode.c 1970-01-01 03:00:00.000000000 +0300
79131 +++ linux-2.6.20/fs/reiser4/znode.c 2007-05-06 14:50:43.907040716 +0400
79132 @@ -0,0 +1,1029 @@
79133 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
79134 + * reiser4/README */
79135 +/* Znode manipulation functions. */
79136 +/* Znode is the in-memory header for a tree node. It is stored
79137 + separately from the node itself so that it does not get written to
79138 + disk. In this respect znode is like buffer head or page head. We
79139 + also use znodes for additional reiser4 specific purposes:
79140 +
79141 + . they are organized into tree structure which is a part of whole
79142 + reiser4 tree.
79143 + . they are used to implement node grained locking
79144 + . they are used to keep additional state associated with a
79145 + node
79146 + . they contain links to lists used by the transaction manager
79147 +
79148 + Znode is attached to some variable "block number" which is instance of
79149 + fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
79150 + appropriate node being actually loaded in memory. Existence of znode itself
79151 + is regulated by reference count (->x_count) in it. Each time thread
79152 + acquires reference to znode through call to zget(), ->x_count is
79153 + incremented and decremented on call to zput(). Data (content of node) are
79154 + brought in memory through call to zload(), which also increments ->d_count
79155 + reference counter. zload can block waiting on IO. Call to zrelse()
79156 + decreases this counter. Also, ->c_count keeps track of number of child
79157 + znodes and prevents parent znode from being recycled until all of its
79158 + children are. ->c_count is decremented whenever child goes out of existence
79159 + (being actually recycled in zdestroy()) which can be some time after last
79160 + reference to this child dies if we support some form of LRU cache for
79161 + znodes.
79162 +
79163 +*/
79164 +/* EVERY ZNODE'S STORY
79165 +
79166 + 1. His infancy.
79167 +
79168 + Once upon a time, the znode was born deep inside of zget() by call to
79169 + zalloc(). At the return from zget() znode had:
79170 +
79171 + . reference counter (x_count) of 1
79172 + . assigned block number, marked as used in bitmap
79173 + . pointer to parent znode. Root znode parent pointer points
79174 + to its father: "fake" znode. This, in turn, has NULL parent pointer.
79175 + . hash table linkage
79176 + . no data loaded from disk
79177 + . no node plugin
79178 + . no sibling linkage
79179 +
79180 + 2. His childhood
79181 +
79182 + Each node is either brought into memory as a result of tree traversal, or
79183 + created afresh, creation of the root being a special case of the latter. In
79184 + either case it's inserted into sibling list. This will typically require
79185 + some ancillary tree traversing, but ultimately both sibling pointers will
79186 + exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
79187 + zjnode.state.
79188 +
79189 + 3. His youth.
79190 +
79191 + If znode is bound to already existing node in a tree, its content is read
79192 + from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
79193 + in zjnode.state and zdata() function starts to return non null for this
79194 + znode. zload() further calls zparse() that determines which node layout
79195 + this node is rendered in, and sets ->nplug on success.
79196 +
79197 + If znode is for new node just created, memory for it is allocated and
79198 + zinit_new() function is called to initialise data, according to selected
79199 + node layout.
79200 +
79201 + 4. His maturity.
79202 +
79203 + After this point, znode lingers in memory for some time. Threads can
79204 + acquire references to znode either by blocknr through call to zget(), or by
79205 + following a pointer to unallocated znode from internal item. Each time
79206 + reference to znode is obtained, x_count is increased. Thread can read/write
79207 + lock znode. Znode data can be loaded through calls to zload(), d_count will
79208 + be increased appropriately. If all references to znode are released
79209 + (x_count drops to 0), znode is not recycled immediately. Rather, it is
79210 + still cached in the hash table in the hope that it will be accessed
79211 + shortly.
79212 +
79213 + There are two ways in which znode existence can be terminated:
79214 +
79215 + . sudden death: node bound to this znode is removed from the tree
79216 + . overpopulation: znode is purged out of memory due to memory pressure
79217 +
79218 + 5. His death.
79219 +
79220 + Death is complex process.
79221 +
79222 + When we irrevocably commit ourselves to decision to remove node from the
79223 + tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
79224 + znode. This is done either in ->kill_hook() of internal item or in
79225 + reiser4_kill_root() function when tree root is removed.
79226 +
79227 + At this moment znode still has:
79228 +
79229 + . locks held on it, necessary write ones
79230 + . references to it
79231 + . disk block assigned to it
79232 + . data loaded from the disk
79233 + . pending requests for lock
79234 +
79235 + But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node
79236 + deletion. Node deletion includes two phases. First all ways to get
79237 + references to that znode (sibling and parent links and hash lookup using
79238 + block number stored in parent node) should be deleted -- it is done through
79239 + sibling_list_remove(), also we assume that nobody uses down link from
79240 + parent node due to its nonexistence or proper parent node locking and
79241 + nobody uses parent pointers from children due to absence of them. Second we
79242 + invalidate all pending lock requests which still are on znode's lock
79243 + request queue, this is done by reiser4_invalidate_lock(). Another
79244 + JNODE_IS_DYING znode status bit is used to invalidate pending lock requests.
79245 + Once it set all requesters are forced to return -EINVAL from
79246 + longterm_lock_znode(). Future locking attempts are not possible because all
79247 + ways to get references to that znode are removed already. Last, node is
79248 + uncaptured from transaction.
79249 +
79250 + When last reference to the dying znode is just about to be released,
79251 + block number for this lock is released and znode is removed from the
79252 + hash table.
79253 +
79254 + Now znode can be recycled.
79255 +
79256 + [it's possible to free bitmap block and remove znode from the hash
79257 + table when last lock is released. This will result in having
79258 + referenced but completely orphaned znode]
79259 +
79260 + 6. Limbo
79261 +
79262 + As have been mentioned above znodes with reference counter 0 are
79263 + still cached in a hash table. Once memory pressure increases they are
79264 + purged out of there [this requires something like LRU list for
79265 + efficient implementation. LRU list would also greatly simplify
79266 + implementation of coord cache that would in this case morph to just
79267 + scanning some initial segment of LRU list]. Data loaded into
79268 + unreferenced znode are flushed back to the durable storage if
79269 + necessary and memory is freed. Znodes themselves can be recycled at
79270 + this point too.
79271 +
79272 +*/
79273 +
79274 +#include "debug.h"
79275 +#include "dformat.h"
79276 +#include "key.h"
79277 +#include "coord.h"
79278 +#include "plugin/plugin_header.h"
79279 +#include "plugin/node/node.h"
79280 +#include "plugin/plugin.h"
79281 +#include "txnmgr.h"
79282 +#include "jnode.h"
79283 +#include "znode.h"
79284 +#include "block_alloc.h"
79285 +#include "tree.h"
79286 +#include "tree_walk.h"
79287 +#include "super.h"
79288 +#include "reiser4.h"
79289 +
79290 +#include <linux/pagemap.h>
79291 +#include <linux/spinlock.h>
79292 +#include <linux/slab.h>
79293 +#include <linux/err.h>
79294 +
79295 +static z_hash_table *get_htable(reiser4_tree *,
79296 + const reiser4_block_nr * const blocknr);
79297 +static z_hash_table *znode_get_htable(const znode *);
79298 +static void zdrop(znode *);
79299 +
79300 +/* hash table support */
79301 +
79302 +/* compare two block numbers for equality. Used by hash-table macros */
79303 +static inline int
79304 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
79305 +{
79306 + assert("nikita-534", b1 != NULL);
79307 + assert("nikita-535", b2 != NULL);
79308 +
79309 + return *b1 == *b2;
79310 +}
79311 +
79312 +/* Hash znode by block number. Used by hash-table macros */
79313 +/* Audited by: umka (2002.06.11) */
79314 +static inline __u32
79315 +blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
79316 +{
79317 + assert("nikita-536", b != NULL);
79318 +
79319 + return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
79320 +}
79321 +
79322 +/* The hash table definition */
79323 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
79324 +#define KFREE(ptr, size) kfree(ptr)
79325 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
79326 + blknrhashfn, blknreq);
79327 +#undef KFREE
79328 +#undef KMALLOC
79329 +
79330 +/* slab for znodes */
79331 +static struct kmem_cache *znode_cache;
79332 +
79333 +int znode_shift_order;
79334 +
79335 +/**
79336 + * init_znodes - create znode cache
79337 + *
79338 + * Initializes slab cache of znodes. It is part of reiser4 module initialization.
79339 + */
79340 +int init_znodes(void)
79341 +{
79342 + znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
79343 + SLAB_HWCACHE_ALIGN |
79344 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
79345 + if (znode_cache == NULL)
79346 + return RETERR(-ENOMEM);
79347 +
79348 + for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
79349 + ++znode_shift_order);
79350 + --znode_shift_order;
79351 + return 0;
79352 +}
79353 +
79354 +/**
79355 + * done_znodes - delete znode cache
79356 + *
79357 + * This is called on reiser4 module unloading or system shutdown.
79358 + */
79359 +void done_znodes(void)
79360 +{
79361 + destroy_reiser4_cache(&znode_cache);
79362 +}
79363 +
79364 +/* call this to initialise tree of znodes */
79365 +int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
79366 +{
79367 + int result;
79368 + assert("umka-050", tree != NULL);
79369 +
79370 + rwlock_init(&tree->dk_lock);
79371 +
79372 + result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79373 + if (result != 0)
79374 + return result;
79375 + result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79376 + return result;
79377 +}
79378 +
79379 +/* free this znode */
79380 +void zfree(znode * node /* znode to free */ )
79381 +{
79382 + assert("nikita-465", node != NULL);
79383 + assert("nikita-2120", znode_page(node) == NULL);
79384 + assert("nikita-2301", list_empty_careful(&node->lock.owners));
79385 + assert("nikita-2302", list_empty_careful(&node->lock.requestors));
79386 + assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
79387 + NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
79388 + assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
79389 + assert("nikita-3293", !znode_is_right_connected(node));
79390 + assert("nikita-3294", !znode_is_left_connected(node));
79391 + assert("nikita-3295", node->left == NULL);
79392 + assert("nikita-3296", node->right == NULL);
79393 +
79394 + /* not yet phash_jnode_destroy(ZJNODE(node)); */
79395 +
79396 + kmem_cache_free(znode_cache, node);
79397 +}
79398 +
79399 +/* call this to free tree of znodes */
79400 +void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
79401 +{
79402 + znode *node;
79403 + znode *next;
79404 + z_hash_table *ztable;
79405 +
79406 + /* scan znode hash-tables and kill all znodes, then free hash tables
79407 + * themselves. */
79408 +
79409 + assert("nikita-795", tree != NULL);
79410 +
79411 + ztable = &tree->zhash_table;
79412 +
79413 + if (ztable->_table != NULL) {
79414 + for_all_in_htable(ztable, z, node, next) {
79415 + node->c_count = 0;
79416 + node->in_parent.node = NULL;
79417 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79418 + zdrop(node);
79419 + }
79420 +
79421 + z_hash_done(&tree->zhash_table);
79422 + }
79423 +
79424 + ztable = &tree->zfake_table;
79425 +
79426 + if (ztable->_table != NULL) {
79427 + for_all_in_htable(ztable, z, node, next) {
79428 + node->c_count = 0;
79429 + node->in_parent.node = NULL;
79430 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79431 + zdrop(node);
79432 + }
79433 +
79434 + z_hash_done(&tree->zfake_table);
79435 + }
79436 +}
79437 +
79438 +/* ZNODE STRUCTURES */
79439 +
79440 +/* allocate fresh znode */
79441 +znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
79442 +{
79443 + znode *node;
79444 +
79445 + node = kmem_cache_alloc(znode_cache, gfp_flag);
79446 + return node;
79447 +}
79448 +
79449 +/* Initialize fields of znode
79450 + @node: znode to initialize;
79451 + @parent: parent znode;
79452 + @tree: tree we are in. */
79453 +void zinit(znode * node, const znode * parent, reiser4_tree * tree)
79454 +{
79455 + assert("nikita-466", node != NULL);
79456 + assert("umka-268", current_tree != NULL);
79457 +
79458 + memset(node, 0, sizeof *node);
79459 +
79460 + assert("umka-051", tree != NULL);
79461 +
79462 + jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
79463 + reiser4_init_lock(&node->lock);
79464 + init_parent_coord(&node->in_parent, parent);
79465 +}
79466 +
79467 +/*
79468 + * remove znode from indices. This is called jput() when last reference on
79469 + * znode is released.
79470 + */
79471 +void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
79472 +{
79473 + assert("nikita-2108", node != NULL);
79474 + assert("nikita-470", node->c_count == 0);
79475 + assert_rw_write_locked(&(tree->tree_lock));
79476 +
79477 + /* remove reference to this znode from cbk cache */
79478 + cbk_cache_invalidate(node, tree);
79479 +
79480 + /* update c_count of parent */
79481 + if (znode_parent(node) != NULL) {
79482 + assert("nikita-472", znode_parent(node)->c_count > 0);
79483 + /* father, onto your hands I forward my spirit... */
79484 + znode_parent(node)->c_count--;
79485 + node->in_parent.node = NULL;
79486 + } else {
79487 + /* orphaned znode?! Root? */
79488 + }
79489 +
79490 + /* remove znode from hash-table */
79491 + z_hash_remove_rcu(znode_get_htable(node), node);
79492 +}
79493 +
79494 +/* zdrop() -- Remove znode from the tree.
79495 +
79496 + This is called when znode is removed from the memory. */
79497 +static void zdrop(znode * node /* znode to finish with */ )
79498 +{
79499 + jdrop(ZJNODE(node));
79500 +}
79501 +
79502 +/*
79503 + * put znode into right place in the hash table. This is called by relocate
79504 + * code.
79505 + */
79506 +int znode_rehash(znode * node /* node to rehash */ ,
79507 + const reiser4_block_nr * new_block_nr /* new block number */ )
79508 +{
79509 + z_hash_table *oldtable;
79510 + z_hash_table *newtable;
79511 + reiser4_tree *tree;
79512 +
79513 + assert("nikita-2018", node != NULL);
79514 +
79515 + tree = znode_get_tree(node);
79516 + oldtable = znode_get_htable(node);
79517 + newtable = get_htable(tree, new_block_nr);
79518 +
79519 + write_lock_tree(tree);
79520 + /* remove znode from hash-table */
79521 + z_hash_remove_rcu(oldtable, node);
79522 +
79523 + /* assertion no longer valid due to RCU */
79524 + /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
79525 +
79526 + /* update blocknr */
79527 + znode_set_block(node, new_block_nr);
79528 + node->zjnode.key.z = *new_block_nr;
79529 +
79530 + /* insert it into hash */
79531 + z_hash_insert_rcu(newtable, node);
79532 + write_unlock_tree(tree);
79533 + return 0;
79534 +}
79535 +
79536 +/* ZNODE LOOKUP, GET, PUT */
79537 +
79538 +/* zlook() - get znode with given block_nr in a hash table or return NULL
79539 +
79540 + If result is non-NULL then the znode's x_count is incremented. Internal version
79541 + accepts pre-computed hash index. The hash table is accessed under caller's
79542 + tree->hash_lock.
79543 +*/
79544 +znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
79545 +{
79546 + znode *result;
79547 + __u32 hash;
79548 + z_hash_table *htable;
79549 +
79550 + assert("jmacd-506", tree != NULL);
79551 + assert("jmacd-507", blocknr != NULL);
79552 +
79553 + htable = get_htable(tree, blocknr);
79554 + hash = blknrhashfn(htable, blocknr);
79555 +
79556 + rcu_read_lock();
79557 + result = z_hash_find_index(htable, hash, blocknr);
79558 +
79559 + if (result != NULL) {
79560 + add_x_ref(ZJNODE(result));
79561 + result = znode_rip_check(tree, result);
79562 + }
79563 + rcu_read_unlock();
79564 +
79565 + return result;
79566 +}
79567 +
79568 +/* return hash table where znode with block @blocknr is (or should be)
79569 + * stored */
79570 +static z_hash_table *get_htable(reiser4_tree * tree,
79571 + const reiser4_block_nr * const blocknr)
79572 +{
79573 + z_hash_table *table;
79574 + if (is_disk_addr_unallocated(blocknr))
79575 + table = &tree->zfake_table;
79576 + else
79577 + table = &tree->zhash_table;
79578 + return table;
79579 +}
79580 +
79581 +/* return hash table where znode @node is (or should be) stored */
79582 +static z_hash_table *znode_get_htable(const znode * node)
79583 +{
79584 + return get_htable(znode_get_tree(node), znode_get_block(node));
79585 +}
79586 +
79587 +/* zget() - get znode from hash table, allocating it if necessary.
79588 +
79589 + First a call to zlook, locating a x-referenced znode if one
79590 + exists. If znode is not found, allocate new one and return. Result
79591 + is returned with x_count reference increased.
79592 +
79593 + LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
79594 + LOCK ORDERING: NONE
79595 +*/
79596 +znode *zget(reiser4_tree * tree,
79597 + const reiser4_block_nr * const blocknr,
79598 + znode * parent, tree_level level, gfp_t gfp_flag)
79599 +{
79600 + znode *result;
79601 + __u32 hashi;
79602 +
79603 + z_hash_table *zth;
79604 +
79605 + assert("jmacd-512", tree != NULL);
79606 + assert("jmacd-513", blocknr != NULL);
79607 + assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
79608 +
79609 + zth = get_htable(tree, blocknr);
79610 + hashi = blknrhashfn(zth, blocknr);
79611 +
79612 + /* NOTE-NIKITA address-as-unallocated-blocknr still is not
79613 + implemented. */
79614 +
79615 + z_hash_prefetch_bucket(zth, hashi);
79616 +
79617 + rcu_read_lock();
79618 + /* Find a matching BLOCKNR in the hash table. If the znode is found,
79619 + we obtain an reference (x_count) but the znode remains unlocked.
79620 + Have to worry about race conditions later. */
79621 + result = z_hash_find_index(zth, hashi, blocknr);
79622 + /* According to the current design, the hash table lock protects new
79623 + znode references. */
79624 + if (result != NULL) {
79625 + add_x_ref(ZJNODE(result));
79626 + /* NOTE-NIKITA it should be so, but special case during
79627 + creation of new root makes such assertion highly
79628 + complicated. */
79629 + assert("nikita-2131", 1 || znode_parent(result) == parent ||
79630 + (ZF_ISSET(result, JNODE_ORPHAN)
79631 + && (znode_parent(result) == NULL)));
79632 + result = znode_rip_check(tree, result);
79633 + }
79634 +
79635 + rcu_read_unlock();
79636 +
79637 + if (!result) {
79638 + znode *shadow;
79639 +
79640 + result = zalloc(gfp_flag);
79641 + if (!result) {
79642 + return ERR_PTR(RETERR(-ENOMEM));
79643 + }
79644 +
79645 + zinit(result, parent, tree);
79646 + ZJNODE(result)->blocknr = *blocknr;
79647 + ZJNODE(result)->key.z = *blocknr;
79648 + result->level = level;
79649 +
79650 + write_lock_tree(tree);
79651 +
79652 + shadow = z_hash_find_index(zth, hashi, blocknr);
79653 + if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
79654 + jnode_list_remove(ZJNODE(result));
79655 + zfree(result);
79656 + result = shadow;
79657 + } else {
79658 + result->version = znode_build_version(tree);
79659 + z_hash_insert_index_rcu(zth, hashi, result);
79660 +
79661 + if (parent != NULL)
79662 + ++parent->c_count;
79663 + }
79664 +
79665 + add_x_ref(ZJNODE(result));
79666 +
79667 + write_unlock_tree(tree);
79668 + }
79669 +#if REISER4_DEBUG
79670 + if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
79671 + reiser4_check_block(blocknr, 1);
79672 +#endif
79673 + /* Check for invalid tree level, return -EIO */
79674 + if (unlikely(znode_get_level(result) != level)) {
79675 + warning("jmacd-504",
79676 + "Wrong level for cached block %llu: %i expecting %i",
79677 + (unsigned long long)(*blocknr), znode_get_level(result),
79678 + level);
79679 + zput(result);
79680 + return ERR_PTR(RETERR(-EIO));
79681 + }
79682 +
79683 + assert("nikita-1227", znode_invariant(result));
79684 +
79685 + return result;
79686 +}
79687 +
79688 +/* ZNODE PLUGINS/DATA */
79689 +
79690 +/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
79691 + stored at the fixed offset from the beginning of the node. */
79692 +static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
79693 + * plugin of */ )
79694 +{
79695 + reiser4_tree *tree;
79696 +
79697 + assert("nikita-1053", node != NULL);
79698 + assert("nikita-1055", zdata(node) != NULL);
79699 +
79700 + tree = znode_get_tree(node);
79701 + assert("umka-053", tree != NULL);
79702 +
79703 + if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
79704 + return tree->nplug;
79705 + } else {
79706 + return node_plugin_by_disk_id
79707 + (tree, &((common_node_header *) zdata(node))->plugin_id);
79708 +#ifdef GUESS_EXISTS
79709 + reiser4_plugin *plugin;
79710 +
79711 + /* NOTE-NIKITA add locking here when dynamic plugins will be
79712 + * implemented */
79713 + for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
79714 + if ((plugin->u.node.guess != NULL)
79715 + && plugin->u.node.guess(node))
79716 + return plugin;
79717 + }
79718 + warning("nikita-1057", "Cannot guess node plugin");
79719 + print_znode("node", node);
79720 + return NULL;
79721 +#endif
79722 + }
79723 +}
79724 +
79725 +/* parse node header and install ->node_plugin */
79726 +int zparse(znode * node /* znode to parse */ )
79727 +{
79728 + int result;
79729 +
79730 + assert("nikita-1233", node != NULL);
79731 + assert("nikita-2370", zdata(node) != NULL);
79732 +
79733 + if (node->nplug == NULL) {
79734 + node_plugin *nplug;
79735 +
79736 + nplug = znode_guess_plugin(node);
79737 + if (likely(nplug != NULL)) {
79738 + result = nplug->parse(node);
79739 + if (likely(result == 0))
79740 + node->nplug = nplug;
79741 + } else {
79742 + result = RETERR(-EIO);
79743 + }
79744 + } else
79745 + result = 0;
79746 + return result;
79747 +}
79748 +
79749 +/* zload with readahead */
79750 +int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
79751 +{
79752 + int result;
79753 +
79754 + assert("nikita-484", node != NULL);
79755 + assert("nikita-1377", znode_invariant(node));
79756 + assert("jmacd-7771", !znode_above_root(node));
79757 + assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
79758 + assert("nikita-3016", reiser4_schedulable());
79759 +
79760 + if (info)
79761 + formatted_readahead(node, info);
79762 +
79763 + result = jload(ZJNODE(node));
79764 + assert("nikita-1378", znode_invariant(node));
79765 + return result;
79766 +}
79767 +
79768 +/* load content of node into memory */
79769 +int zload(znode * node)
79770 +{
79771 + return zload_ra(node, NULL);
79772 +}
79773 +
79774 +/* call node plugin to initialise newly allocated node. */
79775 +int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
79776 +{
79777 + return jinit_new(ZJNODE(node), gfp_flags);
79778 +}
79779 +
79780 +/* drop reference to node data. When last reference is dropped, data are
79781 + unloaded. */
79782 +void zrelse(znode * node /* znode to release references to */ )
79783 +{
79784 + assert("nikita-1381", znode_invariant(node));
79785 +
79786 + jrelse(ZJNODE(node));
79787 +}
79788 +
79789 +/* returns free space in node */
79790 +unsigned znode_free_space(znode * node /* znode to query */ )
79791 +{
79792 + assert("nikita-852", node != NULL);
79793 + return node_plugin_by_node(node)->free_space(node);
79794 +}
79795 +
79796 +/* left delimiting key of znode */
79797 +reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
79798 +{
79799 + assert("nikita-958", node != NULL);
79800 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79801 + assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
79802 + assert("nikita-30671", node->rd_key_version != 0);
79803 + return &node->rd_key;
79804 +}
79805 +
79806 +/* right delimiting key of znode */
79807 +reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
79808 +{
79809 + assert("nikita-974", node != NULL);
79810 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79811 + assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
79812 + assert("nikita-30681", node->ld_key_version != 0);
79813 + return &node->ld_key;
79814 +}
79815 +
79816 +ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
79817 + )
79818 +
79819 +/* update right-delimiting key of @node */
79820 +reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
79821 +{
79822 + assert("nikita-2937", node != NULL);
79823 + assert("nikita-2939", key != NULL);
79824 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79825 + assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
79826 + assert("nikita-2944",
79827 + znode_is_any_locked(node) ||
79828 + znode_get_level(node) != LEAF_LEVEL ||
79829 + keyge(key, &node->rd_key) ||
79830 + keyeq(&node->rd_key, reiser4_min_key()) ||
79831 + ZF_ISSET(node, JNODE_HEARD_BANSHEE));
79832 +
79833 + node->rd_key = *key;
79834 + ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
79835 + return &node->rd_key;
79836 +}
79837 +
79838 +/* update left-delimiting key of @node */
79839 +reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
79840 +{
79841 + assert("nikita-2940", node != NULL);
79842 + assert("nikita-2941", key != NULL);
79843 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79844 + assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
79845 + assert("nikita-2943",
79846 + znode_is_any_locked(node) || keyeq(&node->ld_key,
79847 + reiser4_min_key()));
79848 +
79849 + node->ld_key = *key;
79850 + ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
79851 + return &node->ld_key;
79852 +}
79853 +
79854 +/* true if @key is inside key range for @node */
79855 +int znode_contains_key(znode * node /* znode to look in */ ,
79856 + const reiser4_key * key /* key to look for */ )
79857 +{
79858 + assert("nikita-1237", node != NULL);
79859 + assert("nikita-1238", key != NULL);
79860 +
79861 + /* left_delimiting_key <= key <= right_delimiting_key */
79862 + return keyle(znode_get_ld_key(node), key)
79863 + && keyle(key, znode_get_rd_key(node));
79864 +}
79865 +
79866 +/* same as znode_contains_key(), but lock dk lock */
79867 +int znode_contains_key_lock(znode * node /* znode to look in */ ,
79868 + const reiser4_key * key /* key to look for */ )
79869 +{
79870 + int result;
79871 +
79872 + assert("umka-056", node != NULL);
79873 + assert("umka-057", key != NULL);
79874 +
79875 + read_lock_dk(znode_get_tree(node));
79876 + result = znode_contains_key(node, key);
79877 + read_unlock_dk(znode_get_tree(node));
79878 + return result;
79879 +}
79880 +
79881 +/* get parent pointer, assuming tree is not locked */
79882 +znode *znode_parent_nolock(const znode * node /* child znode */ )
79883 +{
79884 + assert("nikita-1444", node != NULL);
79885 + return node->in_parent.node;
79886 +}
79887 +
79888 +/* get parent pointer of znode */
79889 +znode *znode_parent(const znode * node /* child znode */ )
79890 +{
79891 + assert("nikita-1226", node != NULL);
79892 + assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
79893 + return znode_parent_nolock(node);
79894 +}
79895 +
79896 +/* detect uber znode used to protect in-superblock tree root pointer */
79897 +int znode_above_root(const znode * node /* znode to query */ )
79898 +{
79899 + assert("umka-059", node != NULL);
79900 +
79901 + return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
79902 +}
79903 +
79904 +/* check that @node is root---that its block number is recorder in the tree as
79905 + that of root node */
79906 +#if REISER4_DEBUG
79907 +static int znode_is_true_root(const znode * node /* znode to query */ )
79908 +{
79909 + assert("umka-060", node != NULL);
79910 + assert("umka-061", current_tree != NULL);
79911 +
79912 + return disk_addr_eq(znode_get_block(node),
79913 + &znode_get_tree(node)->root_block);
79914 +}
79915 +#endif
79916 +
79917 +/* check that @node is root */
79918 +int znode_is_root(const znode * node /* znode to query */ )
79919 +{
79920 + assert("nikita-1206", node != NULL);
79921 +
79922 + return znode_get_level(node) == znode_get_tree(node)->height;
79923 +}
79924 +
79925 +/* Returns true is @node was just created by zget() and wasn't ever loaded
79926 + into memory. */
79927 +/* NIKITA-HANS: yes */
79928 +int znode_just_created(const znode * node)
79929 +{
79930 + assert("nikita-2188", node != NULL);
79931 + return (znode_page(node) == NULL);
79932 +}
79933 +
79934 +/* obtain updated ->znode_epoch. See seal.c for description. */
79935 +__u64 znode_build_version(reiser4_tree * tree)
79936 +{
79937 + __u64 result;
79938 +
79939 + spin_lock(&tree->epoch_lock);
79940 + result = ++tree->znode_epoch;
79941 + spin_unlock(&tree->epoch_lock);
79942 + return result;
79943 +}
79944 +
79945 +void init_load_count(load_count * dh)
79946 +{
79947 + assert("nikita-2105", dh != NULL);
79948 + memset(dh, 0, sizeof *dh);
79949 +}
79950 +
79951 +void done_load_count(load_count * dh)
79952 +{
79953 + assert("nikita-2106", dh != NULL);
79954 + if (dh->node != NULL) {
79955 + for (; dh->d_ref > 0; --dh->d_ref)
79956 + zrelse(dh->node);
79957 + dh->node = NULL;
79958 + }
79959 +}
79960 +
79961 +static int incr_load_count(load_count * dh)
79962 +{
79963 + int result;
79964 +
79965 + assert("nikita-2110", dh != NULL);
79966 + assert("nikita-2111", dh->node != NULL);
79967 +
79968 + result = zload(dh->node);
79969 + if (result == 0)
79970 + ++dh->d_ref;
79971 + return result;
79972 +}
79973 +
79974 +int incr_load_count_znode(load_count * dh, znode * node)
79975 +{
79976 + assert("nikita-2107", dh != NULL);
79977 + assert("nikita-2158", node != NULL);
79978 + assert("nikita-2109",
79979 + ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
79980 +
79981 + dh->node = node;
79982 + return incr_load_count(dh);
79983 +}
79984 +
79985 +int incr_load_count_jnode(load_count * dh, jnode * node)
79986 +{
79987 + if (jnode_is_znode(node)) {
79988 + return incr_load_count_znode(dh, JZNODE(node));
79989 + }
79990 + return 0;
79991 +}
79992 +
79993 +void copy_load_count(load_count * new, load_count * old)
79994 +{
79995 + int ret = 0;
79996 + done_load_count(new);
79997 + new->node = old->node;
79998 + new->d_ref = 0;
79999 +
80000 + while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
80001 + }
80002 +
80003 + assert("jmacd-87589", ret == 0);
80004 +}
80005 +
80006 +void move_load_count(load_count * new, load_count * old)
80007 +{
80008 + done_load_count(new);
80009 + new->node = old->node;
80010 + new->d_ref = old->d_ref;
80011 + old->node = NULL;
80012 + old->d_ref = 0;
80013 +}
80014 +
80015 +/* convert parent pointer into coord */
80016 +void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
80017 +{
80018 + assert("nikita-3204", pcoord != NULL);
80019 + assert("nikita-3205", coord != NULL);
80020 +
80021 + coord_init_first_unit_nocheck(coord, pcoord->node);
80022 + coord_set_item_pos(coord, pcoord->item_pos);
80023 + coord->between = AT_UNIT;
80024 +}
80025 +
80026 +/* pack coord into parent_coord_t */
80027 +void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
80028 +{
80029 + assert("nikita-3206", pcoord != NULL);
80030 + assert("nikita-3207", coord != NULL);
80031 +
80032 + pcoord->node = coord->node;
80033 + pcoord->item_pos = coord->item_pos;
80034 +}
80035 +
80036 +/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
80037 + look for comments there) */
80038 +void init_parent_coord(parent_coord_t * pcoord, const znode * node)
80039 +{
80040 + pcoord->node = (znode *) node;
80041 + pcoord->item_pos = (unsigned short)~0;
80042 +}
80043 +
80044 +#if REISER4_DEBUG
80045 +
80046 +/* debugging aid: znode invariant */
80047 +static int znode_invariant_f(const znode * node /* znode to check */ ,
80048 + char const **msg /* where to store error
80049 + * message, if any */ )
80050 +{
80051 +#define _ergo(ant, con) \
80052 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
80053 +
80054 +#define _equi(e1, e2) \
80055 + ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
80056 +
80057 +#define _check(exp) ((*msg) = #exp, (exp))
80058 +
80059 + return jnode_invariant_f(ZJNODE(node), msg) &&
80060 + /* [znode-fake] invariant */
80061 + /* fake znode doesn't have a parent, and */
80062 + _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
80063 + /* there is another way to express this very check, and */
80064 + _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
80065 + /* it has special block number, and */
80066 + _ergo(znode_get_level(node) == 0,
80067 + disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80068 + /* it is the only znode with such block number, and */
80069 + _ergo(!znode_above_root(node) && znode_is_loaded(node),
80070 + !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80071 + /* it is parent of the tree root node */
80072 + _ergo(znode_is_true_root(node),
80073 + znode_above_root(znode_parent(node))) &&
80074 + /* [znode-level] invariant */
80075 + /* level of parent znode is one larger than that of child,
80076 + except for the fake znode, and */
80077 + _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
80078 + znode_get_level(znode_parent(node)) ==
80079 + znode_get_level(node) + 1) &&
80080 + /* left neighbor is at the same level, and */
80081 + _ergo(znode_is_left_connected(node) && node->left != NULL,
80082 + znode_get_level(node) == znode_get_level(node->left)) &&
80083 + /* right neighbor is at the same level */
80084 + _ergo(znode_is_right_connected(node) && node->right != NULL,
80085 + znode_get_level(node) == znode_get_level(node->right)) &&
80086 + /* [znode-connected] invariant */
80087 + _ergo(node->left != NULL, znode_is_left_connected(node)) &&
80088 + _ergo(node->right != NULL, znode_is_right_connected(node)) &&
80089 + _ergo(!znode_is_root(node) && node->left != NULL,
80090 + znode_is_right_connected(node->left) &&
80091 + node->left->right == node) &&
80092 + _ergo(!znode_is_root(node) && node->right != NULL,
80093 + znode_is_left_connected(node->right) &&
80094 + node->right->left == node) &&
80095 + /* [znode-c_count] invariant */
80096 + /* for any znode, c_count of its parent is greater than 0 */
80097 + _ergo(znode_parent(node) != NULL &&
80098 + !znode_above_root(znode_parent(node)),
80099 + znode_parent(node)->c_count > 0) &&
80100 + /* leaves don't have children */
80101 + _ergo(znode_get_level(node) == LEAF_LEVEL,
80102 + node->c_count == 0) &&
80103 + _check(node->zjnode.jnodes.prev != NULL) &&
80104 + _check(node->zjnode.jnodes.next != NULL) &&
80105 + /* orphan doesn't have a parent */
80106 + _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
80107 + /* [znode-modify] invariant */
80108 + /* if znode is not write-locked, its checksum remains
80109 + * invariant */
80110 + /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
80111 + * cannot check this. */
80112 + /* [znode-refs] invariant */
80113 + /* only referenced znode can be long-term locked */
80114 + _ergo(znode_is_locked(node),
80115 + atomic_read(&ZJNODE(node)->x_count) != 0);
80116 +}
80117 +
80118 +/* debugging aid: check znode invariant and panic if it doesn't hold */
80119 +int znode_invariant(znode * node /* znode to check */ )
80120 +{
80121 + char const *failed_msg;
80122 + int result;
80123 +
80124 + assert("umka-063", node != NULL);
80125 + assert("umka-064", current_tree != NULL);
80126 +
80127 + spin_lock_znode(node);
80128 + read_lock_tree(znode_get_tree(node));
80129 + result = znode_invariant_f(node, &failed_msg);
80130 + if (!result) {
80131 + /* print_znode("corrupted node", node); */
80132 + warning("jmacd-555", "Condition %s failed", failed_msg);
80133 + }
80134 + read_unlock_tree(znode_get_tree(node));
80135 + spin_unlock_znode(node);
80136 + return result;
80137 +}
80138 +
80139 +/* return non-0 iff data are loaded into znode */
80140 +int znode_is_loaded(const znode * node /* znode to query */ )
80141 +{
80142 + assert("nikita-497", node != NULL);
80143 + return jnode_is_loaded(ZJNODE(node));
80144 +}
80145 +
80146 +unsigned long znode_times_locked(const znode * z)
80147 +{
80148 + return z->times_locked;
80149 +}
80150 +
80151 +#endif /* REISER4_DEBUG */
80152 +
80153 +/* Make Linus happy.
80154 + Local variables:
80155 + c-indentation-style: "K&R"
80156 + mode-name: "LC"
80157 + c-basic-offset: 8
80158 + tab-width: 8
80159 + fill-column: 120
80160 + End:
80161 +*/
80162 diff -urN linux-2.6.20.orig/fs/reiser4/znode.h linux-2.6.20/fs/reiser4/znode.h
80163 --- linux-2.6.20.orig/fs/reiser4/znode.h 1970-01-01 03:00:00.000000000 +0300
80164 +++ linux-2.6.20/fs/reiser4/znode.h 2007-05-06 14:50:43.907040716 +0400
80165 @@ -0,0 +1,434 @@
80166 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
80167 + * reiser4/README */
80168 +
80169 +/* Declaration of znode (Zam's node). See znode.c for more details. */
80170 +
80171 +#ifndef __ZNODE_H__
80172 +#define __ZNODE_H__
80173 +
80174 +#include "forward.h"
80175 +#include "debug.h"
80176 +#include "dformat.h"
80177 +#include "key.h"
80178 +#include "coord.h"
80179 +#include "plugin/node/node.h"
80180 +#include "jnode.h"
80181 +#include "lock.h"
80182 +#include "readahead.h"
80183 +
80184 +#include <linux/types.h>
80185 +#include <linux/spinlock.h>
80186 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
80187 +#include <asm/atomic.h>
80188 +#include <asm/semaphore.h>
80189 +
80190 +/* znode tracks its position within parent (internal item in a parent node,
80191 + * that contains znode's block number). */
80192 +typedef struct parent_coord {
80193 + znode *node;
80194 + pos_in_node_t item_pos;
80195 +} parent_coord_t;
80196 +
80197 +/* &znode - node in a reiser4 tree.
80198 +
80199 + NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
80200 + cacheline pressure.
80201 +
80202 + Locking:
80203 +
80204 + Long term: data in a disk node attached to this znode are protected
80205 + by long term, deadlock aware lock ->lock;
80206 +
80207 + Spin lock: the following fields are protected by the spin lock:
80208 +
80209 + ->lock
80210 +
80211 + Following fields are protected by the global tree lock:
80212 +
80213 + ->left
80214 + ->right
80215 + ->in_parent
80216 + ->c_count
80217 +
80218 + Following fields are protected by the global delimiting key lock (dk_lock):
80219 +
80220 + ->ld_key (to update ->ld_key long-term lock on the node is also required)
80221 + ->rd_key
80222 +
80223 + Following fields are protected by the long term lock:
80224 +
80225 + ->nr_items
80226 +
80227 + ->node_plugin is never changed once set. This means that after code made
80228 + itself sure that field is valid it can be accessed without any additional
80229 + locking.
80230 +
80231 + ->level is immutable.
80232 +
80233 + Invariants involving this data-type:
80234 +
80235 + [znode-fake]
80236 + [znode-level]
80237 + [znode-connected]
80238 + [znode-c_count]
80239 + [znode-refs]
80240 + [jnode-refs]
80241 + [jnode-queued]
80242 + [znode-modify]
80243 +
80244 + For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
80245 + Suggestions for how to do that are desired.*/
80246 +struct znode {
80247 + /* Embedded jnode. */
80248 + jnode zjnode;
80249 +
80250 + /* contains three subfields, node, pos_in_node, and pos_in_unit.
80251 +
80252 + pos_in_node and pos_in_unit are only hints that are cached to
80253 + speed up lookups during balancing. They are not required to be up to
80254 + date. Synched in find_child_ptr().
80255 +
80256 + This value allows us to avoid expensive binary searches.
80257 +
80258 + in_parent->node points to the parent of this node, and is NOT a
80259 + hint.
80260 + */
80261 + parent_coord_t in_parent;
80262 +
80263 + /*
80264 + * sibling list pointers
80265 + */
80266 +
80267 + /* left-neighbor */
80268 + znode *left;
80269 + /* right-neighbor */
80270 + znode *right;
80271 +
80272 + /* long term lock on node content. This lock supports deadlock
80273 + detection. See lock.c
80274 + */
80275 + zlock lock;
80276 +
80277 + /* You cannot remove from memory a node that has children in
80278 + memory. This is because we rely on the fact that parent of given
80279 + node can always be reached without blocking for io. When reading a
80280 + node into memory you must increase the c_count of its parent, when
80281 + removing it from memory you must decrease the c_count. This makes
80282 + the code simpler, and the cases where it is suboptimal are truly
80283 + obscure.
80284 + */
80285 + int c_count;
80286 +
80287 + /* plugin of node attached to this znode. NULL if znode is not
80288 + loaded. */
80289 + node_plugin *nplug;
80290 +
80291 + /* version of znode data. This is increased on each modification. This
80292 + * is necessary to implement seals (see seal.[ch]) efficiently. */
80293 + __u64 version;
80294 +
80295 + /* left delimiting key. Necessary to efficiently perform
80296 + balancing with node-level locking. Kept in memory only. */
80297 + reiser4_key ld_key;
80298 + /* right delimiting key. */
80299 + reiser4_key rd_key;
80300 +
80301 + /* znode's tree level */
80302 + __u16 level;
80303 + /* number of items in this node. This field is modified by node
80304 + * plugin. */
80305 + __u16 nr_items;
80306 +
80307 +#if REISER4_DEBUG
80308 + void *creator;
80309 + reiser4_key first_key;
80310 + unsigned long times_locked;
80311 + int left_version; /* when node->left was updated */
80312 + int right_version; /* when node->right was updated */
80313 + int ld_key_version; /* when node->ld_key was updated */
80314 + int rd_key_version; /* when node->rd_key was updated */
80315 +#endif
80316 +
80317 +} __attribute__ ((aligned(16)));
80318 +
80319 +ON_DEBUG(extern atomic_t delim_key_version;
80320 + )
80321 +
80322 +/* In general I think these macros should not be exposed. */
80323 +#define znode_is_locked(node) (lock_is_locked(&node->lock))
80324 +#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
80325 +#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
80326 +#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
80327 +#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
80328 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
80329 +/* Macros for accessing the znode state. */
80330 +#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
80331 +#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
80332 +#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
80333 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
80334 + znode * parent, tree_level level, gfp_t gfp_flag);
80335 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
80336 +extern int zload(znode * node);
80337 +extern int zload_ra(znode * node, ra_info_t * info);
80338 +extern int zinit_new(znode * node, gfp_t gfp_flags);
80339 +extern void zrelse(znode * node);
80340 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
80341 +
80342 +/* size of data in znode */
80343 +static inline unsigned
80344 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
80345 +{
80346 + assert("nikita-1416", node != NULL);
80347 + return PAGE_CACHE_SIZE;
80348 +}
80349 +
80350 +extern void parent_coord_to_coord(const parent_coord_t * pcoord,
80351 + coord_t * coord);
80352 +extern void coord_to_parent_coord(const coord_t * coord,
80353 + parent_coord_t * pcoord);
80354 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
80355 +
80356 +extern unsigned znode_free_space(znode * node);
80357 +
80358 +extern reiser4_key *znode_get_rd_key(znode * node);
80359 +extern reiser4_key *znode_get_ld_key(znode * node);
80360 +
80361 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
80362 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
80363 +
80364 +/* `connected' state checks */
80365 +static inline int znode_is_right_connected(const znode * node)
80366 +{
80367 + return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
80368 +}
80369 +
80370 +static inline int znode_is_left_connected(const znode * node)
80371 +{
80372 + return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
80373 +}
80374 +
80375 +static inline int znode_is_connected(const znode * node)
80376 +{
80377 + return znode_is_right_connected(node) && znode_is_left_connected(node);
80378 +}
80379 +
80380 +extern int znode_shift_order;
80381 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
80382 +extern void znode_remove(znode *, reiser4_tree *);
80383 +extern znode *znode_parent(const znode * node);
80384 +extern znode *znode_parent_nolock(const znode * node);
80385 +extern int znode_above_root(const znode * node);
80386 +extern int init_znodes(void);
80387 +extern void done_znodes(void);
80388 +extern int znodes_tree_init(reiser4_tree * ztree);
80389 +extern void znodes_tree_done(reiser4_tree * ztree);
80390 +extern int znode_contains_key(znode * node, const reiser4_key * key);
80391 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
80392 +extern unsigned znode_save_free_space(znode * node);
80393 +extern unsigned znode_recover_free_space(znode * node);
80394 +extern znode *zalloc(gfp_t gfp_flag);
80395 +extern void zinit(znode *, const znode * parent, reiser4_tree *);
80396 +extern int zparse(znode * node);
80397 +
80398 +extern int znode_just_created(const znode * node);
80399 +
80400 +extern void zfree(znode * node);
80401 +
80402 +#if REISER4_DEBUG
80403 +extern void print_znode(const char *prefix, const znode * node);
80404 +#else
80405 +#define print_znode( p, n ) noop
80406 +#endif
80407 +
80408 +/* Make it look like various znode functions exist instead of treating znodes as
80409 + jnodes in znode-specific code. */
80410 +#define znode_page(x) jnode_page ( ZJNODE(x) )
80411 +#define zdata(x) jdata ( ZJNODE(x) )
80412 +#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
80413 +#define znode_created(x) jnode_created ( ZJNODE(x) )
80414 +#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
80415 +#define znode_convertible(x) jnode_convertible (ZJNODE(x))
80416 +#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
80417 +
80418 +#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
80419 +#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
80420 +#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
80421 +#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
80422 +
80423 +#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
80424 +#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
80425 +#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
80426 +#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
80427 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
80428 +
80429 +#if REISER4_DEBUG
80430 +extern int znode_x_count_is_protected(const znode * node);
80431 +extern int znode_invariant(znode * node);
80432 +#endif
80433 +
80434 +/* acquire reference to @node */
80435 +static inline znode *zref(znode * node)
80436 +{
80437 + /* change of x_count from 0 to 1 is protected by tree spin-lock */
80438 + return JZNODE(jref(ZJNODE(node)));
80439 +}
80440 +
80441 +/* release reference to @node */
80442 +static inline void zput(znode * node)
80443 +{
80444 + assert("nikita-3564", znode_invariant(node));
80445 + jput(ZJNODE(node));
80446 +}
80447 +
80448 +/* get the level field for a znode */
80449 +static inline tree_level znode_get_level(const znode * node)
80450 +{
80451 + return node->level;
80452 +}
80453 +
80454 +/* get the level field for a jnode */
80455 +static inline tree_level jnode_get_level(const jnode * node)
80456 +{
80457 + if (jnode_is_znode(node))
80458 + return znode_get_level(JZNODE(node));
80459 + else
80460 + /* unformatted nodes are all at the LEAF_LEVEL and for
80461 + "semi-formatted" nodes like bitmaps, level doesn't matter. */
80462 + return LEAF_LEVEL;
80463 +}
80464 +
80465 +/* true if jnode is on leaf level */
80466 +static inline int jnode_is_leaf(const jnode * node)
80467 +{
80468 + if (jnode_is_znode(node))
80469 + return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
80470 + if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
80471 + return 1;
80472 + return 0;
80473 +}
80474 +
80475 +/* return znode's tree */
80476 +static inline reiser4_tree *znode_get_tree(const znode * node)
80477 +{
80478 + assert("nikita-2692", node != NULL);
80479 + return jnode_get_tree(ZJNODE(node));
80480 +}
80481 +
80482 +/* resolve race with zput */
80483 +static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
80484 +{
80485 + jnode *j;
80486 +
80487 + j = jnode_rip_sync(tree, ZJNODE(node));
80488 + if (likely(j != NULL))
80489 + node = JZNODE(j);
80490 + else
80491 + node = NULL;
80492 + return node;
80493 +}
80494 +
80495 +#if defined(REISER4_DEBUG)
80496 +int znode_is_loaded(const znode * node /* znode to query */ );
80497 +#endif
80498 +
80499 +extern __u64 znode_build_version(reiser4_tree * tree);
80500 +
80501 +/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
80502 + must load the data for a node in many places. We could do this by simply calling
80503 + zload() everywhere, the difficulty arises when we must release the loaded data by
80504 + calling zrelse. In a function with many possible error/return paths, it requires extra
80505 + work to figure out which exit paths must call zrelse and those which do not. The data
80506 + handle automatically calls zrelse for every zload that it is responsible for. In that
80507 + sense, it acts much like a lock_handle.
80508 +*/
80509 +typedef struct load_count {
80510 + znode *node;
80511 + int d_ref;
80512 +} load_count;
80513 +
80514 +extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */
80515 +extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
80516 +extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
80517 +extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
80518 + * incr_load_count_znode, otherwise do nothing (unformatted nodes
80519 + * don't require zload/zrelse treatment). */
80520 +extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
80521 +extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
80522 +
80523 +/* Variable initializers for load_count. */
80524 +#define INIT_LOAD_COUNT ( load_count * ){ .node = NULL, .d_ref = 0 }
80525 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
80526 +/* A convenience macro for use in assertions or debug-only code, where loaded
80527 + data is only required to perform the debugging check. This macro
80528 + encapsulates an expression inside a pair of calls to zload()/zrelse(). */
80529 +#define WITH_DATA( node, exp ) \
80530 +({ \
80531 + long __with_dh_result; \
80532 + znode *__with_dh_node; \
80533 + \
80534 + __with_dh_node = ( node ); \
80535 + __with_dh_result = zload( __with_dh_node ); \
80536 + if( __with_dh_result == 0 ) { \
80537 + __with_dh_result = ( long )( exp ); \
80538 + zrelse( __with_dh_node ); \
80539 + } \
80540 + __with_dh_result; \
80541 +})
80542 +
80543 +/* Same as above, but accepts a return value in case zload fails. */
80544 +#define WITH_DATA_RET( node, ret, exp ) \
80545 +({ \
80546 + int __with_dh_result; \
80547 + znode *__with_dh_node; \
80548 + \
80549 + __with_dh_node = ( node ); \
80550 + __with_dh_result = zload( __with_dh_node ); \
80551 + if( __with_dh_result == 0 ) { \
80552 + __with_dh_result = ( int )( exp ); \
80553 + zrelse( __with_dh_node ); \
80554 + } else \
80555 + __with_dh_result = ( ret ); \
80556 + __with_dh_result; \
80557 +})
80558 +
80559 +#define WITH_COORD(coord, exp) \
80560 +({ \
80561 + coord_t *__coord; \
80562 + \
80563 + __coord = (coord); \
80564 + coord_clear_iplug(__coord); \
80565 + WITH_DATA(__coord->node, exp); \
80566 +})
80567 +
80568 +#if REISER4_DEBUG
80569 +#define STORE_COUNTERS \
80570 + reiser4_lock_counters_info __entry_counters = \
80571 + *reiser4_lock_counters()
80572 +#define CHECK_COUNTERS \
80573 +ON_DEBUG_CONTEXT( \
80574 +({ \
80575 + __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
80576 + __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
80577 + __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
80578 + assert("nikita-2159", \
80579 + !memcmp(&__entry_counters, reiser4_lock_counters(), \
80580 + sizeof __entry_counters)); \
80581 +}) )
80582 +
80583 +#else
80584 +#define STORE_COUNTERS
80585 +#define CHECK_COUNTERS noop
80586 +#endif
80587 +
80588 +/* __ZNODE_H__ */
80589 +#endif
80590 +
80591 +/* Make Linus happy.
80592 + Local variables:
80593 + c-indentation-style: "K&R"
80594 + mode-name: "LC"
80595 + c-basic-offset: 8
80596 + tab-width: 8
80597 + fill-column: 120
80598 + End:
80599 +*/
80600 diff -urN linux-2.6.20.orig/include/linux/fs.h linux-2.6.20/include/linux/fs.h
80601 --- linux-2.6.20.orig/include/linux/fs.h 2007-05-06 15:04:41.352625543 +0400
80602 +++ linux-2.6.20/include/linux/fs.h 2007-05-06 14:50:43.911041966 +0400
80603 @@ -1165,6 +1165,8 @@
80604 void (*clear_inode) (struct inode *);
80605 void (*umount_begin) (struct vfsmount *, int);
80606
80607 + void (*sync_inodes) (struct super_block *sb,
80608 + struct writeback_control *wbc);
80609 int (*show_options)(struct seq_file *, struct vfsmount *);
80610 int (*show_stats)(struct seq_file *, struct vfsmount *);
80611 #ifdef CONFIG_QUOTA
80612 @@ -1583,6 +1585,7 @@
80613 extern int invalidate_inode_pages2_range(struct address_space *mapping,
80614 pgoff_t start, pgoff_t end);
80615 extern int write_inode_now(struct inode *, int);
80616 +extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
80617 extern int filemap_fdatawrite(struct address_space *);
80618 extern int filemap_flush(struct address_space *);
80619 extern int filemap_fdatawait(struct address_space *);
80620 diff -urN linux-2.6.20.orig/lib/radix-tree.c linux-2.6.20/lib/radix-tree.c
80621 --- linux-2.6.20.orig/lib/radix-tree.c 2007-05-06 15:04:42.096858012 +0400
80622 +++ linux-2.6.20/lib/radix-tree.c 2007-05-06 14:50:43.915043216 +0400
80623 @@ -151,6 +151,7 @@
80624 out:
80625 return ret;
80626 }
80627 +EXPORT_SYMBOL(radix_tree_preload);
80628
80629 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
80630 int offset)
80631 diff -urN linux-2.6.20.orig/mm/filemap.c linux-2.6.20/mm/filemap.c
80632 --- linux-2.6.20.orig/mm/filemap.c 2007-05-06 15:04:42.108861762 +0400
80633 +++ linux-2.6.20/mm/filemap.c 2007-05-06 14:50:43.919044465 +0400
80634 @@ -121,6 +121,7 @@
80635 mapping->nrpages--;
80636 __dec_zone_page_state(page, NR_FILE_PAGES);
80637 }
80638 +EXPORT_SYMBOL(__remove_from_page_cache);
80639
80640 void remove_from_page_cache(struct page *page)
80641 {
80642 @@ -132,6 +133,7 @@
80643 __remove_from_page_cache(page);
80644 write_unlock_irq(&mapping->tree_lock);
80645 }
80646 +EXPORT_SYMBOL(remove_from_page_cache);
80647
80648 static int sync_page(void *word)
80649 {
80650 @@ -738,6 +740,7 @@
80651 read_unlock_irq(&mapping->tree_lock);
80652 return ret;
80653 }
80654 +EXPORT_SYMBOL(add_to_page_cache_lru);
80655
80656 /**
80657 * find_get_pages_contig - gang contiguous pagecache lookup
80658 @@ -798,6 +801,7 @@
80659 read_unlock_irq(&mapping->tree_lock);
80660 return ret;
80661 }
80662 +EXPORT_SYMBOL(find_get_pages);
80663
80664 /**
80665 * grab_cache_page_nowait - returns locked page at given index in given cache
80666 @@ -855,6 +859,7 @@
80667
80668 ra->ra_pages /= 4;
80669 }
80670 +EXPORT_SYMBOL(find_get_pages_tag);
80671
80672 /**
80673 * do_generic_mapping_read - generic file read routine
80674 diff -urN linux-2.6.20.orig/mm/readahead.c linux-2.6.20/mm/readahead.c
80675 --- linux-2.6.20.orig/mm/readahead.c 2007-05-06 15:04:42.144873010 +0400
80676 +++ linux-2.6.20/mm/readahead.c 2007-05-06 14:50:43.919044465 +0400
80677 @@ -568,6 +568,7 @@
80678 ra->flags &= ~RA_FLAG_INCACHE;
80679 ra->cache_hit = 0;
80680 }
80681 +EXPORT_SYMBOL_GPL(handle_ra_miss);
80682
80683 /*
80684 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
80685 Files linux-2.6.20.orig/scripts/kconfig/mconf and linux-2.6.20/scripts/kconfig/mconf differ